diff --git a/.github/workflows/sdks-tests.yml b/.github/workflows/sdks-tests.yml
index 61b33e2fc..fc9e5770e 100644
--- a/.github/workflows/sdks-tests.yml
+++ b/.github/workflows/sdks-tests.yml
@@ -52,7 +52,7 @@ jobs:
       - name: Setup .NET Core
         uses: actions/setup-dotnet@v4
         with:
-          dotnet-version: "6.0.x"
+          dotnet-version: "8.0.x"
       - name: Install dependencies
         run: dotnet restore
       - name: Build
diff --git a/crates/dump/README.md b/crates/dump/README.md
index 3537f188e..42d84ec80 100644
--- a/crates/dump/README.md
+++ b/crates/dump/README.md
@@ -10,8 +10,10 @@ dump
 ├── instance-uid.uuid
 ├── keys.jsonl
 ├── metadata.json
-└── tasks
-    ├── update_files
-    │   └── [task_id].jsonl
+├── tasks
+│   ├── update_files
+│   │   └── [task_id].jsonl
+│   └── queue.jsonl
+└── batches
     └── queue.jsonl
-```
\ No newline at end of file
+```
diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs
index 0fb5570b0..905a6485d 100644
--- a/crates/dump/src/lib.rs
+++ b/crates/dump/src/lib.rs
@@ -228,14 +228,16 @@ pub(crate) mod test {
     use big_s::S;
     use maplit::{btreemap, btreeset};
+    use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats};
     use meilisearch_types::facet_values_sort::FacetValuesSort;
-    use meilisearch_types::features::RuntimeTogglableFeatures;
+    use meilisearch_types::features::{Network, Remote, RuntimeTogglableFeatures};
     use meilisearch_types::index_uid_pattern::IndexUidPattern;
     use meilisearch_types::keys::{Action, Key};
     use meilisearch_types::milli;
     use meilisearch_types::milli::update::Setting;
     use meilisearch_types::settings::{Checked, FacetingSettings, Settings};
-    use meilisearch_types::tasks::{Details, Status};
+    use meilisearch_types::task_view::DetailsView;
+    use meilisearch_types::tasks::{Details, Kind, Status};
     use serde_json::{json, Map, Value};
     use time::macros::datetime;
     use uuid::Uuid;
 
@@ -305,6 +307,30 @@ pub(crate) mod test {
         settings.check()
     }
 
+    pub fn create_test_batches() -> Vec<Batch> {
+        vec![Batch {
+            uid: 0,
+            details: DetailsView {
+                received_documents: Some(12),
+                indexed_documents: Some(Some(10)),
+                ..DetailsView::default()
+            },
+            progress: None,
+            stats: BatchStats {
+                total_nb_tasks: 1,
+                status: maplit::btreemap! { Status::Succeeded => 1 },
+                types: maplit::btreemap! { Kind::DocumentAdditionOrUpdate => 1 },
+                index_uids: maplit::btreemap!
{ "doggo".to_string() => 1 }, + }, + enqueued_at: Some(BatchEnqueuedAt { + earliest: datetime!(2022-11-11 0:00 UTC), + oldest: datetime!(2022-11-11 0:00 UTC), + }), + started_at: datetime!(2022-11-20 0:00 UTC), + finished_at: Some(datetime!(2022-11-21 0:00 UTC)), + }] + } + pub fn create_test_tasks() -> Vec<(TaskDump, Option>)> { vec![ ( @@ -427,6 +453,15 @@ pub(crate) mod test { index.flush().unwrap(); index.settings(&settings).unwrap(); + // ========== pushing the batch queue + let batches = create_test_batches(); + + let mut batch_queue = dump.create_batches_queue().unwrap(); + for batch in &batches { + batch_queue.push_batch(batch).unwrap(); + } + batch_queue.flush().unwrap(); + // ========== pushing the task queue let tasks = create_test_tasks(); @@ -455,6 +490,10 @@ pub(crate) mod test { dump.create_experimental_features(features).unwrap(); + // ========== network + let network = create_test_network(); + dump.create_network(network).unwrap(); + // create the dump let mut file = tempfile::tempfile().unwrap(); dump.persist_to(&mut file).unwrap(); @@ -467,6 +506,13 @@ pub(crate) mod test { RuntimeTogglableFeatures::default() } + fn create_test_network() -> Network { + Network { + local: Some("myself".to_string()), + remotes: maplit::btreemap! {"other".to_string() => Remote { url: "http://test".to_string(), search_api_key: Some("apiKey".to_string()) }}, + } + } + #[test] fn test_creating_and_read_dump() { let mut file = create_test_dump(); @@ -515,5 +561,9 @@ pub(crate) mod test { // ==== checking the features let expected = create_test_features(); assert_eq!(dump.features().unwrap().unwrap(), expected); + + // ==== checking the network + let expected = create_test_network(); + assert_eq!(&expected, dump.network().unwrap().unwrap()); } } diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index 6b2655bdf..2dd4ed761 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -196,6 +196,10 @@ impl CompatV5ToV6 { pub fn features(&self) -> Result> { Ok(None) } + + pub fn network(&self) -> Result> { + Ok(None) + } } pub enum CompatIndexV5ToV6 { diff --git a/crates/dump/src/reader/mod.rs b/crates/dump/src/reader/mod.rs index 151267378..2b4440ab7 100644 --- a/crates/dump/src/reader/mod.rs +++ b/crates/dump/src/reader/mod.rs @@ -23,6 +23,7 @@ mod v6; pub type Document = serde_json::Map; pub type UpdateFile = dyn Iterator>; +#[allow(clippy::large_enum_variant)] pub enum DumpReader { Current(V6Reader), Compat(CompatV5ToV6), @@ -101,6 +102,13 @@ impl DumpReader { } } + pub fn batches(&mut self) -> Result> + '_>> { + match self { + DumpReader::Current(current) => Ok(current.batches()), + DumpReader::Compat(_compat) => Ok(Box::new(std::iter::empty())), + } + } + pub fn keys(&mut self) -> Result> + '_>> { match self { DumpReader::Current(current) => Ok(current.keys()), @@ -114,6 +122,13 @@ impl DumpReader { DumpReader::Compat(compat) => compat.features(), } } + + pub fn network(&self) -> Result> { + match self { + DumpReader::Current(current) => Ok(current.network()), + DumpReader::Compat(compat) => compat.network(), + } + } } impl From for DumpReader { @@ -219,6 +234,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2024-05-16 15:51:34.151044 +00:00:00"); insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None"); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), 
@"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -328,6 +347,7 @@ pub(crate) mod test { } assert_eq!(dump.features().unwrap().unwrap(), RuntimeTogglableFeatures::default()); + assert_eq!(dump.network().unwrap(), None); } #[test] @@ -339,6 +359,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2023-07-06 7:10:27.21958 +00:00:00"); insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None"); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -373,6 +397,27 @@ pub(crate) mod test { assert_eq!(dump.features().unwrap().unwrap(), RuntimeTogglableFeatures::default()); } + #[test] + fn import_dump_v6_network() { + let dump = File::open("tests/assets/v6-with-network.dump").unwrap(); + let dump = DumpReader::open(dump).unwrap(); + + // top level infos + insta::assert_snapshot!(dump.date().unwrap(), @"2025-01-29 15:45:32.738676 +00:00:00"); + insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None"); + + // network + + let network = dump.network().unwrap().unwrap(); + insta::assert_snapshot!(network.local.as_ref().unwrap(), @"ms-0"); + insta::assert_snapshot!(network.remotes.get("ms-0").as_ref().unwrap().url, @"http://localhost:7700"); + insta::assert_snapshot!(network.remotes.get("ms-0").as_ref().unwrap().search_api_key.is_none(), @"true"); + insta::assert_snapshot!(network.remotes.get("ms-1").as_ref().unwrap().url, @"http://localhost:7701"); + insta::assert_snapshot!(network.remotes.get("ms-1").as_ref().unwrap().search_api_key.is_none(), @"true"); + insta::assert_snapshot!(network.remotes.get("ms-2").as_ref().unwrap().url, @"http://ms-5679.example.meilisearch.io"); + insta::assert_snapshot!(network.remotes.get("ms-2").as_ref().unwrap().search_api_key.as_ref().unwrap(), @"foo"); + } + #[test] fn import_dump_v5() { let dump = File::open("tests/assets/v5.dump").unwrap(); @@ -382,6 +427,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-04 15:55:10.344982459 +00:00:00"); insta::assert_snapshot!(dump.instance_uid().unwrap().unwrap(), @"9e15e977-f2ae-4761-943f-1eaf75fd736d"); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -462,6 +511,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-06 12:53:49.131989609 +00:00:00"); insta::assert_snapshot!(dump.instance_uid().unwrap().unwrap(), @"9e15e977-f2ae-4761-943f-1eaf75fd736d"); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -539,6 +592,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-07 11:39:03.709153554 +00:00:00"); assert_eq!(dump.instance_uid().unwrap(), None); + // batches didn't exists at the time + let batches = 
dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -632,6 +689,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-09 20:27:59.904096267 +00:00:00"); assert_eq!(dump.instance_uid().unwrap(), None); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -725,6 +786,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2023-01-30 16:26:09.247261 +00:00:00"); assert_eq!(dump.instance_uid().unwrap(), None); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -801,6 +866,10 @@ pub(crate) mod test { assert_eq!(dump.date(), None); assert_eq!(dump.instance_uid().unwrap(), None); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 50b9751a2..9e0d07c78 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -18,8 +18,10 @@ pub type Checked = meilisearch_types::settings::Checked; pub type Unchecked = meilisearch_types::settings::Unchecked; pub type Task = crate::TaskDump; +pub type Batch = meilisearch_types::batches::Batch; pub type Key = meilisearch_types::keys::Key; pub type RuntimeTogglableFeatures = meilisearch_types::features::RuntimeTogglableFeatures; +pub type Network = meilisearch_types::features::Network; // ===== Other types to clarify the code of the compat module // everything related to the tasks @@ -48,8 +50,10 @@ pub struct V6Reader { instance_uid: Option, metadata: Metadata, tasks: BufReader, + batches: Option>, keys: BufReader, features: Option, + network: Option, } impl V6Reader { @@ -77,13 +81,38 @@ impl V6Reader { } else { None }; + let batches = match File::open(dump.path().join("batches").join("queue.jsonl")) { + Ok(file) => Some(BufReader::new(file)), + // The batch file was only introduced during the v1.13, anything prior to that won't have batches + Err(err) if err.kind() == ErrorKind::NotFound => None, + Err(e) => return Err(e.into()), + }; + + let network_file = match fs::read(dump.path().join("network.json")) { + Ok(network_file) => Some(network_file), + Err(error) => match error.kind() { + // Allows the file to be missing, this will only result in all experimental features disabled. + ErrorKind::NotFound => { + debug!("`network.json` not found in dump"); + None + } + _ => return Err(error.into()), + }, + }; + let network = if let Some(network_file) = network_file { + Some(serde_json::from_reader(&*network_file)?) 
+ } else { + None + }; Ok(V6Reader { metadata: serde_json::from_reader(&*meta_file)?, instance_uid, tasks: BufReader::new(File::open(dump.path().join("tasks").join("queue.jsonl"))?), + batches, keys: BufReader::new(File::open(dump.path().join("keys.jsonl"))?), features, + network, dump, }) } @@ -124,7 +153,7 @@ impl V6Reader { &mut self, ) -> Box>)>> + '_> { Box::new((&mut self.tasks).lines().map(|line| -> Result<_> { - let task: Task = serde_json::from_str(&line?).unwrap(); + let task: Task = serde_json::from_str(&line?)?; let update_file_path = self .dump @@ -136,8 +165,7 @@ impl V6Reader { if update_file_path.exists() { Ok(( task, - Some(Box::new(UpdateFile::new(&update_file_path).unwrap()) - as Box), + Some(Box::new(UpdateFile::new(&update_file_path)?) as Box), )) } else { Ok((task, None)) @@ -145,6 +173,16 @@ impl V6Reader { })) } + pub fn batches(&mut self) -> Box> + '_> { + match self.batches.as_mut() { + Some(batches) => Box::new((batches).lines().map(|line| -> Result<_> { + let batch = serde_json::from_str(&line?)?; + Ok(batch) + })), + None => Box::new(std::iter::empty()) as Box> + '_>, + } + } + pub fn keys(&mut self) -> Box> + '_> { Box::new( (&mut self.keys).lines().map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }), @@ -154,6 +192,10 @@ impl V6Reader { pub fn features(&self) -> Option { self.features } + + pub fn network(&self) -> Option<&Network> { + self.network.as_ref() + } } pub struct UpdateFile { diff --git a/crates/dump/src/writer.rs b/crates/dump/src/writer.rs index 3ee51cabf..63b006b5c 100644 --- a/crates/dump/src/writer.rs +++ b/crates/dump/src/writer.rs @@ -4,7 +4,8 @@ use std::path::PathBuf; use flate2::write::GzEncoder; use flate2::Compression; -use meilisearch_types::features::RuntimeTogglableFeatures; +use meilisearch_types::batches::Batch; +use meilisearch_types::features::{Network, RuntimeTogglableFeatures}; use meilisearch_types::keys::Key; use meilisearch_types::settings::{Checked, Settings}; use serde_json::{Map, Value}; @@ -54,6 +55,10 @@ impl DumpWriter { TaskWriter::new(self.dir.path().join("tasks")) } + pub fn create_batches_queue(&self) -> Result { + BatchWriter::new(self.dir.path().join("batches")) + } + pub fn create_experimental_features(&self, features: RuntimeTogglableFeatures) -> Result<()> { Ok(std::fs::write( self.dir.path().join("experimental-features.json"), @@ -61,6 +66,10 @@ impl DumpWriter { )?) } + pub fn create_network(&self, network: Network) -> Result<()> { + Ok(std::fs::write(self.dir.path().join("network.json"), serde_json::to_string(&network)?)?) + } + pub fn persist_to(self, mut writer: impl Write) -> Result<()> { let gz_encoder = GzEncoder::new(&mut writer, Compression::default()); let mut tar_encoder = tar::Builder::new(gz_encoder); @@ -84,7 +93,7 @@ impl KeyWriter { } pub fn push_key(&mut self, key: &Key) -> Result<()> { - self.keys.write_all(&serde_json::to_vec(key)?)?; + serde_json::to_writer(&mut self.keys, &key)?; self.keys.write_all(b"\n")?; Ok(()) } @@ -114,7 +123,7 @@ impl TaskWriter { /// Pushes tasks in the dump. /// If the tasks has an associated `update_file` it'll use the `task_id` as its name. 
     pub fn push_task(&mut self, task: &TaskDump) -> Result<UpdateFile> {
-        self.queue.write_all(&serde_json::to_vec(task)?)?;
+        serde_json::to_writer(&mut self.queue, &task)?;
         self.queue.write_all(b"\n")?;
 
         Ok(UpdateFile::new(self.update_files.join(format!("{}.jsonl", task.uid))))
@@ -126,6 +135,30 @@ impl TaskWriter {
     }
 }
+pub struct BatchWriter {
+    queue: BufWriter<File>,
+}
+
+impl BatchWriter {
+    pub(crate) fn new(path: PathBuf) -> Result<BatchWriter> {
+        std::fs::create_dir(&path)?;
+        let queue = File::create(path.join("queue.jsonl"))?;
+        Ok(BatchWriter { queue: BufWriter::new(queue) })
+    }
+
+    /// Pushes batches in the dump.
+    pub fn push_batch(&mut self, batch: &Batch) -> Result<()> {
+        serde_json::to_writer(&mut self.queue, &batch)?;
+        self.queue.write_all(b"\n")?;
+        Ok(())
+    }
+
+    pub fn flush(mut self) -> Result<()> {
+        self.queue.flush()?;
+        Ok(())
+    }
+}
+
 pub struct UpdateFile {
     path: PathBuf,
     writer: Option<BufWriter<File>>,
 }
@@ -137,8 +170,8 @@ impl UpdateFile {
     }
 
     pub fn push_document(&mut self, document: &Document) -> Result<()> {
-        if let Some(writer) = self.writer.as_mut() {
-            writer.write_all(&serde_json::to_vec(document)?)?;
+        if let Some(mut writer) = self.writer.as_mut() {
+            serde_json::to_writer(&mut writer, &document)?;
             writer.write_all(b"\n")?;
         } else {
             let file = File::create(&self.path).unwrap();
@@ -205,8 +238,8 @@ pub(crate) mod test {
     use super::*;
     use crate::reader::Document;
     use crate::test::{
-        create_test_api_keys, create_test_documents, create_test_dump, create_test_instance_uid,
-        create_test_settings, create_test_tasks,
+        create_test_api_keys, create_test_batches, create_test_documents, create_test_dump,
+        create_test_instance_uid, create_test_settings, create_test_tasks,
     };
 
     fn create_directory_hierarchy(dir: &Path) -> String {
@@ -281,8 +314,10 @@ pub(crate) mod test {
         let dump_path = dump.path();
 
         // ==== checking global file hierarchy (we want to be sure there isn't too many files or too few)
-        insta::assert_snapshot!(create_directory_hierarchy(dump_path), @r###"
+        insta::assert_snapshot!(create_directory_hierarchy(dump_path), @r"
         .
+ ├---- batches/ + │ └---- queue.jsonl ├---- indexes/ │ └---- doggos/ │ │ ├---- documents.jsonl @@ -295,8 +330,9 @@ pub(crate) mod test { ├---- experimental-features.json ├---- instance_uid.uuid ├---- keys.jsonl - └---- metadata.json - "###); + ├---- metadata.json + └---- network.json + "); // ==== checking the top level infos let metadata = fs::read_to_string(dump_path.join("metadata.json")).unwrap(); @@ -349,6 +385,16 @@ pub(crate) mod test { } } + // ==== checking the batch queue + let batches_queue = fs::read_to_string(dump_path.join("batches/queue.jsonl")).unwrap(); + for (batch, expected) in batches_queue.lines().zip(create_test_batches()) { + let mut batch = serde_json::from_str::(batch).unwrap(); + if batch.details.settings == Some(Box::new(Settings::::default())) { + batch.details.settings = None; + } + assert_eq!(batch, expected, "{batch:#?}{expected:#?}"); + } + // ==== checking the keys let keys = fs::read_to_string(dump_path.join("keys.jsonl")).unwrap(); for (key, expected) in keys.lines().zip(create_test_api_keys()) { diff --git a/crates/dump/tests/assets/v6-with-network.dump b/crates/dump/tests/assets/v6-with-network.dump new file mode 100644 index 000000000..4d0d9ddc9 Binary files /dev/null and b/crates/dump/tests/assets/v6-with-network.dump differ diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index 7e0341fcb..ca26e50c8 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use std::io; use dump::{KindDump, TaskDump, UpdateFile}; +use meilisearch_types::batches::{Batch, BatchId}; use meilisearch_types::heed::RwTxn; use meilisearch_types::milli; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; @@ -14,9 +15,15 @@ pub struct Dump<'a> { index_scheduler: &'a IndexScheduler, wtxn: RwTxn<'a>, + batch_to_task_mapping: HashMap, + indexes: HashMap, statuses: HashMap, kinds: HashMap, + + batch_indexes: HashMap, + batch_statuses: HashMap, + batch_kinds: HashMap, } impl<'a> Dump<'a> { @@ -27,12 +34,72 @@ impl<'a> Dump<'a> { Ok(Dump { index_scheduler, wtxn, + batch_to_task_mapping: HashMap::new(), indexes: HashMap::new(), statuses: HashMap::new(), kinds: HashMap::new(), + batch_indexes: HashMap::new(), + batch_statuses: HashMap::new(), + batch_kinds: HashMap::new(), }) } + /// Register a new batch coming from a dump in the scheduler. + /// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running. 
+ pub fn register_dumped_batch(&mut self, batch: Batch) -> Result<()> { + self.index_scheduler.queue.batches.all_batches.put(&mut self.wtxn, &batch.uid, &batch)?; + if let Some(enqueued_at) = batch.enqueued_at { + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.batches.enqueued_at, + enqueued_at.earliest, + batch.uid, + )?; + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.batches.enqueued_at, + enqueued_at.oldest, + batch.uid, + )?; + } + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.batches.started_at, + batch.started_at, + batch.uid, + )?; + if let Some(finished_at) = batch.finished_at { + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.batches.finished_at, + finished_at, + batch.uid, + )?; + } + + for index in batch.stats.index_uids.keys() { + match self.batch_indexes.get_mut(index) { + Some(bitmap) => { + bitmap.insert(batch.uid); + } + None => { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(batch.uid); + self.batch_indexes.insert(index.to_string(), bitmap); + } + }; + } + + for status in batch.stats.status.keys() { + self.batch_statuses.entry(*status).or_default().insert(batch.uid); + } + for kind in batch.stats.types.keys() { + self.batch_kinds.entry(*kind).or_default().insert(batch.uid); + } + + Ok(()) + } + /// Register a new task coming from a dump in the scheduler. /// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running. pub fn register_dumped_task( @@ -149,6 +216,9 @@ impl<'a> Dump<'a> { }; self.index_scheduler.queue.tasks.all_tasks.put(&mut self.wtxn, &task.uid, &task)?; + if let Some(batch_id) = task.batch_uid { + self.batch_to_task_mapping.entry(batch_id).or_default().insert(task.uid); + } for index in task.indexes() { match self.indexes.get_mut(index) { @@ -198,6 +268,14 @@ impl<'a> Dump<'a> { /// Commit all the changes and exit the importing dump state pub fn finish(mut self) -> Result<()> { + for (batch_id, task_ids) in self.batch_to_task_mapping { + self.index_scheduler.queue.batch_to_tasks_mapping.put( + &mut self.wtxn, + &batch_id, + &task_ids, + )?; + } + for (index, bitmap) in self.indexes { self.index_scheduler.queue.tasks.index_tasks.put(&mut self.wtxn, &index, &bitmap)?; } @@ -208,6 +286,16 @@ impl<'a> Dump<'a> { self.index_scheduler.queue.tasks.put_kind(&mut self.wtxn, kind, &bitmap)?; } + for (index, bitmap) in self.batch_indexes { + self.index_scheduler.queue.batches.index_tasks.put(&mut self.wtxn, &index, &bitmap)?; + } + for (status, bitmap) in self.batch_statuses { + self.index_scheduler.queue.batches.put_status(&mut self.wtxn, status, &bitmap)?; + } + for (kind, bitmap) in self.batch_kinds { + self.index_scheduler.queue.batches.put_kind(&mut self.wtxn, kind, &bitmap)?; + } + self.wtxn.commit()?; self.index_scheduler.scheduler.wake_up.signal(); diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index d3feecd73..280127d04 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -109,6 +109,8 @@ pub enum Error { InvalidIndexUid { index_uid: String }, #[error("Task `{0}` not found.")] TaskNotFound(TaskId), + #[error("Task `{0}` does not contain any documents. Only `documentAdditionOrUpdate` tasks with the statuses `enqueued` or `processing` contain documents")] + TaskFileNotFound(TaskId), #[error("Batch `{0}` not found.")] BatchNotFound(BatchId), #[error("Query parameters to filter the tasks to delete are missing. 
Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")] @@ -127,8 +129,8 @@ pub enum Error { _ => format!("{error}") })] Milli { error: milli::Error, index_uid: Option }, - #[error("An unexpected crash occurred when processing the task.")] - ProcessBatchPanicked, + #[error("An unexpected crash occurred when processing the task: {0}")] + ProcessBatchPanicked(String), #[error(transparent)] FileStore(#[from] file_store::Error), #[error(transparent)] @@ -189,6 +191,7 @@ impl Error { | Error::InvalidTaskCanceledBy { .. } | Error::InvalidIndexUid { .. } | Error::TaskNotFound(_) + | Error::TaskFileNotFound(_) | Error::BatchNotFound(_) | Error::TaskDeletionWithEmptyQuery | Error::TaskCancelationWithEmptyQuery @@ -196,7 +199,7 @@ impl Error { | Error::Dump(_) | Error::Heed(_) | Error::Milli { .. } - | Error::ProcessBatchPanicked + | Error::ProcessBatchPanicked(_) | Error::FileStore(_) | Error::IoError(_) | Error::Persist(_) @@ -250,6 +253,7 @@ impl ErrorCode for Error { Error::InvalidTaskCanceledBy { .. } => Code::InvalidTaskCanceledBy, Error::InvalidIndexUid { .. } => Code::InvalidIndexUid, Error::TaskNotFound(_) => Code::TaskNotFound, + Error::TaskFileNotFound(_) => Code::TaskFileNotFound, Error::BatchNotFound(_) => Code::BatchNotFound, Error::TaskDeletionWithEmptyQuery => Code::MissingTaskFilters, Error::TaskCancelationWithEmptyQuery => Code::MissingTaskFilters, @@ -257,7 +261,7 @@ impl ErrorCode for Error { Error::NoSpaceLeftInTaskQueue => Code::NoSpaceLeftOnDevice, Error::Dump(e) => e.error_code(), Error::Milli { error, .. } => error.error_code(), - Error::ProcessBatchPanicked => Code::Internal, + Error::ProcessBatchPanicked(_) => Code::Internal, Error::Heed(e) => e.error_code(), Error::HeedTransaction(e) => e.error_code(), Error::FileStore(e) => e.error_code(), diff --git a/crates/index-scheduler/src/features.rs b/crates/index-scheduler/src/features.rs index c6c17b2d5..394e6518f 100644 --- a/crates/index-scheduler/src/features.rs +++ b/crates/index-scheduler/src/features.rs @@ -1,6 +1,6 @@ use std::sync::{Arc, RwLock}; -use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; +use meilisearch_types::features::{InstanceTogglableFeatures, Network, RuntimeTogglableFeatures}; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RwTxn}; @@ -14,10 +14,16 @@ mod db_name { pub const EXPERIMENTAL_FEATURES: &str = "experimental-features"; } +mod db_keys { + pub const EXPERIMENTAL_FEATURES: &str = "experimental-features"; + pub const NETWORK: &str = "network"; +} + #[derive(Clone)] pub(crate) struct FeatureData { persisted: Database>, runtime: Arc>, + network: Arc>, } #[derive(Debug, Clone, Copy)] @@ -86,6 +92,32 @@ impl RoFeatures { .into()) } } + + pub fn check_network(&self, disabled_action: &'static str) -> Result<()> { + if self.runtime.network { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action, + feature: "network", + issue_link: "https://github.com/orgs/meilisearch/discussions/805", + } + .into()) + } + } + + pub fn check_get_task_documents_route(&self) -> Result<()> { + if self.runtime.get_task_documents_route { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action: "Getting the documents of an enqueued task", + feature: "get task documents route", + issue_link: "https://github.com/orgs/meilisearch/discussions/808", + } + .into()) + } + } 
} impl FeatureData { @@ -102,7 +134,7 @@ impl FeatureData { env.create_database(wtxn, Some(db_name::EXPERIMENTAL_FEATURES))?; let persisted_features: RuntimeTogglableFeatures = - runtime_features_db.get(wtxn, db_name::EXPERIMENTAL_FEATURES)?.unwrap_or_default(); + runtime_features_db.get(wtxn, db_keys::EXPERIMENTAL_FEATURES)?.unwrap_or_default(); let InstanceTogglableFeatures { metrics, logs_route, contains_filter } = instance_features; let runtime = Arc::new(RwLock::new(RuntimeTogglableFeatures { metrics: metrics || persisted_features.metrics, @@ -111,7 +143,14 @@ impl FeatureData { ..persisted_features })); - Ok(Self { persisted: runtime_features_db, runtime }) + let network_db = runtime_features_db.remap_data_type::>(); + let network: Network = network_db.get(wtxn, db_keys::NETWORK)?.unwrap_or_default(); + + Ok(Self { + persisted: runtime_features_db, + runtime, + network: Arc::new(RwLock::new(network)), + }) } pub fn put_runtime_features( @@ -119,7 +158,7 @@ impl FeatureData { mut wtxn: RwTxn, features: RuntimeTogglableFeatures, ) -> Result<()> { - self.persisted.put(&mut wtxn, db_name::EXPERIMENTAL_FEATURES, &features)?; + self.persisted.put(&mut wtxn, db_keys::EXPERIMENTAL_FEATURES, &features)?; wtxn.commit()?; // safe to unwrap, the lock will only fail if: @@ -140,4 +179,21 @@ impl FeatureData { pub fn features(&self) -> RoFeatures { RoFeatures::new(self) } + + pub fn put_network(&self, mut wtxn: RwTxn, new_network: Network) -> Result<()> { + self.persisted.remap_data_type::>().put( + &mut wtxn, + db_keys::NETWORK, + &new_network, + )?; + wtxn.commit()?; + + let mut network = self.network.write().unwrap(); + *network = new_network; + Ok(()) + } + + pub fn network(&self) -> Network { + Network::clone(&*self.network.read().unwrap()) + } } diff --git a/crates/index-scheduler/src/index_mapper/index_map.rs b/crates/index-scheduler/src/index_mapper/index_map.rs index 3031043a9..e4eb9bfb8 100644 --- a/crates/index-scheduler/src/index_mapper/index_map.rs +++ b/crates/index-scheduler/src/index_mapper/index_map.rs @@ -1,5 +1,7 @@ use std::collections::BTreeMap; +use std::env::VarError; use std::path::Path; +use std::str::FromStr; use std::time::Duration; use meilisearch_types::heed::{EnvClosingEvent, EnvFlags, EnvOpenOptions}; @@ -304,7 +306,18 @@ fn create_or_open_index( ) -> Result { let mut options = EnvOpenOptions::new(); options.map_size(clamp_to_page_size(map_size)); - options.max_readers(1024); + + // You can find more details about this experimental + // environment variable on the following GitHub discussion: + // + let max_readers = match std::env::var("MEILI_EXPERIMENTAL_INDEX_MAX_READERS") { + Ok(value) => u32::from_str(&value).unwrap(), + Err(VarError::NotPresent) => 1024, + Err(VarError::NotUnicode(value)) => panic!( + "Invalid unicode for the `MEILI_EXPERIMENTAL_INDEX_MAX_READERS` env var: {value:?}" + ), + }; + options.max_readers(max_readers); if enable_mdb_writemap { unsafe { options.flags(EnvFlags::WRITE_MAP) }; } diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index dad73d4c6..17d683bbb 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -106,6 +106,12 @@ pub struct IndexStats { /// are not returned to the disk after a deletion, this number is typically larger than /// `used_database_size` that only includes the size of the used pages. pub database_size: u64, + /// Number of embeddings in the index. 
+ /// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch + pub number_of_embeddings: Option, + /// Number of embedded documents in the index. + /// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch + pub number_of_embedded_documents: Option, /// Size taken by the used pages of the index' DB, in bytes. /// /// As the DB backend does not return to the disk the pages that are not currently used by the DB, @@ -130,8 +136,11 @@ impl IndexStats { /// /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { + let arroy_stats = index.arroy_stats(rtxn)?; Ok(IndexStats { number_of_documents: index.number_of_documents(rtxn)?, + number_of_embeddings: Some(arroy_stats.number_of_embeddings), + number_of_embedded_documents: Some(arroy_stats.documents.len()), database_size: index.on_disk_size()?, used_database_size: index.used_size()?, primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 4bc2beb05..bb8827fdc 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -1,7 +1,7 @@ use std::collections::BTreeSet; use std::fmt::Write; -use meilisearch_types::batches::Batch; +use meilisearch_types::batches::{Batch, BatchEnqueuedAt}; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str}; use meilisearch_types::heed::{Database, RoTxn}; use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32}; @@ -341,10 +341,14 @@ pub fn snapshot_canceled_by(rtxn: &RoTxn, db: Database String { let mut snap = String::new(); - let Batch { uid, details, stats, started_at, finished_at, progress: _ } = batch; + let Batch { uid, details, stats, started_at, finished_at, progress: _, enqueued_at } = batch; if let Some(finished_at) = finished_at { assert!(finished_at > started_at); } + let BatchEnqueuedAt { earliest, oldest } = enqueued_at.unwrap(); + assert!(*started_at > earliest); + assert!(earliest >= oldest); + snap.push('{'); snap.push_str(&format!("uid: {uid}, ")); snap.push_str(&format!("details: {}, ", serde_json::to_string(details).unwrap())); diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 530b7bedc..3b61b5dc4 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -33,7 +33,7 @@ mod test_utils; pub mod upgrade; mod utils; pub mod uuid_codec; -mod versioning; +pub mod versioning; pub type Result = std::result::Result; pub type TaskId = u32; @@ -51,7 +51,7 @@ pub use features::RoFeatures; use flate2::bufread::GzEncoder; use flate2::Compression; use meilisearch_types::batches::Batch; -use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; +use meilisearch_types::features::{InstanceTogglableFeatures, Network, RuntimeTogglableFeatures}; use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::I128; use meilisearch_types::heed::{self, Env, RoTxn}; @@ -770,7 +770,16 @@ impl IndexScheduler { Ok(()) } - // TODO: consider using a type alias or a struct embedder/template + pub fn put_network(&self, network: Network) -> Result<()> { + let wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?; + self.features.put_network(wtxn, network)?; + Ok(()) + } + + pub fn network(&self) -> Network { + self.features.network() + } + pub fn embedders( &self, index_uid: 
String, diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 58f01c770..fed26aeb7 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -96,6 +96,7 @@ make_enum_progress! { StartTheDumpCreation, DumpTheApiKeys, DumpTheTasks, + DumpTheBatches, DumpTheIndexes, DumpTheExperimentalFeatures, CompressTheDump, diff --git a/crates/index-scheduler/src/queue/batches.rs b/crates/index-scheduler/src/queue/batches.rs index 5c8a573ab..970e41110 100644 --- a/crates/index-scheduler/src/queue/batches.rs +++ b/crates/index-scheduler/src/queue/batches.rs @@ -12,8 +12,8 @@ use time::OffsetDateTime; use super::{Query, Queue}; use crate::processing::ProcessingTasks; use crate::utils::{ - insert_task_datetime, keep_ids_within_datetimes, map_bound, remove_task_datetime, - ProcessingBatch, + insert_task_datetime, keep_ids_within_datetimes, map_bound, + remove_n_tasks_datetime_earlier_than, remove_task_datetime, ProcessingBatch, }; use crate::{Error, Result, BEI128}; @@ -181,6 +181,7 @@ impl BatchQueue { stats: batch.stats, started_at: batch.started_at, finished_at: batch.finished_at, + enqueued_at: batch.enqueued_at, }, )?; @@ -234,34 +235,25 @@ impl BatchQueue { // What we know, though, is that the task date is from before the enqueued_at, and max two timestamps have been written // to the DB per batches. if let Some(ref old_batch) = old_batch { - let started_at = old_batch.started_at.unix_timestamp_nanos(); - - // We have either one or two enqueued at to remove - let mut exit = old_batch.stats.total_nb_tasks.clamp(0, 2); - let mut iterator = self.enqueued_at.rev_iter_mut(wtxn)?; - while let Some(entry) = iterator.next() { - let (key, mut value) = entry?; - if key > started_at { - continue; - } - if value.remove(old_batch.uid) { - exit = exit.saturating_sub(1); - // Safe because the key and value are owned - unsafe { - iterator.put_current(&key, &value)?; - } - if exit == 0 { - break; - } - } + if let Some(enqueued_at) = old_batch.enqueued_at { + remove_task_datetime(wtxn, self.enqueued_at, enqueued_at.earliest, old_batch.uid)?; + remove_task_datetime(wtxn, self.enqueued_at, enqueued_at.oldest, old_batch.uid)?; + } else { + // If we don't have the enqueued at in the batch it means the database comes from the v1.12 + // and we still need to find the date by scrolling the database + remove_n_tasks_datetime_earlier_than( + wtxn, + self.enqueued_at, + old_batch.started_at, + old_batch.stats.total_nb_tasks.clamp(1, 2) as usize, + old_batch.uid, + )?; } } - if let Some(enqueued_at) = batch.oldest_enqueued_at { - insert_task_datetime(wtxn, self.enqueued_at, enqueued_at, batch.uid)?; - } - if let Some(enqueued_at) = batch.earliest_enqueued_at { - insert_task_datetime(wtxn, self.enqueued_at, enqueued_at, batch.uid)?; - } + // A finished batch MUST contains at least one task and have an enqueued_at + let enqueued_at = batch.enqueued_at.as_ref().unwrap(); + insert_task_datetime(wtxn, self.enqueued_at, enqueued_at.earliest, batch.uid)?; + insert_task_datetime(wtxn, self.enqueued_at, enqueued_at.oldest, batch.uid)?; // Update the started at and finished at if let Some(ref old_batch) = old_batch { diff --git a/crates/index-scheduler/src/queue/batches_test.rs b/crates/index-scheduler/src/queue/batches_test.rs index aa84cdaf0..38e7ad800 100644 --- a/crates/index-scheduler/src/queue/batches_test.rs +++ b/crates/index-scheduler/src/queue/batches_test.rs @@ -102,30 +102,33 @@ fn query_batches_simple() { .unwrap(); 
assert_eq!(batches.len(), 1); batches[0].started_at = OffsetDateTime::UNIX_EPOCH; + assert!(batches[0].enqueued_at.is_some()); + batches[0].enqueued_at = None; // Insta cannot snapshot our batches because the batch stats contains an enum as key: https://github.com/mitsuhiko/insta/issues/689 let batch = serde_json::to_string_pretty(&batches[0]).unwrap(); snapshot!(batch, @r#" - { - "uid": 0, - "details": { - "primaryKey": "mouse" - }, - "stats": { - "totalNbTasks": 1, - "status": { - "processing": 1 - }, - "types": { - "indexCreation": 1 - }, - "indexUids": { - "catto": 1 - } - }, - "startedAt": "1970-01-01T00:00:00Z", - "finishedAt": null + { + "uid": 0, + "details": { + "primaryKey": "mouse" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "processing": 1 + }, + "types": { + "indexCreation": 1 + }, + "indexUids": { + "catto": 1 } - "#); + }, + "startedAt": "1970-01-01T00:00:00Z", + "finishedAt": null, + "enqueuedAt": null + } + "#); let query = Query { statuses: Some(vec![Status::Enqueued]), ..Default::default() }; let (batches, _) = index_scheduler diff --git a/crates/index-scheduler/src/queue/mod.rs b/crates/index-scheduler/src/queue/mod.rs index c6a79fbb2..8850eb8fa 100644 --- a/crates/index-scheduler/src/queue/mod.rs +++ b/crates/index-scheduler/src/queue/mod.rs @@ -8,6 +8,7 @@ mod tasks_test; mod test; use std::collections::BTreeMap; +use std::fs::File as StdFile; use std::time::Duration; use file_store::FileStore; @@ -216,6 +217,11 @@ impl Queue { } } + /// Open and returns the task's content File. + pub fn update_file(&self, uuid: Uuid) -> file_store::Result { + self.file_store.get_update(uuid) + } + /// Delete a file from the index scheduler. /// /// Counterpart to the [`create_update_file`](IndexScheduler::create_update_file) method. 
diff --git a/crates/index-scheduler/src/queue/test.rs b/crates/index-scheduler/src/queue/test.rs index eb3314496..3dbdd2db3 100644 --- a/crates/index-scheduler/src/queue/test.rs +++ b/crates/index-scheduler/src/queue/test.rs @@ -326,7 +326,7 @@ fn test_auto_deletion_of_tasks() { fn test_task_queue_is_full() { let (index_scheduler, mut handle) = IndexScheduler::test_with_custom_config(vec![], |config| { // that's the minimum map size possible - config.task_db_size = 1048576; + config.task_db_size = 1048576 * 3; None }); diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs index eddf8fba7..9268bf3e7 100644 --- a/crates/index-scheduler/src/scheduler/mod.rs +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -166,13 +166,41 @@ impl IndexScheduler { let processing_batch = &mut processing_batch; let progress = progress.clone(); std::thread::scope(|s| { + let p = progress.clone(); let handle = std::thread::Builder::new() .name(String::from("batch-operation")) .spawn_scoped(s, move || { - cloned_index_scheduler.process_batch(batch, processing_batch, progress) + cloned_index_scheduler.process_batch(batch, processing_batch, p) }) .unwrap(); - handle.join().unwrap_or(Err(Error::ProcessBatchPanicked)) + + match handle.join() { + Ok(ret) => { + if ret.is_err() { + if let Ok(progress_view) = + serde_json::to_string(&progress.as_progress_view()) + { + tracing::warn!("Batch failed while doing: {progress_view}") + } + } + ret + } + Err(panic) => { + if let Ok(progress_view) = + serde_json::to_string(&progress.as_progress_view()) + { + tracing::warn!("Batch failed while doing: {progress_view}") + } + let msg = match panic.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match panic.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + Err(Error::ProcessBatchPanicked(msg.to_string())) + } + } }) }; diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 7eda1d56f..21233429c 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -2,7 +2,7 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; -use meilisearch_types::batches::BatchId; +use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::{self}; @@ -16,7 +16,10 @@ use crate::processing::{ InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, UpdateIndexProgress, }; -use crate::utils::{self, swap_index_uid_in_task, ProcessingBatch}; +use crate::utils::{ + self, remove_n_tasks_datetime_earlier_than, remove_task_datetime, swap_index_uid_in_task, + ProcessingBatch, +}; use crate::{Error, IndexScheduler, Result, TaskId}; impl IndexScheduler { @@ -323,8 +326,17 @@ impl IndexScheduler { match ret { Ok(Ok(())) => (), Ok(Err(e)) => return Err(Error::DatabaseUpgrade(Box::new(e))), - Err(_e) => { - return Err(Error::DatabaseUpgrade(Box::new(Error::ProcessBatchPanicked))); + Err(e) => { + let msg = match e.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match e.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + return Err(Error::DatabaseUpgrade(Box::new(Error::ProcessBatchPanicked( + msg.to_string(), + )))); } } @@ -418,7 +430,6 @@ impl IndexScheduler { 
to_delete_tasks -= &enqueued_tasks; // 2. We now have a list of tasks to delete, delete them - let mut affected_indexes = HashSet::new(); let mut affected_statuses = HashSet::new(); let mut affected_kinds = HashSet::new(); @@ -515,9 +526,51 @@ impl IndexScheduler { tasks -= &to_delete_tasks; // We must remove the batch entirely if tasks.is_empty() { - self.queue.batches.all_batches.delete(wtxn, &batch_id)?; - self.queue.batch_to_tasks_mapping.delete(wtxn, &batch_id)?; + if let Some(batch) = self.queue.batches.get_batch(wtxn, batch_id)? { + if let Some(BatchEnqueuedAt { earliest, oldest }) = batch.enqueued_at { + remove_task_datetime( + wtxn, + self.queue.batches.enqueued_at, + earliest, + batch_id, + )?; + remove_task_datetime( + wtxn, + self.queue.batches.enqueued_at, + oldest, + batch_id, + )?; + } else { + // If we don't have the enqueued at in the batch it means the database comes from the v1.12 + // and we still need to find the date by scrolling the database + remove_n_tasks_datetime_earlier_than( + wtxn, + self.queue.batches.enqueued_at, + batch.started_at, + batch.stats.total_nb_tasks.clamp(1, 2) as usize, + batch_id, + )?; + } + remove_task_datetime( + wtxn, + self.queue.batches.started_at, + batch.started_at, + batch_id, + )?; + if let Some(finished_at) = batch.finished_at { + remove_task_datetime( + wtxn, + self.queue.batches.finished_at, + finished_at, + batch_id, + )?; + } + + self.queue.batches.all_batches.delete(wtxn, &batch_id)?; + self.queue.batch_to_tasks_mapping.delete(wtxn, &batch_id)?; + } } + // Anyway, we must remove the batch from all its reverse indexes. // The only way to do that is to check diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index 09c1020ac..a6d785b2f 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeMap; use std::fs::File; use std::io::BufWriter; use std::sync::atomic::Ordering; @@ -11,7 +12,9 @@ use meilisearch_types::tasks::{Details, KindWithContent, Status, Task}; use time::macros::format_description; use time::OffsetDateTime; -use crate::processing::{AtomicDocumentStep, AtomicTaskStep, DumpCreationProgress}; +use crate::processing::{ + AtomicBatchStep, AtomicDocumentStep, AtomicTaskStep, DumpCreationProgress, +}; use crate::{Error, IndexScheduler, Result}; impl IndexScheduler { @@ -102,7 +105,40 @@ impl IndexScheduler { } dump_tasks.flush()?; - // 3. Dump the indexes + // 3. dump the batches + progress.update_progress(DumpCreationProgress::DumpTheBatches); + let mut dump_batches = dump.create_batches_queue()?; + + let (atomic_batch_progress, update_batch_progress) = + AtomicBatchStep::new(self.queue.batches.all_batches.len(&rtxn)? as u32); + progress.update_progress(update_batch_progress); + + for ret in self.queue.batches.all_batches.iter(&rtxn)? { + if self.scheduler.must_stop_processing.get() { + return Err(Error::AbortedTask); + } + + let (_, mut b) = ret?; + // In the case we're dumping ourselves we want to be marked as finished + // to not loop over ourselves indefinitely. + if b.uid == task.uid { + let finished_at = OffsetDateTime::now_utc(); + + // We're going to fake the date because we don't know if everything is going to go well. + // But we need to dump the task as finished and successful. + // If something fail everything will be set appropriately in the end. 
+ let mut statuses = BTreeMap::new(); + statuses.insert(Status::Succeeded, b.stats.total_nb_tasks); + b.stats.status = statuses; + b.finished_at = Some(finished_at); + } + + dump_batches.push_batch(&b)?; + atomic_batch_progress.fetch_add(1, Ordering::Relaxed); + } + dump_batches.flush()?; + + // 4. Dump the indexes progress.update_progress(DumpCreationProgress::DumpTheIndexes); let nb_indexes = self.index_mapper.index_mapping.len(&rtxn)? as u32; let mut count = 0; @@ -142,7 +178,7 @@ impl IndexScheduler { let documents = index .all_documents(&rtxn) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // 3.1. Dump the documents + // 4.1. Dump the documents for ret in documents { if self.scheduler.must_stop_processing.get() { return Err(Error::AbortedTask); @@ -204,7 +240,7 @@ impl IndexScheduler { atomic.fetch_add(1, Ordering::Relaxed); } - // 3.2. Dump the settings + // 4.2. Dump the settings let settings = meilisearch_types::settings::settings( index, &rtxn, @@ -215,10 +251,12 @@ impl IndexScheduler { Ok(()) })?; - // 4. Dump experimental feature settings + // 5. Dump experimental feature settings progress.update_progress(DumpCreationProgress::DumpTheExperimentalFeatures); let features = self.features().runtime_features(); dump.create_experimental_features(features)?; + let network = self.network(); + dump.create_network(network)?; let dump_uid = started_at.format(format_description!( "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]" diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap index dd3ed4c8a..89f87e29a 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap @@ -56,16 +56,13 @@ succeeded [1,] ### Batches Index Tasks: ---------------------------------------------------------------------- ### Batches Enqueued At: -[timestamp] [0,] [timestamp] [1,] [timestamp] [1,] ---------------------------------------------------------------------- ### Batches Started At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### Batches Finished At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### File Store: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/task_deletion_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/task_deletion_processed.snap index 9512a8d8d..135b272cd 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/task_deletion_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/task_deletion_processed.snap @@ -54,15 +54,12 @@ succeeded [1,] ### Batches Index Tasks: ---------------------------------------------------------------------- ### Batches Enqueued At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### Batches Started At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### 
Batches Finished At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### File Store: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap index 3f3a6f769..b0c450092 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap @@ -7,7 +7,7 @@ snapshot_kind: text [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "An unexpected crash occurred when processing the task.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} +0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "An unexpected crash occurred when processing the task: simulated panic", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_removing_the_upgrade_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_removing_the_upgrade_tasks.snap index 9e490843e..4c828b71d 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_removing_the_upgrade_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_removing_the_upgrade_tasks.snap @@ -87,7 +87,6 @@ doggo [2,3,] girafo [4,] ---------------------------------------------------------------------- ### Batches Enqueued At: -[timestamp] [0,] [timestamp] [1,] [timestamp] [2,] [timestamp] [3,] @@ -95,7 +94,6 @@ girafo [4,] [timestamp] [5,] ---------------------------------------------------------------------- ### Batches Started At: -[timestamp] [0,] [timestamp] [1,] [timestamp] [2,] [timestamp] [3,] @@ -103,7 +101,6 @@ girafo [4,] [timestamp] [5,] ---------------------------------------------------------------------- ### Batches Finished At: -[timestamp] [0,] [timestamp] [1,] [timestamp] [2,] [timestamp] [3,] diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index a8ef88d56..44120ff64 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -903,7 +903,7 @@ fn create_and_list_index() { index_scheduler.index("kefir").unwrap(); let list = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 20).unwrap(); - snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r#" + snapshot!(json_string!(list, { "[1][0][1].created_at" => 
"[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r###" [ 1, [ @@ -912,6 +912,8 @@ fn create_and_list_index() { { "number_of_documents": 0, "database_size": "[bytes]", + "number_of_embeddings": 0, + "number_of_embedded_documents": 0, "used_database_size": "[bytes]", "primary_key": null, "field_distribution": {}, @@ -921,5 +923,5 @@ fn create_and_list_index() { ] ] ] - "#); + "###); } diff --git a/crates/index-scheduler/src/scheduler/test_failure.rs b/crates/index-scheduler/src/scheduler/test_failure.rs index 712fe01a5..5cdcb248b 100644 --- a/crates/index-scheduler/src/scheduler/test_failure.rs +++ b/crates/index-scheduler/src/scheduler/test_failure.rs @@ -6,8 +6,7 @@ use meili_snap::snapshot; use meilisearch_types::milli::obkv_to_json; use meilisearch_types::milli::update::IndexDocumentsMethod::*; use meilisearch_types::milli::update::Setting; -use meilisearch_types::tasks::Kind; -use meilisearch_types::tasks::KindWithContent; +use meilisearch_types::tasks::{Kind, KindWithContent}; use crate::insta_snapshot::snapshot_index_scheduler; use crate::test_utils::Breakpoint::*; diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 80a0bb5ff..42bf253ad 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeSet, HashSet}; use std::ops::Bound; -use meilisearch_types::batches::{Batch, BatchId, BatchStats}; +use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchId, BatchStats}; use meilisearch_types::heed::{Database, RoTxn, RwTxn}; use meilisearch_types::milli::CboRoaringBitmapCodec; use meilisearch_types::task_view::DetailsView; @@ -30,8 +30,7 @@ pub struct ProcessingBatch { pub kinds: HashSet, pub indexes: HashSet, pub canceled_by: HashSet, - pub oldest_enqueued_at: Option, - pub earliest_enqueued_at: Option, + pub enqueued_at: Option, pub started_at: OffsetDateTime, pub finished_at: Option, } @@ -51,8 +50,7 @@ impl ProcessingBatch { kinds: HashSet::default(), indexes: HashSet::default(), canceled_by: HashSet::default(), - oldest_enqueued_at: None, - earliest_enqueued_at: None, + enqueued_at: None, started_at: OffsetDateTime::now_utc(), finished_at: None, } @@ -80,14 +78,18 @@ impl ProcessingBatch { if let Some(canceled_by) = task.canceled_by { self.canceled_by.insert(canceled_by); } - self.oldest_enqueued_at = - Some(self.oldest_enqueued_at.map_or(task.enqueued_at, |oldest_enqueued_at| { - task.enqueued_at.min(oldest_enqueued_at) - })); - self.earliest_enqueued_at = - Some(self.earliest_enqueued_at.map_or(task.enqueued_at, |earliest_enqueued_at| { - task.enqueued_at.max(earliest_enqueued_at) - })); + match self.enqueued_at.as_mut() { + Some(BatchEnqueuedAt { earliest, oldest }) => { + *oldest = task.enqueued_at.min(*oldest); + *earliest = task.enqueued_at.max(*earliest); + } + None => { + self.enqueued_at = Some(BatchEnqueuedAt { + earliest: task.enqueued_at, + oldest: task.enqueued_at, + }); + } + } } } @@ -138,6 +140,7 @@ impl ProcessingBatch { stats: self.stats.clone(), started_at: self.started_at, finished_at: self.finished_at, + enqueued_at: self.enqueued_at, } } } @@ -174,6 +177,33 @@ pub(crate) fn remove_task_datetime( Ok(()) } +pub(crate) fn remove_n_tasks_datetime_earlier_than( + wtxn: &mut RwTxn, + database: Database, + earlier_than: OffsetDateTime, + mut count: usize, + task_id: TaskId, +) -> Result<()> { + let earlier_than = earlier_than.unix_timestamp_nanos(); + let mut iter = 
database.rev_range_mut(wtxn, &(..earlier_than))?; + while let Some((current, mut existing)) = iter.next().transpose()? { + count -= existing.remove(task_id) as usize; + + if existing.is_empty() { + // safety: We don't keep references to the database + unsafe { iter.del_current()? }; + } else { + // safety: We don't keep references to the database + unsafe { iter.put_current(&current, &existing)? }; + } + if count == 0 { + break; + } + } + + Ok(()) +} + pub(crate) fn keep_ids_within_datetimes( rtxn: &RoTxn, ids: &mut RoaringBitmap, @@ -329,14 +359,27 @@ impl crate::IndexScheduler { kind, } = task; assert_eq!(uid, task.uid); - if let Some(ref batch) = batch_uid { + if task.status != Status::Enqueued { + let batch_uid = batch_uid.expect("All non enqueued tasks must be part of a batch"); assert!(self .queue .batch_to_tasks_mapping - .get(&rtxn, batch) + .get(&rtxn, &batch_uid) .unwrap() .unwrap() .contains(uid)); + let batch = self.queue.batches.get_batch(&rtxn, batch_uid).unwrap().unwrap(); + assert_eq!(batch.uid, batch_uid); + if task.status == Status::Processing { + assert!(batch.progress.is_some()); + } else { + assert!(batch.progress.is_none()); + } + assert_eq!(batch.started_at, task.started_at.unwrap()); + assert_eq!(batch.finished_at, task.finished_at); + let enqueued_at = batch.enqueued_at.unwrap(); + assert!(task.enqueued_at >= enqueued_at.oldest); + assert!(task.enqueued_at <= enqueued_at.earliest); } if let Some(task_index_uid) = &task_index_uid { assert!(self diff --git a/crates/index-scheduler/src/versioning.rs b/crates/index-scheduler/src/versioning.rs index f4c502b6f..22132bf5f 100644 --- a/crates/index-scheduler/src/versioning.rs +++ b/crates/index-scheduler/src/versioning.rs @@ -1,9 +1,10 @@ -use crate::{upgrade::upgrade_index_scheduler, Result}; -use meilisearch_types::{ - heed::{types::Str, Database, Env, RoTxn, RwTxn}, - milli::heed_codec::version::VersionCodec, - versioning, -}; +use meilisearch_types::heed::types::Str; +use meilisearch_types::heed::{self, Database, Env, RoTxn, RwTxn}; +use meilisearch_types::milli::heed_codec::version::VersionCodec; +use meilisearch_types::versioning; + +use crate::upgrade::upgrade_index_scheduler; +use crate::Result; /// The number of database used by queue itself const NUMBER_OF_DATABASES: u32 = 1; @@ -21,30 +22,38 @@ pub struct Versioning { } impl Versioning { - pub(crate) const fn nb_db() -> u32 { + pub const fn nb_db() -> u32 { NUMBER_OF_DATABASES } - pub fn get_version(&self, rtxn: &RoTxn) -> Result<Option<(u32, u32, u32)>> { - Ok(self.version.get(rtxn, entry_name::MAIN)?) + pub fn get_version(&self, rtxn: &RoTxn) -> Result<Option<(u32, u32, u32)>, heed::Error> { + self.version.get(rtxn, entry_name::MAIN) } - pub fn set_version(&self, wtxn: &mut RwTxn, version: (u32, u32, u32)) -> Result<()> { - Ok(self.version.put(wtxn, entry_name::MAIN, &version)?) + pub fn set_version( + &self, + wtxn: &mut RwTxn, + version: (u32, u32, u32), + ) -> Result<(), heed::Error> { + self.version.put(wtxn, entry_name::MAIN, &version) } - pub fn set_current_version(&self, wtxn: &mut RwTxn) -> Result<()> { + pub fn set_current_version(&self, wtxn: &mut RwTxn) -> Result<(), heed::Error> { let major = versioning::VERSION_MAJOR.parse().unwrap(); let minor = versioning::VERSION_MINOR.parse().unwrap(); let patch = versioning::VERSION_PATCH.parse().unwrap(); self.set_version(wtxn, (major, minor, patch)) } - /// Create an index scheduler and start its run loop.
+ /// Return `Self` without checking anything about the version + pub fn raw_new(env: &Env, wtxn: &mut RwTxn) -> Result { + let version = env.create_database(wtxn, Some(db_name::VERSION))?; + Ok(Self { version }) + } + pub(crate) fn new(env: &Env, db_version: (u32, u32, u32)) -> Result { let mut wtxn = env.write_txn()?; - let version = env.create_database(&mut wtxn, Some(db_name::VERSION))?; - let this = Self { version }; + let this = Self::raw_new(env, &mut wtxn)?; let from = match this.get_version(&wtxn)? { Some(version) => version, // fresh DB: use the db version diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index 7910a5af4..663f5cb8d 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -24,9 +24,35 @@ pub struct Batch { pub started_at: OffsetDateTime, #[serde(with = "time::serde::rfc3339::option")] pub finished_at: Option, + + // Enqueued at is never displayed and is only required when removing a batch. + // It's always some except when upgrading from a database pre v1.12 + pub enqueued_at: Option, } -#[derive(Default, Debug, Clone, Serialize, Deserialize, ToSchema)] +impl PartialEq for Batch { + fn eq(&self, other: &Self) -> bool { + let Self { uid, progress, details, stats, started_at, finished_at, enqueued_at } = self; + + *uid == other.uid + && progress.is_none() == other.progress.is_none() + && details == &other.details + && stats == &other.stats + && started_at == &other.started_at + && finished_at == &other.finished_at + && enqueued_at == &other.enqueued_at + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct BatchEnqueuedAt { + #[serde(with = "time::serde::rfc3339")] + pub earliest: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub oldest: OffsetDateTime, +} + +#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] #[schema(rename_all = "camelCase")] pub struct BatchStats { diff --git a/crates/meilisearch-types/src/deserr/mod.rs b/crates/meilisearch-types/src/deserr/mod.rs index 3c5e0fcf8..f5ad18d5c 100644 --- a/crates/meilisearch-types/src/deserr/mod.rs +++ b/crates/meilisearch-types/src/deserr/mod.rs @@ -193,6 +193,8 @@ merge_with_error_impl_take_error_message!(ParseTaskKindError); merge_with_error_impl_take_error_message!(ParseTaskStatusError); merge_with_error_impl_take_error_message!(IndexUidFormatError); merge_with_error_impl_take_error_message!(InvalidMultiSearchWeight); +merge_with_error_impl_take_error_message!(InvalidNetworkUrl); +merge_with_error_impl_take_error_message!(InvalidNetworkSearchApiKey); merge_with_error_impl_take_error_message!(InvalidSearchSemanticRatio); merge_with_error_impl_take_error_message!(InvalidSearchRankingScoreThreshold); merge_with_error_impl_take_error_message!(InvalidSimilarRankingScoreThreshold); diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 8caeb70c2..f64301b8c 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -260,7 +260,13 @@ InvalidMultiSearchMergeFacets , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchQueryFacets , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchQueryPagination , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchQueryRankingRules , InvalidRequest , BAD_REQUEST ; +InvalidMultiSearchQueryPosition , InvalidRequest , BAD_REQUEST ; +InvalidMultiSearchRemote , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchWeight 
, InvalidRequest , BAD_REQUEST ; +InvalidNetworkRemotes , InvalidRequest , BAD_REQUEST ; +InvalidNetworkSelf , InvalidRequest , BAD_REQUEST ; +InvalidNetworkSearchApiKey , InvalidRequest , BAD_REQUEST ; +InvalidNetworkUrl , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; @@ -351,14 +357,22 @@ MissingDocumentId , InvalidRequest , BAD_REQUEST ; MissingFacetSearchFacetName , InvalidRequest , BAD_REQUEST ; MissingIndexUid , InvalidRequest , BAD_REQUEST ; MissingMasterKey , Auth , UNAUTHORIZED ; +MissingNetworkUrl , InvalidRequest , BAD_REQUEST ; MissingPayload , InvalidRequest , BAD_REQUEST ; MissingSearchHybrid , InvalidRequest , BAD_REQUEST ; MissingSwapIndexes , InvalidRequest , BAD_REQUEST ; MissingTaskFilters , InvalidRequest , BAD_REQUEST ; NoSpaceLeftOnDevice , System , UNPROCESSABLE_ENTITY; PayloadTooLarge , InvalidRequest , PAYLOAD_TOO_LARGE ; +RemoteBadResponse , System , BAD_GATEWAY ; +RemoteBadRequest , InvalidRequest , BAD_REQUEST ; +RemoteCouldNotSendRequest , System , BAD_GATEWAY ; +RemoteInvalidApiKey , Auth , FORBIDDEN ; +RemoteRemoteError , System , BAD_GATEWAY ; +RemoteTimeout , System , BAD_GATEWAY ; TooManySearchRequests , System , SERVICE_UNAVAILABLE ; TaskNotFound , InvalidRequest , NOT_FOUND ; +TaskFileNotFound , InvalidRequest , NOT_FOUND ; BatchNotFound , InvalidRequest , NOT_FOUND ; TooManyOpenFiles , System , UNPROCESSABLE_ENTITY ; TooManyVectors , InvalidRequest , BAD_REQUEST ; @@ -583,6 +597,18 @@ impl fmt::Display for deserr_codes::InvalidSimilarRankingScoreThreshold { } } +impl fmt::Display for deserr_codes::InvalidNetworkUrl { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "the value of `url` is invalid, expected a string.") + } +} + +impl fmt::Display for deserr_codes::InvalidNetworkSearchApiKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "the value of `searchApiKey` is invalid, expected a string.") + } +} + #[macro_export] macro_rules! 
internal_error { ($target:ty : $($other:path), *) => { diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs index ba67f996b..37a504039 100644 --- a/crates/meilisearch-types/src/features.rs +++ b/crates/meilisearch-types/src/features.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, Copy, Default, PartialEq, Eq)] @@ -7,6 +9,8 @@ pub struct RuntimeTogglableFeatures { pub logs_route: bool, pub edit_documents_by_function: bool, pub contains_filter: bool, + pub network: bool, + pub get_task_documents_route: bool, } #[derive(Default, Debug, Clone, Copy)] @@ -15,3 +19,20 @@ pub struct InstanceTogglableFeatures { pub logs_route: bool, pub contains_filter: bool, } + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct Remote { + pub url: String, + #[serde(default)] + pub search_api_key: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +pub struct Network { + #[serde(default, rename = "self")] + pub local: Option, + #[serde(default)] + pub remotes: BTreeMap, +} diff --git a/crates/meilisearch-types/src/index_uid.rs b/crates/meilisearch-types/src/index_uid.rs index 4bf126794..87efd261c 100644 --- a/crates/meilisearch-types/src/index_uid.rs +++ b/crates/meilisearch-types/src/index_uid.rs @@ -4,13 +4,14 @@ use std::fmt; use std::str::FromStr; use deserr::Deserr; +use serde::Serialize; use utoipa::ToSchema; use crate::error::{Code, ErrorCode}; /// An index uid is composed of only ascii alphanumeric characters, - and _, between 1 and 400 /// bytes long -#[derive(Debug, Clone, PartialEq, Eq, Deserr, PartialOrd, Ord, ToSchema)] +#[derive(Debug, Clone, PartialEq, Eq, Deserr, PartialOrd, Ord, Serialize, ToSchema)] #[deserr(try_from(String) = IndexUid::try_from -> IndexUidFormatError)] #[schema(value_type = String, example = "movies")] pub struct IndexUid(String); diff --git a/crates/meilisearch-types/src/keys.rs b/crates/meilisearch-types/src/keys.rs index 8fcbab14d..27f2047ee 100644 --- a/crates/meilisearch-types/src/keys.rs +++ b/crates/meilisearch-types/src/keys.rs @@ -302,6 +302,12 @@ pub enum Action { #[serde(rename = "experimental.update")] #[deserr(rename = "experimental.update")] ExperimentalFeaturesUpdate, + #[serde(rename = "network.get")] + #[deserr(rename = "network.get")] + NetworkGet, + #[serde(rename = "network.update")] + #[deserr(rename = "network.update")] + NetworkUpdate, } impl Action { @@ -341,6 +347,8 @@ impl Action { KEYS_DELETE => Some(Self::KeysDelete), EXPERIMENTAL_FEATURES_GET => Some(Self::ExperimentalFeaturesGet), EXPERIMENTAL_FEATURES_UPDATE => Some(Self::ExperimentalFeaturesUpdate), + NETWORK_GET => Some(Self::NetworkGet), + NETWORK_UPDATE => Some(Self::NetworkUpdate), _otherwise => None, } } @@ -386,4 +394,7 @@ pub mod actions { pub const KEYS_DELETE: u8 = KeysDelete.repr(); pub const EXPERIMENTAL_FEATURES_GET: u8 = ExperimentalFeaturesGet.repr(); pub const EXPERIMENTAL_FEATURES_UPDATE: u8 = ExperimentalFeaturesUpdate.repr(); + + pub const NETWORK_GET: u8 = NetworkGet.repr(); + pub const NETWORK_UPDATE: u8 = NetworkUpdate.repr(); } diff --git a/crates/meilisearch-types/src/versioning.rs b/crates/meilisearch-types/src/versioning.rs index f009002d1..07e42c2ce 100644 --- a/crates/meilisearch-types/src/versioning.rs +++ b/crates/meilisearch-types/src/versioning.rs @@ -1,7 +1,10 @@ use std::fs; -use std::io::{self, 
ErrorKind}; +use std::io::{ErrorKind, Write}; use std::path::Path; +use milli::heed; +use tempfile::NamedTempFile; + /// The name of the file that contains the version of the database. pub const VERSION_FILE_NAME: &str = "VERSION"; @@ -10,37 +13,7 @@ pub static VERSION_MINOR: &str = env!("CARGO_PKG_VERSION_MINOR"); pub static VERSION_PATCH: &str = env!("CARGO_PKG_VERSION_PATCH"); /// Persists the version of the current Meilisearch binary to a VERSION file -pub fn update_version_file_for_dumpless_upgrade( - db_path: &Path, - from: (u32, u32, u32), - to: (u32, u32, u32), -) -> Result<(), VersionFileError> { - let (from_major, from_minor, from_patch) = from; - let (to_major, to_minor, to_patch) = to; - - if from_major > to_major - || (from_major == to_major && from_minor > to_minor) - || (from_major == to_major && from_minor == to_minor && from_patch > to_patch) - { - Err(VersionFileError::DowngradeNotSupported { - major: from_major, - minor: from_minor, - patch: from_patch, - }) - } else if from_major < 1 || (from_major == to_major && from_minor < 12) { - Err(VersionFileError::TooOldForAutomaticUpgrade { - major: from_major, - minor: from_minor, - patch: from_patch, - }) - } else { - create_current_version_file(db_path)?; - Ok(()) - } -} - -/// Persists the version of the current Meilisearch binary to a VERSION file -pub fn create_current_version_file(db_path: &Path) -> io::Result<()> { +pub fn create_current_version_file(db_path: &Path) -> anyhow::Result<()> { create_version_file(db_path, VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) } @@ -49,9 +22,14 @@ pub fn create_version_file( major: &str, minor: &str, patch: &str, -) -> io::Result<()> { +) -> anyhow::Result<()> { let version_path = db_path.join(VERSION_FILE_NAME); - fs::write(version_path, format!("{}.{}.{}", major, minor, patch)) + // In order to persist the file later we must create it in the `data.ms` and not in `/tmp` + let mut file = NamedTempFile::new_in(db_path)?; + file.write_all(format!("{}.{}.{}", major, minor, patch).as_bytes())?; + file.flush()?; + file.persist(version_path)?; + Ok(()) } pub fn get_version(db_path: &Path) -> Result<(u32, u32, u32), VersionFileError> { @@ -61,7 +39,7 @@ pub fn get_version(db_path: &Path) -> Result<(u32, u32, u32), VersionFileError> Ok(version) => parse_version(&version), Err(error) => match error.kind() { ErrorKind::NotFound => Err(VersionFileError::MissingVersionFile), - _ => Err(error.into()), + _ => Err(anyhow::Error::from(error).into()), }, } } @@ -112,7 +90,9 @@ pub enum VersionFileError { DowngradeNotSupported { major: u32, minor: u32, patch: u32 }, #[error("Database version {major}.{minor}.{patch} is too old for the experimental dumpless upgrade feature. 
Please generate a dump using the v{major}.{minor}.{patch} and import it in the v{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_PATCH}")] TooOldForAutomaticUpgrade { major: u32, minor: u32, patch: u32 }, + #[error("Error while modifying the database: {0}")] + ErrorWhileModifyingTheDatabase(#[from] heed::Error), #[error(transparent)] - IoError(#[from] std::io::Error), + AnyhowError(#[from] anyhow::Error), } diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index 9fc212cc4..63882468a 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -31,6 +31,7 @@ use crate::routes::{create_all_stats, Stats}; use crate::Opt; const ANALYTICS_HEADER: &str = "X-Meilisearch-Client"; +const MEILI_SERVER_PROVIDER: &str = "MEILI_SERVER_PROVIDER"; /// Write the instance-uid in the `data.ms` and in `~/.config/MeiliSearch/path-to-db-instance-uid`. Ignore the errors. fn write_user_id(db_path: &Path, user_id: &InstanceUid) { @@ -195,6 +196,8 @@ struct Infos { experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, experimental_limit_batched_tasks_total_size: u64, + experimental_network: bool, + experimental_get_task_documents_route: bool, gpu_enabled: bool, db_path: bool, import_dump: bool, @@ -285,6 +288,8 @@ impl Infos { logs_route, edit_documents_by_function, contains_filter, + network, + get_task_documents_route, } = features; // We're going to override every sensible information. @@ -302,6 +307,8 @@ impl Infos { experimental_replication_parameters, experimental_enable_logs_route: experimental_enable_logs_route | logs_route, experimental_reduce_indexing_memory_usage, + experimental_network: network, + experimental_get_task_documents_route: get_task_documents_route, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), @@ -357,7 +364,7 @@ impl Segment { "cores": sys.cpus().len(), "ram_size": sys.total_memory(), "disk_size": disks.iter().map(|disk| disk.total_space()).max(), - "server_provider": std::env::var("MEILI_SERVER_PROVIDER").ok(), + "server_provider": std::env::var(MEILI_SERVER_PROVIDER).ok(), }) }); let number_of_documents = @@ -380,10 +387,18 @@ impl Segment { index_scheduler: Arc, auth_controller: Arc, ) { - const INTERVAL: Duration = Duration::from_secs(60 * 60); // one hour - // The first batch must be sent after one hour. + let interval: Duration = match std::env::var(MEILI_SERVER_PROVIDER) { + Ok(provider) if provider.starts_with("meili_cloud:") => { + Duration::from_secs(60 * 60) // one hour + } + _ => { + // We're an open source instance + Duration::from_secs(60 * 60 * 24) // one day + } + }; + let mut interval = - tokio::time::interval_at(tokio::time::Instant::now() + INTERVAL, INTERVAL); + tokio::time::interval_at(tokio::time::Instant::now() + interval, interval); loop { select! 
{ diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 4d41c63ea..e22b6dff3 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -32,6 +32,7 @@ use analytics::Analytics; use anyhow::bail; use error::PayloadError; use extractors::payload::PayloadConfig; +use index_scheduler::versioning::Versioning; use index_scheduler::{IndexScheduler, IndexSchedulerOptions}; use meilisearch_auth::AuthController; use meilisearch_types::milli::constants::VERSION_MAJOR; @@ -40,10 +41,9 @@ use meilisearch_types::milli::update::{IndexDocumentsConfig, IndexDocumentsMetho use meilisearch_types::settings::apply_settings_to_builder; use meilisearch_types::tasks::KindWithContent; use meilisearch_types::versioning::{ - create_current_version_file, get_version, update_version_file_for_dumpless_upgrade, - VersionFileError, VERSION_MINOR, VERSION_PATCH, + create_current_version_file, get_version, VersionFileError, VERSION_MINOR, VERSION_PATCH, }; -use meilisearch_types::{compression, milli, VERSION_FILE_NAME}; +use meilisearch_types::{compression, heed, milli, VERSION_FILE_NAME}; pub use option::Opt; use option::ScheduleSnapshot; use search_queue::SearchQueue; @@ -356,14 +356,19 @@ fn open_or_create_database_unchecked( /// Ensures Meilisearch version is compatible with the database, returns an error in case of version mismatch. /// Returns the version that was contained in the version file -fn check_version(opt: &Opt, binary_version: (u32, u32, u32)) -> anyhow::Result<(u32, u32, u32)> { +fn check_version( + opt: &Opt, + index_scheduler_opt: &IndexSchedulerOptions, + binary_version: (u32, u32, u32), +) -> anyhow::Result<(u32, u32, u32)> { let (bin_major, bin_minor, bin_patch) = binary_version; let (db_major, db_minor, db_patch) = get_version(&opt.db_path)?; if db_major != bin_major || db_minor != bin_minor || db_patch > bin_patch { if opt.experimental_dumpless_upgrade { update_version_file_for_dumpless_upgrade( - &opt.db_path, + opt, + index_scheduler_opt, (db_major, db_minor, db_patch), (bin_major, bin_minor, bin_patch), )?; @@ -380,6 +385,57 @@ fn check_version(opt: &Opt, binary_version: (u32, u32, u32)) -> anyhow::Result<( Ok((db_major, db_minor, db_patch)) } +/// Persists the version of the current Meilisearch binary to a VERSION file +pub fn update_version_file_for_dumpless_upgrade( + opt: &Opt, + index_scheduler_opt: &IndexSchedulerOptions, + from: (u32, u32, u32), + to: (u32, u32, u32), +) -> Result<(), VersionFileError> { + let (from_major, from_minor, from_patch) = from; + let (to_major, to_minor, to_patch) = to; + + // Early exit in case of error + if from_major > to_major + || (from_major == to_major && from_minor > to_minor) + || (from_major == to_major && from_minor == to_minor && from_patch > to_patch) + { + return Err(VersionFileError::DowngradeNotSupported { + major: from_major, + minor: from_minor, + patch: from_patch, + }); + } else if from_major < 1 || (from_major == to_major && from_minor < 12) { + return Err(VersionFileError::TooOldForAutomaticUpgrade { + major: from_major, + minor: from_minor, + patch: from_patch, + }); + } + + // In the case of v1.12, the index-scheduler didn't store its internal version at the time. + // => We must write it immediately **in the index-scheduler** otherwise we'll update the version file + // there is a risk of DB corruption if a restart happens after writing the version file but before + // writing the version in the index-scheduler. 
See + if from_major == 1 && from_minor == 12 { + let env = unsafe { + heed::EnvOpenOptions::new() + .max_dbs(Versioning::nb_db()) + .map_size(index_scheduler_opt.task_db_size) + .open(&index_scheduler_opt.tasks_path) + }?; + let mut wtxn = env.write_txn()?; + let versioning = Versioning::raw_new(&env, &mut wtxn)?; + versioning.set_version(&mut wtxn, (from_major, from_minor, from_patch))?; + wtxn.commit()?; + // Should be instant since we're the only one using the env + env.prepare_for_closing().wait(); + } + + create_current_version_file(&opt.db_path)?; + Ok(()) +} + /// Ensure you're in a valid state and open the IndexScheduler + AuthController for you. fn open_or_create_database( opt: &Opt, @@ -387,7 +443,11 @@ fn open_or_create_database( empty_db: bool, binary_version: (u32, u32, u32), ) -> anyhow::Result<(IndexScheduler, AuthController)> { - let version = if !empty_db { check_version(opt, binary_version)? } else { binary_version }; + let version = if !empty_db { + check_version(opt, &index_scheduler_opt, binary_version)? + } else { + binary_version + }; open_or_create_database_unchecked(opt, index_scheduler_opt, OnFailure::KeepDb, version) } @@ -431,10 +491,13 @@ fn import_dump( keys.push(key); } - // 3. Import the runtime features. + // 3. Import the runtime features and network let features = dump_reader.features()?.unwrap_or_default(); index_scheduler.put_runtime_features(features)?; + let network = dump_reader.network()?.cloned().unwrap_or_default(); + index_scheduler.put_network(network)?; + let indexer_config = index_scheduler.indexer_config(); // /!\ The tasks must be imported AFTER importing the indexes or else the scheduler might @@ -508,9 +571,15 @@ fn import_dump( index_scheduler.refresh_index_stats(&uid)?; } + // 5. Import the queue let mut index_scheduler_dump = index_scheduler.register_dumped_task()?; + // 5.1. Import the batches + for ret in dump_reader.batches()? { + let batch = ret?; + index_scheduler_dump.register_dumped_batch(batch)?; + } - // 5. Import the tasks. + // 5.2. Import the tasks for ret in dump_reader.tasks()? 
{ let (task, file) = ret?; index_scheduler_dump.register_dumped_task(task, file)?; diff --git a/crates/meilisearch/src/routes/features.rs b/crates/meilisearch/src/routes/features.rs index f46bda5a0..402bc11ae 100644 --- a/crates/meilisearch/src/routes/features.rs +++ b/crates/meilisearch/src/routes/features.rs @@ -50,6 +50,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) { logs_route: Some(false), edit_documents_by_function: Some(false), contains_filter: Some(false), + network: Some(false), + get_task_documents_route: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -88,6 +90,10 @@ pub struct RuntimeTogglableFeatures { pub edit_documents_by_function: Option, #[deserr(default)] pub contains_filter: Option, + #[deserr(default)] + pub network: Option, + #[deserr(default)] + pub get_task_documents_route: Option, } impl From for RuntimeTogglableFeatures { @@ -97,6 +103,8 @@ impl From for RuntimeTogg logs_route, edit_documents_by_function, contains_filter, + network, + get_task_documents_route, } = value; Self { @@ -104,6 +112,8 @@ impl From for RuntimeTogg logs_route: Some(logs_route), edit_documents_by_function: Some(edit_documents_by_function), contains_filter: Some(contains_filter), + network: Some(network), + get_task_documents_route: Some(get_task_documents_route), } } } @@ -114,6 +124,8 @@ pub struct PatchExperimentalFeatureAnalytics { logs_route: bool, edit_documents_by_function: bool, contains_filter: bool, + network: bool, + get_task_documents_route: bool, } impl Aggregate for PatchExperimentalFeatureAnalytics { @@ -127,6 +139,8 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { logs_route: new.logs_route, edit_documents_by_function: new.edit_documents_by_function, contains_filter: new.contains_filter, + network: new.network, + get_task_documents_route: new.get_task_documents_route, }) } @@ -149,6 +163,8 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { logs_route: Some(false), edit_documents_by_function: Some(false), contains_filter: Some(false), + network: Some(false), + get_task_documents_route: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -181,16 +197,23 @@ async fn patch_features( .edit_documents_by_function .unwrap_or(old_features.edit_documents_by_function), contains_filter: new_features.0.contains_filter.unwrap_or(old_features.contains_filter), + network: new_features.0.network.unwrap_or(old_features.network), + get_task_documents_route: new_features + .0 + .get_task_documents_route + .unwrap_or(old_features.get_task_documents_route), }; // explicitly destructure for analytics rather than using the `Serialize` implementation, because - // the it renames to camelCase, which we don't want for analytics. + // it renames to camelCase, which we don't want for analytics. // **Do not** ignore fields with `..` or `_` here, because we want to add them in the future. 
let meilisearch_types::features::RuntimeTogglableFeatures { metrics, logs_route, edit_documents_by_function, contains_filter, + network, + get_task_documents_route, } = new_features; analytics.publish( @@ -199,6 +222,8 @@ async fn patch_features( logs_route, edit_documents_by_function, contains_filter, + network, + get_task_documents_route, }, &req, ); diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index a03d5f691..7ca8e407f 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -496,6 +496,12 @@ pub struct IndexStats { pub number_of_documents: u64, /// Whether or not the index is currently ingesting document pub is_indexing: bool, + /// Number of embeddings in the index + #[serde(skip_serializing_if = "Option::is_none")] + pub number_of_embeddings: Option, + /// Number of embedded documents in the index + #[serde(skip_serializing_if = "Option::is_none")] + pub number_of_embedded_documents: Option, /// Association of every field name with the number of times it occurs in the documents. #[schema(value_type = HashMap)] pub field_distribution: FieldDistribution, @@ -506,6 +512,8 @@ impl From for IndexStats { IndexStats { number_of_documents: stats.inner_stats.number_of_documents, is_indexing: stats.is_indexing, + number_of_embeddings: stats.inner_stats.number_of_embeddings, + number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents, field_distribution: stats.inner_stats.field_distribution, } } @@ -524,6 +532,8 @@ impl From for IndexStats { (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!( { "numberOfDocuments": 10, + "numberOfEmbeddings": 10, + "numberOfEmbeddedDocuments": 10, "isIndexing": true, "fieldDistribution": { "genre": 10, diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 3dcefdf46..65a12b692 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -34,6 +34,7 @@ use crate::routes::features::RuntimeTogglableFeatures; use crate::routes::indexes::documents::{DocumentDeletionByFilter, DocumentEditionByFunction}; use crate::routes::indexes::IndexView; use crate::routes::multi_search::SearchResults; +use crate::routes::network::{Network, Remote}; use crate::routes::swap_indexes::SwapIndexesPayload; use crate::search::{ FederatedSearch, FederatedSearchResult, Federation, FederationOptions, MergeFacets, @@ -54,6 +55,7 @@ mod logs; mod metrics; mod multi_search; mod multi_search_analytics; +pub mod network; mod open_api_utils; mod snapshot; mod swap_indexes; @@ -75,6 +77,7 @@ pub mod tasks; (path = "/multi-search", api = multi_search::MultiSearchApi), (path = "/swap-indexes", api = swap_indexes::SwapIndexesApi), (path = "/experimental-features", api = features::ExperimentalFeaturesApi), + (path = "/network", api = network::NetworkApi), ), paths(get_health, get_version, get_stats), tags( @@ -85,7 +88,7 @@ pub mod tasks; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, 
UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote)) )] pub struct MeilisearchApi; @@ -103,7 +106,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/multi-search").configure(multi_search::configure)) .service(web::scope("/swap-indexes").configure(swap_indexes::configure)) .service(web::scope("/metrics").configure(metrics::configure)) - .service(web::scope("/experimental-features").configure(features::configure)); + .service(web::scope("/experimental-features").configure(features::configure)) + .service(web::scope("/network").configure(network::configure)); #[cfg(feature = "swagger")] { @@ -359,9 +363,9 @@ pub async fn running() -> HttpResponse { #[derive(Serialize, Debug, ToSchema)] #[serde(rename_all = "camelCase")] pub struct Stats { - /// The size of the database, in bytes. + /// The disk space used by the database, in bytes. pub database_size: u64, - #[serde(skip)] + /// The size of the database, in bytes. pub used_database_size: u64, /// The date of the last update in the RFC 3339 formats. Can be `null` if no update has ever been processed. #[serde(serialize_with = "time::serde::rfc3339::option::serialize")] @@ -383,6 +387,7 @@ pub struct Stats { (status = 200, description = "The stats of the instance", body = Stats, content_type = "application/json", example = json!( { "databaseSize": 567, + "usedDatabaseSize": 456, "lastUpdate": "2019-11-20T09:40:33.711324Z", "indexes": { "movies": { diff --git a/crates/meilisearch/src/routes/multi_search.rs b/crates/meilisearch/src/routes/multi_search.rs index fcc3cd700..b3af98fd5 100644 --- a/crates/meilisearch/src/routes/multi_search.rs +++ b/crates/meilisearch/src/routes/multi_search.rs @@ -20,6 +20,7 @@ use crate::routes::indexes::search::search_kind; use crate::search::{ add_search_rules, perform_federated_search, perform_search, FederatedSearch, FederatedSearchResult, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex, + PROXY_SEARCH_HEADER, PROXY_SEARCH_HEADER_VALUE, }; use crate::search_queue::SearchQueue; @@ -48,6 +49,7 @@ pub struct SearchResults { /// Bundle multiple search queries in a single API request. Use this endpoint to search through multiple indexes at once. 
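// Illustration only (hypothetical request shape; `federationOptions.remote` and the node name
// "ms-1" are assumptions drawn from the analytics and /network changes in this diff): with the
// experimental `network` feature enabled, a federated query can target a configured remote, e.g.
//   POST /multi-search
//   { "federation": {}, "queries": [
//       { "indexUid": "movies", "q": "batman", "federationOptions": { "remote": "ms-1" } } ] }
// The `PROXY_SEARCH_HEADER` check below presumably marks requests forwarded by another instance
// rather than issued directly by a client.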
#[utoipa::path( post, + request_body = FederatedSearch, path = "", tag = "Multi-search", security(("Bearer" = ["search", "*"])), @@ -186,18 +188,22 @@ pub async fn multi_search_with_post( let response = match federation { Some(federation) => { - let search_result = tokio::task::spawn_blocking(move || { - perform_federated_search(&index_scheduler, queries, federation, features) - }) - .await; + // check remote header + let is_proxy = req + .headers() + .get(PROXY_SEARCH_HEADER) + .is_some_and(|value| value.as_bytes() == PROXY_SEARCH_HEADER_VALUE.as_bytes()); + let search_result = + perform_federated_search(&index_scheduler, queries, federation, features, is_proxy) + .await; permit.drop().await; - if let Ok(Ok(_)) = search_result { + if search_result.is_ok() { multi_aggregate.succeed(); } analytics.publish(multi_aggregate, &req); - HttpResponse::Ok().json(search_result??) + HttpResponse::Ok().json(search_result?) } None => { // Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only, diff --git a/crates/meilisearch/src/routes/multi_search_analytics.rs b/crates/meilisearch/src/routes/multi_search_analytics.rs index 3d07f471c..3fa23f630 100644 --- a/crates/meilisearch/src/routes/multi_search_analytics.rs +++ b/crates/meilisearch/src/routes/multi_search_analytics.rs @@ -13,6 +13,8 @@ pub struct MultiSearchAggregator { // sum of the number of distinct indexes in each single request, use with total_received to compute an avg total_distinct_index_count: usize, + // sum of the number of distinct remotes in each single request, use with total_received to compute an avg + total_distinct_remote_count: usize, // number of queries with a single index, use with total_received to compute a proportion total_single_index: usize, @@ -31,46 +33,49 @@ impl MultiSearchAggregator { pub fn from_federated_search(federated_search: &FederatedSearch) -> Self { let use_federation = federated_search.federation.is_some(); - let distinct_indexes: HashSet<_> = federated_search - .queries - .iter() - .map(|query| { - let query = &query; - // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex - let SearchQueryWithIndex { - index_uid, - federation_options: _, - q: _, - vector: _, - offset: _, - limit: _, - page: _, - hits_per_page: _, - attributes_to_retrieve: _, - retrieve_vectors: _, - attributes_to_crop: _, - crop_length: _, - attributes_to_highlight: _, - show_ranking_score: _, - show_ranking_score_details: _, - show_matches_position: _, - filter: _, - sort: _, - distinct: _, - facets: _, - highlight_pre_tag: _, - highlight_post_tag: _, - crop_marker: _, - matching_strategy: _, - attributes_to_search_on: _, - hybrid: _, - ranking_score_threshold: _, - locales: _, - } = query; + let mut distinct_indexes = HashSet::with_capacity(federated_search.queries.len()); + let mut distinct_remotes = HashSet::with_capacity(federated_search.queries.len()); - index_uid.as_str() - }) - .collect(); + // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex + for SearchQueryWithIndex { + index_uid, + federation_options, + q: _, + vector: _, + offset: _, + limit: _, + page: _, + hits_per_page: _, + attributes_to_retrieve: _, + retrieve_vectors: _, + attributes_to_crop: _, + crop_length: _, + attributes_to_highlight: _, + show_ranking_score: _, + show_ranking_score_details: _, + show_matches_position: _, + filter: _, + sort: _, + distinct: _, + facets: _, + highlight_pre_tag: _, + highlight_post_tag: _, + 
crop_marker: _, + matching_strategy: _, + attributes_to_search_on: _, + hybrid: _, + ranking_score_threshold: _, + locales: _, + } in &federated_search.queries + { + if let Some(federation_options) = federation_options { + if let Some(remote) = &federation_options.remote { + distinct_remotes.insert(remote.as_str()); + } + } + + distinct_indexes.insert(index_uid.as_str()); + } let show_ranking_score = federated_search.queries.iter().any(|query| query.show_ranking_score); @@ -81,6 +86,7 @@ impl MultiSearchAggregator { total_received: 1, total_succeeded: 0, total_distinct_index_count: distinct_indexes.len(), + total_distinct_remote_count: distinct_remotes.len(), total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, total_search_count: federated_search.queries.len(), show_ranking_score, @@ -110,6 +116,8 @@ impl Aggregate for MultiSearchAggregator { let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded); let total_distinct_index_count = this.total_distinct_index_count.saturating_add(new.total_distinct_index_count); + let total_distinct_remote_count = + this.total_distinct_remote_count.saturating_add(new.total_distinct_remote_count); let total_single_index = this.total_single_index.saturating_add(new.total_single_index); let total_search_count = this.total_search_count.saturating_add(new.total_search_count); let show_ranking_score = this.show_ranking_score || new.show_ranking_score; @@ -121,6 +129,7 @@ impl Aggregate for MultiSearchAggregator { total_received, total_succeeded, total_distinct_index_count, + total_distinct_remote_count, total_single_index, total_search_count, show_ranking_score, @@ -134,6 +143,7 @@ impl Aggregate for MultiSearchAggregator { total_received, total_succeeded, total_distinct_index_count, + total_distinct_remote_count, total_single_index, total_search_count, show_ranking_score, @@ -152,6 +162,10 @@ impl Aggregate for MultiSearchAggregator { "total_distinct_index_count": total_distinct_index_count, "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early }, + "remotes": { + "total_distinct_remote_count": total_distinct_remote_count, + "avg_distinct_remote_count": (total_distinct_remote_count as f64) / (total_received as f64), // not 0 else returned early + }, "searches": { "total_search_count": total_search_count, "avg_search_count": (total_search_count as f64) / (total_received as f64), diff --git a/crates/meilisearch/src/routes/network.rs b/crates/meilisearch/src/routes/network.rs new file mode 100644 index 000000000..458ae8cbf --- /dev/null +++ b/crates/meilisearch/src/routes/network.rs @@ -0,0 +1,261 @@ +use std::collections::BTreeMap; + +use actix_web::web::{self, Data}; +use actix_web::{HttpRequest, HttpResponse}; +use deserr::actix_web::AwebJson; +use deserr::Deserr; +use index_scheduler::IndexScheduler; +use itertools::{EitherOrBoth, Itertools}; +use meilisearch_types::deserr::DeserrJsonError; +use meilisearch_types::error::deserr_codes::{ + InvalidNetworkRemotes, InvalidNetworkSearchApiKey, InvalidNetworkSelf, InvalidNetworkUrl, +}; +use meilisearch_types::error::ResponseError; +use meilisearch_types::features::{Network as DbNetwork, Remote as DbRemote}; +use meilisearch_types::keys::actions; +use meilisearch_types::milli::update::Setting; +use serde::Serialize; +use tracing::debug; +use utoipa::{OpenApi, ToSchema}; + +use crate::analytics::{Aggregate, Analytics}; +use crate::extractors::authentication::policies::ActionPolicy; +use 
crate::extractors::authentication::GuardedData; +use crate::extractors::sequential_extractor::SeqHandler; + +#[derive(OpenApi)] +#[openapi( + paths(get_network, patch_network), + tags(( + name = "Network", + description = "The `/network` route allows you to describe the topology of a network of Meilisearch instances. + +This route is **synchronous**. This means that no task object will be returned, and any change to the network will be made available immediately.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/network"), + )), +)] +pub struct NetworkApi; + +pub fn configure(cfg: &mut web::ServiceConfig) { + cfg.service( + web::resource("") + .route(web::get().to(get_network)) + .route(web::patch().to(SeqHandler(patch_network))), + ); +} + +/// Get network topology +/// +/// Get a list of all Meilisearch instances currently known to this instance. +#[utoipa::path( + get, + path = "", + tag = "Network", + security(("Bearer" = ["network.get", "network.*", "*"])), + responses( + (status = OK, description = "Known nodes are returned", body = Network, content_type = "application/json", example = json!( + { + "self": "ms-0", + "remotes": { + "ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset }, + "ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) }, + "ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) }, + } + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +async fn get_network( + index_scheduler: GuardedData, Data>, +) -> Result { + index_scheduler.features().check_network("Using the /network route")?; + + let network = index_scheduler.network(); + debug!(returns = ?network, "Get network"); + Ok(HttpResponse::Ok().json(network)) +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct Remote { + #[schema(value_type = Option, example = json!({ + "ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset }, + "ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) }, + "ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) }, + }))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub url: Setting, + #[schema(value_type = Option, example = json!("XWnBI8QHUc-4IlqbKPLUDuhftNq19mQtjc6JvmivzJU"))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub search_api_key: Setting, +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct Network { + #[schema(value_type = Option>, example = json!("http://localhost:7700"))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub remotes: Setting>>, + #[schema(value_type = Option, example = 
json!("ms-00"), rename = "self")] + #[serde(default, rename = "self")] + #[deserr(default, rename = "self", error = DeserrJsonError)] + pub local: Setting, +} + +impl Remote { + pub fn try_into_db_node(self, name: &str) -> Result { + Ok(DbRemote { + url: self.url.set().ok_or(ResponseError::from_msg( + format!("Missing field `.remotes.{name}.url`"), + meilisearch_types::error::Code::MissingNetworkUrl, + ))?, + search_api_key: self.search_api_key.set(), + }) + } +} + +#[derive(Serialize)] +pub struct PatchNetworkAnalytics { + network_size: usize, + network_has_self: bool, +} + +impl Aggregate for PatchNetworkAnalytics { + fn event_name(&self) -> &'static str { + "Network Updated" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { network_size: new.network_size, network_has_self: new.network_has_self }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + +/// Configure Network +/// +/// Add or remove nodes from network. +#[utoipa::path( + patch, + path = "", + tag = "Network", + request_body = Network, + security(("Bearer" = ["network.update", "network.*", "*"])), + responses( + (status = OK, description = "New network state is returned", body = Network, content_type = "application/json", example = json!( + { + "self": "ms-0", + "remotes": { + "ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset }, + "ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) }, + "ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) }, + } + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +async fn patch_network( + index_scheduler: GuardedData, Data>, + new_network: AwebJson, + req: HttpRequest, + analytics: Data, +) -> Result { + index_scheduler.features().check_network("Using the /network route")?; + + let new_network = new_network.0; + let old_network = index_scheduler.network(); + debug!(parameters = ?new_network, "Patch network"); + + let merged_self = match new_network.local { + Setting::Set(new_self) => Some(new_self), + Setting::Reset => None, + Setting::NotSet => old_network.local, + }; + + let merged_remotes = match new_network.remotes { + Setting::Set(new_remotes) => { + let mut merged_remotes = BTreeMap::new(); + for either_or_both in old_network + .remotes + .into_iter() + .merge_join_by(new_remotes.into_iter(), |left, right| left.0.cmp(&right.0)) + { + match either_or_both { + EitherOrBoth::Both((key, old), (_, Some(new))) => { + let DbRemote { url: old_url, search_api_key: old_search_api_key } = old; + + let Remote { url: new_url, search_api_key: new_search_api_key } = new; + + let merged = DbRemote { + url: match new_url { + Setting::Set(new_url) => new_url, + Setting::Reset => { + return Err(ResponseError::from_msg( + format!( + "Field `.remotes.{key}.url` cannot be set to `null`" + ), + meilisearch_types::error::Code::InvalidNetworkUrl, + )) + } + Setting::NotSet => old_url, + }, + search_api_key: match new_search_api_key { + Setting::Set(new_search_api_key) => Some(new_search_api_key), + Setting::Reset => None, + Setting::NotSet => old_search_api_key, + }, + }; + merged_remotes.insert(key, merged); + } + EitherOrBoth::Both((_, _), (_, None)) | EitherOrBoth::Right((_, None)) => {} + EitherOrBoth::Left((key, node)) => { + merged_remotes.insert(key, node); + } + EitherOrBoth::Right((key, Some(node))) => { + let node = node.try_into_db_node(&key)?; + merged_remotes.insert(key, node); + } + } + } + merged_remotes + } + Setting::Reset => BTreeMap::new(), + Setting::NotSet => old_network.remotes, + }; + + analytics.publish( + PatchNetworkAnalytics { + network_size: merged_remotes.len(), + network_has_self: merged_self.is_some(), + }, + &req, + ); + + let merged_network = DbNetwork { local: merged_self, remotes: merged_remotes }; + index_scheduler.put_network(merged_network.clone())?; + debug!(returns = ?merged_network, "Patch network"); + Ok(HttpResponse::Ok().json(merged_network)) +} diff --git a/crates/meilisearch/src/routes/tasks.rs b/crates/meilisearch/src/routes/tasks.rs index 90fdc9c16..3ef116dd7 100644 --- a/crates/meilisearch/src/routes/tasks.rs +++ b/crates/meilisearch/src/routes/tasks.rs @@ -1,3 +1,5 @@ +use std::io::ErrorKind; + use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebQueryParameter; @@ -16,6 +18,7 @@ use serde::Serialize; use time::format_description::well_known::Rfc3339; use time::macros::format_description; use time::{Date, Duration, OffsetDateTime, Time}; +use tokio::io::AsyncReadExt; use tokio::task; use utoipa::{IntoParams, OpenApi, ToSchema}; @@ -44,7 +47,11 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .route(web::delete().to(SeqHandler(delete_tasks))), ) .service(web::resource("/cancel").route(web::post().to(SeqHandler(cancel_tasks)))) - .service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task)))); + 
.service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task)))) + .service( + web::resource("/{task_id}/documents") + .route(web::get().to(SeqHandler(get_task_documents_file))), + ); } #[derive(Debug, Deserr, IntoParams)] @@ -639,6 +646,76 @@ async fn get_task( } } +/// Get a task's documents. +/// +/// Get a [task's documents file](https://www.meilisearch.com/docs/learn/async/asynchronous_operations). +#[utoipa::path( + get, + path = "/{taskUid}/documents", + tag = "Tasks", + security(("Bearer" = ["tasks.get", "tasks.*", "*"])), + params(("taskUid", format = UInt32, example = 0, description = "The task identifier", nullable = false)), + responses( + (status = 200, description = "The content of the task update", body = serde_json::Value, content_type = "application/x-ndjson"), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + (status = 404, description = "The task uid does not exists", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Task :taskUid not found.", + "code": "task_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors/#task_not_found" + } + )) + ) +)] +async fn get_task_documents_file( + index_scheduler: GuardedData, Data>, + task_uid: web::Path, +) -> Result { + index_scheduler.features().check_get_task_documents_route()?; + let task_uid_string = task_uid.into_inner(); + + let task_uid: TaskId = match task_uid_string.parse() { + Ok(id) => id, + Err(_e) => { + return Err(index_scheduler::Error::InvalidTaskUid { task_uid: task_uid_string }.into()) + } + }; + + let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() }; + let filters = index_scheduler.filters(); + let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(&query, filters)?; + + if let Some(task) = tasks.first() { + match task.content_uuid() { + Some(uuid) => { + let mut tfile = match index_scheduler.queue.update_file(uuid) { + Ok(file) => tokio::fs::File::from_std(file), + Err(file_store::Error::IoError(e)) if e.kind() == ErrorKind::NotFound => { + return Err(index_scheduler::Error::TaskFileNotFound(task_uid).into()) + } + Err(e) => return Err(e.into()), + }; + // Yes, that's awful to put everything in memory when we could have streamed it from + // disk but it's really (really) complex to do with the current state of async Rust. 
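// Illustration only (assumed usage, not part of this diff): once the experimental
// `getTaskDocumentsRoute` feature is enabled, the original ndjson payload of a task can be
// fetched back with a request like
//   GET /tasks/0/documents
// which answers `200 OK` with `Content-Type: application/x-ndjson`, or a `task_file_not_found`
// error once the update file has been deleted.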
+ let mut content = String::new(); + tfile.read_to_string(&mut content).await?; + Ok(HttpResponse::Ok().content_type("application/x-ndjson").body(content)) + } + None => Err(index_scheduler::Error::TaskFileNotFound(task_uid).into()), + } + } else { + Err(index_scheduler::Error::TaskNotFound(task_uid).into()) + } +} + pub enum DeserializeDateOption { Before, After, diff --git a/crates/meilisearch/src/search/federated.rs b/crates/meilisearch/src/search/federated.rs deleted file mode 100644 index 1b3fa3b26..000000000 --- a/crates/meilisearch/src/search/federated.rs +++ /dev/null @@ -1,923 +0,0 @@ -use std::cmp::Ordering; -use std::collections::BTreeMap; -use std::fmt; -use std::iter::Zip; -use std::rc::Rc; -use std::str::FromStr as _; -use std::time::Duration; -use std::vec::{IntoIter, Vec}; - -use actix_http::StatusCode; -use index_scheduler::{IndexScheduler, RoFeatures}; -use indexmap::IndexMap; -use meilisearch_types::deserr::DeserrJsonError; -use meilisearch_types::error::deserr_codes::{ - InvalidMultiSearchFacetsByIndex, InvalidMultiSearchMaxValuesPerFacet, - InvalidMultiSearchMergeFacets, InvalidMultiSearchWeight, InvalidSearchLimit, - InvalidSearchOffset, -}; -use meilisearch_types::error::ResponseError; -use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::milli::score_details::{ScoreDetails, ScoreValue}; -use meilisearch_types::milli::{self, DocumentId, OrderBy, TimeBudget}; -use roaring::RoaringBitmap; -use serde::Serialize; -use utoipa::ToSchema; - -use super::ranking_rules::{self, RankingRules}; -use super::{ - compute_facet_distribution_stats, prepare_search, AttributesFormat, ComputedFacets, FacetStats, - HitMaker, HitsInfo, RetrieveVectors, SearchHit, SearchKind, SearchQuery, SearchQueryWithIndex, -}; -use crate::error::MeilisearchHttpError; -use crate::routes::indexes::search::search_kind; - -pub const DEFAULT_FEDERATED_WEIGHT: f64 = 1.0; - -#[derive(Debug, Default, Clone, Copy, PartialEq, deserr::Deserr, ToSchema)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] -pub struct FederationOptions { - #[deserr(default, error = DeserrJsonError)] - #[schema(value_type = f64)] - pub weight: Weight, -} - -#[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] -#[deserr(try_from(f64) = TryFrom::try_from -> InvalidMultiSearchWeight)] -pub struct Weight(f64); - -impl Default for Weight { - fn default() -> Self { - Weight(DEFAULT_FEDERATED_WEIGHT) - } -} - -impl std::convert::TryFrom for Weight { - type Error = InvalidMultiSearchWeight; - - fn try_from(f: f64) -> Result { - if f < 0.0 { - Err(InvalidMultiSearchWeight) - } else { - Ok(Weight(f)) - } - } -} - -impl std::ops::Deref for Weight { - type Target = f64; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -#[derive(Debug, deserr::Deserr, ToSchema)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] -#[schema(rename_all = "camelCase")] -pub struct Federation { - #[deserr(default = super::DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError)] - pub limit: usize, - #[deserr(default = super::DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] - pub offset: usize, - #[deserr(default, error = DeserrJsonError)] - pub facets_by_index: BTreeMap>>, - #[deserr(default, error = DeserrJsonError)] - pub merge_facets: Option, -} - -#[derive(Copy, Clone, Debug, deserr::Deserr, Default, ToSchema)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] -#[schema(rename_all = "camelCase")] -pub struct MergeFacets { - #[deserr(default, error = 
DeserrJsonError)] - pub max_values_per_facet: Option, -} - -#[derive(Debug, deserr::Deserr, ToSchema)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] -#[schema(rename_all = "camelCase")] -pub struct FederatedSearch { - pub queries: Vec, - #[deserr(default)] - pub federation: Option, -} - -#[derive(Serialize, Clone, ToSchema)] -#[serde(rename_all = "camelCase")] -#[schema(rename_all = "camelCase")] -pub struct FederatedSearchResult { - pub hits: Vec, - pub processing_time_ms: u128, - #[serde(flatten)] - pub hits_info: HitsInfo, - - #[serde(skip_serializing_if = "Option::is_none")] - pub semantic_hit_count: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - #[schema(value_type = Option>>)] - pub facet_distribution: Option>>, - #[serde(skip_serializing_if = "Option::is_none")] - pub facet_stats: Option>, - #[serde(skip_serializing_if = "FederatedFacets::is_empty")] - pub facets_by_index: FederatedFacets, - - // These fields are only used for analytics purposes - #[serde(skip)] - pub degraded: bool, - #[serde(skip)] - pub used_negative_operator: bool, -} - -impl fmt::Debug for FederatedSearchResult { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let FederatedSearchResult { - hits, - processing_time_ms, - hits_info, - semantic_hit_count, - degraded, - used_negative_operator, - facet_distribution, - facet_stats, - facets_by_index, - } = self; - - let mut debug = f.debug_struct("SearchResult"); - // The most important thing when looking at a search result is the time it took to process - debug.field("processing_time_ms", &processing_time_ms); - debug.field("hits", &format!("[{} hits returned]", hits.len())); - debug.field("hits_info", &hits_info); - if *used_negative_operator { - debug.field("used_negative_operator", used_negative_operator); - } - if *degraded { - debug.field("degraded", degraded); - } - if let Some(facet_distribution) = facet_distribution { - debug.field("facet_distribution", &facet_distribution); - } - if let Some(facet_stats) = facet_stats { - debug.field("facet_stats", &facet_stats); - } - if let Some(semantic_hit_count) = semantic_hit_count { - debug.field("semantic_hit_count", &semantic_hit_count); - } - if !facets_by_index.is_empty() { - debug.field("facets_by_index", &facets_by_index); - } - - debug.finish() - } -} - -struct WeightedScore<'a> { - details: &'a [ScoreDetails], - weight: f64, -} - -impl<'a> WeightedScore<'a> { - pub fn new(details: &'a [ScoreDetails], weight: f64) -> Self { - Self { details, weight } - } - - pub fn weighted_global_score(&self) -> f64 { - ScoreDetails::global_score(self.details.iter()) * self.weight - } - - pub fn compare_weighted_global_scores(&self, other: &Self) -> Ordering { - self.weighted_global_score() - .partial_cmp(&other.weighted_global_score()) - // both are numbers, possibly infinite - .unwrap() - } - - pub fn compare(&self, other: &Self) -> Ordering { - let mut left_it = ScoreDetails::score_values(self.details.iter()); - let mut right_it = ScoreDetails::score_values(other.details.iter()); - - loop { - let left = left_it.next(); - let right = right_it.next(); - - match (left, right) { - (None, None) => return Ordering::Equal, - (None, Some(_)) => return Ordering::Less, - (Some(_), None) => return Ordering::Greater, - (Some(ScoreValue::Score(left)), Some(ScoreValue::Score(right))) => { - let left = left * self.weight; - let right = right * other.weight; - if (left - right).abs() <= f64::EPSILON { - continue; - } - return left.partial_cmp(&right).unwrap(); - } - 
(Some(ScoreValue::Sort(left)), Some(ScoreValue::Sort(right))) => { - match left.partial_cmp(right) { - Some(Ordering::Equal) => continue, - Some(order) => return order, - None => return self.compare_weighted_global_scores(other), - } - } - (Some(ScoreValue::GeoSort(left)), Some(ScoreValue::GeoSort(right))) => { - match left.partial_cmp(right) { - Some(Ordering::Equal) => continue, - Some(order) => return order, - None => { - return self.compare_weighted_global_scores(other); - } - } - } - // not comparable details, use global - (Some(ScoreValue::Score(_)), Some(_)) - | (Some(_), Some(ScoreValue::Score(_))) - | (Some(ScoreValue::GeoSort(_)), Some(ScoreValue::Sort(_))) - | (Some(ScoreValue::Sort(_)), Some(ScoreValue::GeoSort(_))) => { - let left_count = left_it.count(); - let right_count = right_it.count(); - // compare how many remaining groups of rules each side has. - // the group with the most remaining groups wins. - return left_count - .cmp(&right_count) - // breaks ties with the global ranking score - .then_with(|| self.compare_weighted_global_scores(other)); - } - } - } - } -} - -struct QueryByIndex { - query: SearchQuery, - federation_options: FederationOptions, - query_index: usize, -} - -struct SearchResultByQuery<'a> { - documents_ids: Vec, - document_scores: Vec>, - federation_options: FederationOptions, - hit_maker: HitMaker<'a>, - query_index: usize, -} - -struct SearchResultByQueryIter<'a> { - it: Zip, IntoIter>>, - federation_options: FederationOptions, - hit_maker: Rc>, - query_index: usize, -} - -impl<'a> SearchResultByQueryIter<'a> { - fn new( - SearchResultByQuery { - documents_ids, - document_scores, - federation_options, - hit_maker, - query_index, - }: SearchResultByQuery<'a>, - ) -> Self { - let it = documents_ids.into_iter().zip(document_scores); - Self { it, federation_options, hit_maker: Rc::new(hit_maker), query_index } - } -} - -struct SearchResultByQueryIterItem<'a> { - docid: DocumentId, - score: Vec, - federation_options: FederationOptions, - hit_maker: Rc>, - query_index: usize, -} - -fn merge_index_local_results( - results_by_query: Vec>, -) -> impl Iterator + '_ { - itertools::kmerge_by( - results_by_query.into_iter().map(SearchResultByQueryIter::new), - |left: &SearchResultByQueryIterItem, right: &SearchResultByQueryIterItem| { - let left_score = WeightedScore::new(&left.score, *left.federation_options.weight); - let right_score = WeightedScore::new(&right.score, *right.federation_options.weight); - - match left_score.compare(&right_score) { - // the biggest score goes first - Ordering::Greater => true, - // break ties using query index - Ordering::Equal => left.query_index < right.query_index, - Ordering::Less => false, - } - }, - ) -} - -fn merge_index_global_results( - results_by_index: Vec, -) -> impl Iterator { - itertools::kmerge_by( - results_by_index.into_iter().map(|result_by_index| result_by_index.hits.into_iter()), - |left: &SearchHitByIndex, right: &SearchHitByIndex| { - let left_score = WeightedScore::new(&left.score, *left.federation_options.weight); - let right_score = WeightedScore::new(&right.score, *right.federation_options.weight); - - match left_score.compare(&right_score) { - // the biggest score goes first - Ordering::Greater => true, - // break ties using query index - Ordering::Equal => left.query_index < right.query_index, - Ordering::Less => false, - } - }, - ) -} - -impl<'a> Iterator for SearchResultByQueryIter<'a> { - type Item = SearchResultByQueryIterItem<'a>; - - fn next(&mut self) -> Option { - let (docid, score) = 
self.it.next()?; - Some(SearchResultByQueryIterItem { - docid, - score, - federation_options: self.federation_options, - hit_maker: Rc::clone(&self.hit_maker), - query_index: self.query_index, - }) - } -} - -struct SearchHitByIndex { - hit: SearchHit, - score: Vec, - federation_options: FederationOptions, - query_index: usize, -} - -struct SearchResultByIndex { - index: String, - hits: Vec, - estimated_total_hits: usize, - degraded: bool, - used_negative_operator: bool, - facets: Option, -} - -#[derive(Debug, Clone, Default, Serialize, ToSchema)] -pub struct FederatedFacets(pub BTreeMap); - -impl FederatedFacets { - pub fn insert(&mut self, index: String, facets: Option) { - if let Some(facets) = facets { - self.0.insert(index, facets); - } - } - - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } - - pub fn merge( - self, - MergeFacets { max_values_per_facet }: MergeFacets, - facet_order: BTreeMap, - ) -> Option { - if self.is_empty() { - return None; - } - - let mut distribution: BTreeMap = Default::default(); - let mut stats: BTreeMap = Default::default(); - - for facets_by_index in self.0.into_values() { - for (facet, index_distribution) in facets_by_index.distribution { - match distribution.entry(facet) { - std::collections::btree_map::Entry::Vacant(entry) => { - entry.insert(index_distribution); - } - std::collections::btree_map::Entry::Occupied(mut entry) => { - let distribution = entry.get_mut(); - - for (value, index_count) in index_distribution { - distribution - .entry(value) - .and_modify(|count| *count += index_count) - .or_insert(index_count); - } - } - } - } - - for (facet, index_stats) in facets_by_index.stats { - match stats.entry(facet) { - std::collections::btree_map::Entry::Vacant(entry) => { - entry.insert(index_stats); - } - std::collections::btree_map::Entry::Occupied(mut entry) => { - let stats = entry.get_mut(); - - stats.min = f64::min(stats.min, index_stats.min); - stats.max = f64::max(stats.max, index_stats.max); - } - } - } - } - - // fixup order - for (facet, values) in &mut distribution { - let order_by = facet_order.get(facet).map(|(_, order)| *order).unwrap_or_default(); - - match order_by { - OrderBy::Lexicographic => { - values.sort_unstable_by(|left, _, right, _| left.cmp(right)) - } - OrderBy::Count => { - values.sort_unstable_by(|_, left, _, right| { - left.cmp(right) - // biggest first - .reverse() - }) - } - } - - if let Some(max_values_per_facet) = max_values_per_facet { - values.truncate(max_values_per_facet) - }; - } - - Some(ComputedFacets { distribution, stats }) - } -} - -pub fn perform_federated_search( - index_scheduler: &IndexScheduler, - queries: Vec, - mut federation: Federation, - features: RoFeatures, -) -> Result { - let before_search = std::time::Instant::now(); - - // this implementation partition the queries by index to guarantee an important property: - // - all the queries to a particular index use the same read transaction. - // This is an important property, otherwise we cannot guarantee the self-consistency of the results. - - // 1. 
partition queries by index - let mut queries_by_index: BTreeMap> = Default::default(); - for (query_index, federated_query) in queries.into_iter().enumerate() { - if let Some(pagination_field) = federated_query.has_pagination() { - return Err(MeilisearchHttpError::PaginationInFederatedQuery( - query_index, - pagination_field, - ) - .into()); - } - - if let Some(facets) = federated_query.has_facets() { - let facets = facets.to_owned(); - return Err(MeilisearchHttpError::FacetsInFederatedQuery( - query_index, - federated_query.index_uid.into_inner(), - facets, - ) - .into()); - } - - let (index_uid, query, federation_options) = federated_query.into_index_query_federation(); - - queries_by_index.entry(index_uid.into_inner()).or_default().push(QueryByIndex { - query, - federation_options: federation_options.unwrap_or_default(), - query_index, - }) - } - - // 2. perform queries, merge and make hits index by index - let required_hit_count = federation.limit + federation.offset; - - // In step (2), semantic_hit_count will be set to Some(0) if any search kind uses semantic - // Then in step (3), we'll update its value if there is any semantic search - let mut semantic_hit_count = None; - let mut results_by_index = Vec::with_capacity(queries_by_index.len()); - let mut previous_query_data: Option<(RankingRules, usize, String)> = None; - - // remember the order and name of first index for each facet when merging with index settings - // to detect if the order is inconsistent for a facet. - let mut facet_order: Option> = match federation.merge_facets - { - Some(MergeFacets { .. }) => Some(Default::default()), - _ => None, - }; - - for (index_uid, queries) in queries_by_index { - let first_query_index = queries.first().map(|query| query.query_index); - - let index = match index_scheduler.index(&index_uid) { - Ok(index) => index, - Err(err) => { - let mut err = ResponseError::from(err); - // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but - // here the resource not found is not part of the URL. - err.code = StatusCode::BAD_REQUEST; - if let Some(query_index) = first_query_index { - err.message = format!("Inside `.queries[{}]`: {}", query_index, err.message); - } - return Err(err); - } - }; - - // Important: this is the only transaction we'll use for this index during this federated search - let rtxn = index.read_txn()?; - - let criteria = index.criteria(&rtxn)?; - - let dictionary = index.dictionary(&rtxn)?; - let dictionary: Option> = - dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); - let separators = index.allowed_separators(&rtxn)?; - let separators: Option> = - separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); - - // each query gets its individual cutoff - let cutoff = index.search_cutoff(&rtxn)?; - - let mut degraded = false; - let mut used_negative_operator = false; - let mut candidates = RoaringBitmap::new(); - - let facets_by_index = federation.facets_by_index.remove(&index_uid).flatten(); - - // TODO: recover the max size + facets_by_index as return value of this function so as not to ask it for all queries - if let Err(mut error) = - check_facet_order(&mut facet_order, &index_uid, &facets_by_index, &index, &rtxn) - { - error.message = format!( - "Inside `.federation.facetsByIndex.{index_uid}`: {error}{}", - if let Some(query_index) = first_query_index { - format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`") - } else { - Default::default() - } - ); - return Err(error); - } - - // 2.1. 
Compute all candidates for each query in the index - let mut results_by_query = Vec::with_capacity(queries.len()); - - for QueryByIndex { query, federation_options, query_index } in queries { - // use an immediately invoked lambda to capture the result without returning from the function - - let res: Result<(), ResponseError> = (|| { - let search_kind = - search_kind(&query, index_scheduler, index_uid.to_string(), &index)?; - - let canonicalization_kind = match (&search_kind, &query.q) { - (SearchKind::SemanticOnly { .. }, _) => { - ranking_rules::CanonicalizationKind::Vector - } - (_, Some(q)) if !q.is_empty() => ranking_rules::CanonicalizationKind::Keyword, - _ => ranking_rules::CanonicalizationKind::Placeholder, - }; - - let sort = if let Some(sort) = &query.sort { - let sorts: Vec<_> = - match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { - Ok(sorts) => sorts, - Err(asc_desc_error) => { - return Err(milli::Error::from(milli::SortError::from( - asc_desc_error, - )) - .into()) - } - }; - Some(sorts) - } else { - None - }; - - let ranking_rules = ranking_rules::RankingRules::new( - criteria.clone(), - sort, - query.matching_strategy.into(), - canonicalization_kind, - ); - - if let Some((previous_ranking_rules, previous_query_index, previous_index_uid)) = - previous_query_data.take() - { - if let Err(error) = ranking_rules.is_compatible_with(&previous_ranking_rules) { - return Err(error.to_response_error( - &ranking_rules, - &previous_ranking_rules, - query_index, - previous_query_index, - &index_uid, - &previous_index_uid, - )); - } - previous_query_data = if previous_ranking_rules.constraint_count() - > ranking_rules.constraint_count() - { - Some((previous_ranking_rules, previous_query_index, previous_index_uid)) - } else { - Some((ranking_rules, query_index, index_uid.clone())) - }; - } else { - previous_query_data = Some((ranking_rules, query_index, index_uid.clone())); - } - - match search_kind { - SearchKind::KeywordOnly => {} - _ => semantic_hit_count = Some(0), - } - - let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors); - - let time_budget = match cutoff { - Some(cutoff) => TimeBudget::new(Duration::from_millis(cutoff)), - None => TimeBudget::default(), - }; - - let (mut search, _is_finite_pagination, _max_total_hits, _offset) = - prepare_search(&index, &rtxn, &query, &search_kind, time_budget, features)?; - - search.scoring_strategy(milli::score_details::ScoringStrategy::Detailed); - search.offset(0); - search.limit(required_hit_count); - - let (result, _semantic_hit_count) = - super::search_from_kind(index_uid.to_string(), search_kind, search)?; - let format = AttributesFormat { - attributes_to_retrieve: query.attributes_to_retrieve, - retrieve_vectors, - attributes_to_highlight: query.attributes_to_highlight, - attributes_to_crop: query.attributes_to_crop, - crop_length: query.crop_length, - crop_marker: query.crop_marker, - highlight_pre_tag: query.highlight_pre_tag, - highlight_post_tag: query.highlight_post_tag, - show_matches_position: query.show_matches_position, - sort: query.sort, - show_ranking_score: query.show_ranking_score, - show_ranking_score_details: query.show_ranking_score_details, - locales: query.locales.map(|l| l.iter().copied().map(Into::into).collect()), - }; - - let milli::SearchResult { - matching_words, - candidates: query_candidates, - documents_ids, - document_scores, - degraded: query_degraded, - used_negative_operator: query_used_negative_operator, - } = result; - - candidates |= query_candidates; - degraded |= 
query_degraded; - used_negative_operator |= query_used_negative_operator; - - let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); - - let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); - - let hit_maker = - HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| { - MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())) - })?; - - results_by_query.push(SearchResultByQuery { - federation_options, - hit_maker, - query_index, - documents_ids, - document_scores, - }); - Ok(()) - })(); - - if let Err(mut error) = res { - error.message = format!("Inside `.queries[{query_index}]`: {}", error.message); - return Err(error); - } - } - // 2.2. merge inside index - let mut documents_seen = RoaringBitmap::new(); - let merged_result: Result, ResponseError> = - merge_index_local_results(results_by_query) - // skip documents we've already seen & mark that we saw the current document - .filter(|SearchResultByQueryIterItem { docid, .. }| documents_seen.insert(*docid)) - .take(required_hit_count) - // 2.3 make hits - .map( - |SearchResultByQueryIterItem { - docid, - score, - federation_options, - hit_maker, - query_index, - }| { - let mut hit = hit_maker.make_hit(docid, &score)?; - let weighted_score = - ScoreDetails::global_score(score.iter()) * (*federation_options.weight); - - let _federation = serde_json::json!( - { - "indexUid": index_uid, - "queriesPosition": query_index, - "weightedRankingScore": weighted_score, - } - ); - hit.document.insert("_federation".to_string(), _federation); - Ok(SearchHitByIndex { hit, score, federation_options, query_index }) - }, - ) - .collect(); - - let merged_result = merged_result?; - - let estimated_total_hits = candidates.len() as usize; - - let facets = facets_by_index - .map(|facets_by_index| { - compute_facet_distribution_stats( - &facets_by_index, - &index, - &rtxn, - candidates, - super::Route::MultiSearch, - ) - }) - .transpose() - .map_err(|mut error| { - error.message = format!( - "Inside `.federation.facetsByIndex.{index_uid}`: {}{}", - error.message, - if let Some(query_index) = first_query_index { - format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`") - } else { - Default::default() - } - ); - error - })?; - - results_by_index.push(SearchResultByIndex { - index: index_uid, - hits: merged_result, - estimated_total_hits, - degraded, - used_negative_operator, - facets, - }); - } - - // bonus step, make sure to return an error if an index wants a non-faceted field, even if no query actually uses that index. - for (index_uid, facets) in federation.facets_by_index { - let index = match index_scheduler.index(&index_uid) { - Ok(index) => index, - Err(err) => { - let mut err = ResponseError::from(err); - // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but - // here the resource not found is not part of the URL. 
- err.code = StatusCode::BAD_REQUEST; - err.message = format!( - "Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", - err.message - ); - return Err(err); - } - }; - - // Important: this is the only transaction we'll use for this index during this federated search - let rtxn = index.read_txn()?; - - if let Err(mut error) = - check_facet_order(&mut facet_order, &index_uid, &facets, &index, &rtxn) - { - error.message = format!( - "Inside `.federation.facetsByIndex.{index_uid}`: {error}\n - Note: index `{index_uid}` is not used in queries", - ); - return Err(error); - } - - if let Some(facets) = facets { - if let Err(mut error) = compute_facet_distribution_stats( - &facets, - &index, - &rtxn, - Default::default(), - super::Route::MultiSearch, - ) { - error.message = - format!("Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", error.message); - return Err(error); - } - } - } - - // 3. merge hits and metadata across indexes - // 3.1 merge metadata - let (estimated_total_hits, degraded, used_negative_operator, facets) = { - let mut estimated_total_hits = 0; - let mut degraded = false; - let mut used_negative_operator = false; - - let mut facets: FederatedFacets = FederatedFacets::default(); - - for SearchResultByIndex { - index, - hits: _, - estimated_total_hits: estimated_total_hits_by_index, - facets: facets_by_index, - degraded: degraded_by_index, - used_negative_operator: used_negative_operator_by_index, - } in &mut results_by_index - { - estimated_total_hits += *estimated_total_hits_by_index; - degraded |= *degraded_by_index; - used_negative_operator |= *used_negative_operator_by_index; - - let facets_by_index = std::mem::take(facets_by_index); - let index = std::mem::take(index); - - facets.insert(index, facets_by_index); - } - - (estimated_total_hits, degraded, used_negative_operator, facets) - }; - - // 3.2 merge hits - let merged_hits: Vec<_> = merge_index_global_results(results_by_index) - .skip(federation.offset) - .take(federation.limit) - .inspect(|hit| { - if let Some(semantic_hit_count) = &mut semantic_hit_count { - if hit.score.iter().any(|score| matches!(&score, ScoreDetails::Vector(_))) { - *semantic_hit_count += 1; - } - } - }) - .map(|hit| hit.hit) - .collect(); - - let (facet_distribution, facet_stats, facets_by_index) = - match federation.merge_facets.zip(facet_order) { - Some((merge_facets, facet_order)) => { - let facets = facets.merge(merge_facets, facet_order); - - let (facet_distribution, facet_stats) = facets - .map(|ComputedFacets { distribution, stats }| (distribution, stats)) - .unzip(); - - (facet_distribution, facet_stats, FederatedFacets::default()) - } - None => (None, None, facets), - }; - - let search_result = FederatedSearchResult { - hits: merged_hits, - processing_time_ms: before_search.elapsed().as_millis(), - hits_info: HitsInfo::OffsetLimit { - limit: federation.limit, - offset: federation.offset, - estimated_total_hits, - }, - semantic_hit_count, - degraded, - used_negative_operator, - facet_distribution, - facet_stats, - facets_by_index, - }; - - Ok(search_result) -} - -fn check_facet_order( - facet_order: &mut Option>, - current_index: &str, - facets_by_index: &Option>, - index: &milli::Index, - rtxn: &milli::heed::RoTxn<'_>, -) -> Result<(), ResponseError> { - if let (Some(facet_order), Some(facets_by_index)) = (facet_order, facets_by_index) { - let index_facet_order = index.sort_facet_values_by(rtxn)?; - for facet in facets_by_index { - 
let index_facet_order = index_facet_order.get(facet); - let (previous_index, previous_facet_order) = facet_order - .entry(facet.to_owned()) - .or_insert_with(|| (current_index.to_owned(), index_facet_order)); - if previous_facet_order != &index_facet_order { - return Err(MeilisearchHttpError::InconsistentFacetOrder { - facet: facet.clone(), - previous_facet_order: *previous_facet_order, - previous_uid: previous_index.clone(), - current_uid: current_index.to_owned(), - index_facet_order, - } - .into()); - } - } - }; - Ok(()) -} diff --git a/crates/meilisearch/src/search/federated/mod.rs b/crates/meilisearch/src/search/federated/mod.rs new file mode 100644 index 000000000..40204c591 --- /dev/null +++ b/crates/meilisearch/src/search/federated/mod.rs @@ -0,0 +1,10 @@ +mod perform; +mod proxy; +mod types; +mod weighted_scores; + +pub use perform::perform_federated_search; +pub use proxy::{PROXY_SEARCH_HEADER, PROXY_SEARCH_HEADER_VALUE}; +pub use types::{ + FederatedSearch, FederatedSearchResult, Federation, FederationOptions, MergeFacets, +}; diff --git a/crates/meilisearch/src/search/federated/perform.rs b/crates/meilisearch/src/search/federated/perform.rs new file mode 100644 index 000000000..5ad64d63c --- /dev/null +++ b/crates/meilisearch/src/search/federated/perform.rs @@ -0,0 +1,1112 @@ +use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::iter::Zip; +use std::rc::Rc; +use std::str::FromStr as _; +use std::time::{Duration, Instant}; +use std::vec::{IntoIter, Vec}; + +use actix_http::StatusCode; +use index_scheduler::{IndexScheduler, RoFeatures}; +use itertools::Itertools; +use meilisearch_types::error::ResponseError; +use meilisearch_types::features::{Network, Remote}; +use meilisearch_types::milli::order_by_map::OrderByMap; +use meilisearch_types::milli::score_details::{ScoreDetails, WeightedScoreValue}; +use meilisearch_types::milli::{self, DocumentId, OrderBy, TimeBudget, DEFAULT_VALUES_PER_FACET}; +use roaring::RoaringBitmap; +use tokio::task::JoinHandle; + +use super::super::ranking_rules::{self, RankingRules}; +use super::super::{ + compute_facet_distribution_stats, prepare_search, AttributesFormat, ComputedFacets, HitMaker, + HitsInfo, RetrieveVectors, SearchHit, SearchKind, SearchQuery, SearchQueryWithIndex, +}; +use super::proxy::{proxy_search, ProxySearchError, ProxySearchParams}; +use super::types::{ + FederatedFacets, FederatedSearchResult, Federation, FederationOptions, MergeFacets, Weight, + FEDERATION_HIT, FEDERATION_REMOTE, WEIGHTED_SCORE_VALUES, +}; +use super::weighted_scores; +use crate::error::MeilisearchHttpError; +use crate::routes::indexes::search::search_kind; +use crate::search::federated::types::{INDEX_UID, QUERIES_POSITION, WEIGHTED_RANKING_SCORE}; + +pub async fn perform_federated_search( + index_scheduler: &IndexScheduler, + queries: Vec, + federation: Federation, + features: RoFeatures, + is_proxy: bool, +) -> Result { + if is_proxy { + features.check_network("Performing a remote federated search")?; + } + let before_search = std::time::Instant::now(); + let deadline = before_search + std::time::Duration::from_secs(9); + + let required_hit_count = federation.limit + federation.offset; + + let network = index_scheduler.network(); + + // this implementation partition the queries by index to guarantee an important property: + // - all the queries to a particular index use the same read transaction. + // This is an important property, otherwise we cannot guarantee the self-consistency of the results. + + // 1. 
partition queries by host and index + let mut partitioned_queries = PartitionedQueries::new(); + for (query_index, federated_query) in queries.into_iter().enumerate() { + partitioned_queries.partition(federated_query, query_index, &network, features)? + } + + // 2. perform queries, merge and make hits index by index + // 2.1. start remote queries + let remote_search = + RemoteSearch::start(partitioned_queries.remote_queries_by_host, &federation, deadline); + + // 2.2. concurrently execute local queries + let params = SearchByIndexParams { + index_scheduler, + features, + is_proxy, + network: &network, + has_remote: partitioned_queries.has_remote, + required_hit_count, + }; + let mut search_by_index = SearchByIndex::new( + federation, + partitioned_queries.local_queries_by_index.len(), + params.has_remote, + ); + + for (index_uid, queries) in partitioned_queries.local_queries_by_index { + // note: this is the only place we open `index_uid` + search_by_index.execute(index_uid, queries, ¶ms)?; + } + + // bonus step, make sure to return an error if an index wants a non-faceted field, even if no query actually uses that index. + search_by_index.check_unused_facets(index_scheduler)?; + + let SearchByIndex { + federation, + mut semantic_hit_count, + mut results_by_index, + previous_query_data: _, + facet_order, + } = search_by_index; + + let before_waiting_remote_results = std::time::Instant::now(); + + // 2.3. Wait for proxy search requests to complete + let (mut remote_results, remote_errors) = remote_search.finish().await; + + let after_waiting_remote_results = std::time::Instant::now(); + + // 3. merge hits and metadata across indexes and hosts + // 3.1. merge metadata + let (estimated_total_hits, degraded, used_negative_operator, facets, max_remote_duration) = + merge_metadata(&mut results_by_index, &remote_results); + + // 3.2. merge hits + let merged_hits: Vec<_> = merge_index_global_results(results_by_index, &mut remote_results) + .skip(federation.offset) + .take(federation.limit) + .inspect(|hit| { + if let Some(semantic_hit_count) = &mut semantic_hit_count { + if hit.to_score().0.any(|score| matches!(&score, WeightedScoreValue::VectorSort(_))) + { + *semantic_hit_count += 1; + } + } + }) + .map(|hit| hit.hit()) + .collect(); + + // 3.3. 
merge facets + let (facet_distribution, facet_stats, facets_by_index) = + facet_order.merge(federation.merge_facets, remote_results, facets); + + let after_merge = std::time::Instant::now(); + + let local_duration = (before_waiting_remote_results - before_search) + + (after_merge - after_waiting_remote_results); + let max_duration = Duration::max(local_duration, max_remote_duration); + + Ok(FederatedSearchResult { + hits: merged_hits, + processing_time_ms: max_duration.as_millis(), + hits_info: HitsInfo::OffsetLimit { + limit: federation.limit, + offset: federation.offset, + estimated_total_hits, + }, + semantic_hit_count, + degraded, + used_negative_operator, + facet_distribution, + facet_stats, + facets_by_index, + remote_errors: partitioned_queries.has_remote.then_some(remote_errors), + }) +} + +struct QueryByIndex { + query: SearchQuery, + weight: Weight, + query_index: usize, +} + +struct SearchResultByQuery<'a> { + documents_ids: Vec, + document_scores: Vec>, + weight: Weight, + hit_maker: HitMaker<'a>, + query_index: usize, +} + +struct SearchResultByQueryIter<'a> { + it: Zip, IntoIter>>, + weight: Weight, + hit_maker: Rc>, + query_index: usize, +} + +impl<'a> SearchResultByQueryIter<'a> { + fn new( + SearchResultByQuery { + documents_ids, + document_scores, + weight, + hit_maker, + query_index, + }: SearchResultByQuery<'a>, + ) -> Self { + let it = documents_ids.into_iter().zip(document_scores); + Self { it, weight, hit_maker: Rc::new(hit_maker), query_index } + } +} + +struct SearchResultByQueryIterItem<'a> { + docid: DocumentId, + score: Vec, + weight: Weight, + hit_maker: Rc>, + query_index: usize, +} + +fn merge_index_local_results( + results_by_query: Vec>, +) -> impl Iterator + '_ { + itertools::kmerge_by( + results_by_query.into_iter().map(SearchResultByQueryIter::new), + |left: &SearchResultByQueryIterItem, right: &SearchResultByQueryIterItem| { + match weighted_scores::compare( + ScoreDetails::weighted_score_values(left.score.iter(), *left.weight), + ScoreDetails::global_score(left.score.iter()) * *left.weight, + ScoreDetails::weighted_score_values(right.score.iter(), *right.weight), + ScoreDetails::global_score(right.score.iter()) * *right.weight, + ) { + // the biggest score goes first + Ordering::Greater => true, + // break ties using query index + Ordering::Equal => left.query_index < right.query_index, + Ordering::Less => false, + } + }, + ) +} + +fn merge_index_global_results( + results_by_index: Vec, + remote_results: &mut [FederatedSearchResult], +) -> impl Iterator + '_ { + itertools::kmerge_by( + // local results + results_by_index + .into_iter() + .map(|result_by_index| { + either::Either::Left(result_by_index.hits.into_iter().map(MergedSearchHit::Local)) + }) + // remote results + .chain(remote_results.iter_mut().map(|x| either::Either::Right(iter_remote_hits(x)))), + |left: &MergedSearchHit, right: &MergedSearchHit| { + let (left_it, left_weighted_global_score, left_query_index) = left.to_score(); + let (right_it, right_weighted_global_score, right_query_index) = right.to_score(); + + match weighted_scores::compare( + left_it, + left_weighted_global_score, + right_it, + right_weighted_global_score, + ) { + // the biggest score goes first + Ordering::Greater => true, + // break ties using query index + Ordering::Equal => left_query_index < right_query_index, + Ordering::Less => false, + } + }, + ) +} + +enum MergedSearchHit { + Local(SearchHitByIndex), + Remote { + hit: SearchHit, + score: Vec, + global_weighted_score: f64, + query_index: usize, + }, +} + +impl 
MergedSearchHit { + fn remote(mut hit: SearchHit) -> Result { + let federation = hit + .document + .get_mut(FEDERATION_HIT) + .ok_or(ProxySearchError::MissingPathInResponse("._federation"))?; + let federation = match federation.as_object_mut() { + Some(federation) => federation, + None => { + return Err(ProxySearchError::UnexpectedValueInPath { + path: "._federation", + expected_type: "map", + received_value: federation.to_string(), + }); + } + }; + + let global_weighted_score = federation + .get(WEIGHTED_RANKING_SCORE) + .ok_or(ProxySearchError::MissingPathInResponse("._federation.weightedRankingScore"))?; + let global_weighted_score = global_weighted_score.as_f64().ok_or_else(|| { + ProxySearchError::UnexpectedValueInPath { + path: "._federation.weightedRankingScore", + expected_type: "number", + received_value: global_weighted_score.to_string(), + } + })?; + + let score: Vec = + serde_json::from_value(federation.remove(WEIGHTED_SCORE_VALUES).ok_or( + ProxySearchError::MissingPathInResponse("._federation.weightedScoreValues"), + )?) + .map_err(ProxySearchError::CouldNotParseWeightedScoreValues)?; + + let query_index = federation + .get(QUERIES_POSITION) + .ok_or(ProxySearchError::MissingPathInResponse("._federation.queriesPosition"))?; + let query_index = + query_index.as_u64().ok_or_else(|| ProxySearchError::UnexpectedValueInPath { + path: "._federation.queriesPosition", + expected_type: "integer", + received_value: query_index.to_string(), + })? as usize; + + Ok(Self::Remote { hit, score, global_weighted_score, query_index }) + } + + fn hit(self) -> SearchHit { + match self { + MergedSearchHit::Local(search_hit_by_index) => search_hit_by_index.hit, + MergedSearchHit::Remote { hit, .. } => hit, + } + } + + fn to_score(&self) -> (impl Iterator + '_, f64, usize) { + match self { + MergedSearchHit::Local(search_hit_by_index) => ( + either::Left(ScoreDetails::weighted_score_values( + search_hit_by_index.score.iter(), + *search_hit_by_index.weight, + )), + ScoreDetails::global_score(search_hit_by_index.score.iter()) + * *search_hit_by_index.weight, + search_hit_by_index.query_index, + ), + MergedSearchHit::Remote { hit: _, score, global_weighted_score, query_index } => { + let global_weighted_score = *global_weighted_score; + let query_index = *query_index; + (either::Right(score.iter().cloned()), global_weighted_score, query_index) + } + } + } +} + +fn iter_remote_hits( + results_by_host: &mut FederatedSearchResult, +) -> impl Iterator + '_ { + // have a per node registry of failed hits + results_by_host.hits.drain(..).filter_map(|hit| match MergedSearchHit::remote(hit) { + Ok(hit) => Some(hit), + Err(err) => { + tracing::warn!("skipping remote hit due to error: {err}"); + None + } + }) +} + +impl<'a> Iterator for SearchResultByQueryIter<'a> { + type Item = SearchResultByQueryIterItem<'a>; + + fn next(&mut self) -> Option { + let (docid, score) = self.it.next()?; + Some(SearchResultByQueryIterItem { + docid, + score, + weight: self.weight, + hit_maker: Rc::clone(&self.hit_maker), + query_index: self.query_index, + }) + } +} + +struct SearchHitByIndex { + hit: SearchHit, + score: Vec, + weight: Weight, + query_index: usize, +} + +struct SearchResultByIndex { + index: String, + hits: Vec, + estimated_total_hits: usize, + degraded: bool, + used_negative_operator: bool, + facets: Option, +} + +fn merge_metadata( + results_by_index: &mut Vec, + remote_results: &Vec, +) -> (usize, bool, bool, FederatedFacets, Duration) { + let mut estimated_total_hits = 0; + let mut degraded = false; + let mut 
used_negative_operator = false; + let mut facets: FederatedFacets = FederatedFacets::default(); + let mut max_remote_duration = Duration::ZERO; + for SearchResultByIndex { + index, + hits: _, + estimated_total_hits: estimated_total_hits_by_index, + facets: facets_by_index, + degraded: degraded_by_index, + used_negative_operator: used_negative_operator_by_index, + } in results_by_index + { + estimated_total_hits += *estimated_total_hits_by_index; + degraded |= *degraded_by_index; + used_negative_operator |= *used_negative_operator_by_index; + + let facets_by_index = std::mem::take(facets_by_index); + let index = std::mem::take(index); + + facets.insert(index, facets_by_index); + } + for FederatedSearchResult { + hits: _, + processing_time_ms, + hits_info, + semantic_hit_count: _, + facet_distribution: _, + facet_stats: _, + facets_by_index: _, + degraded: degraded_for_host, + used_negative_operator: host_used_negative_operator, + remote_errors: _, + } in remote_results + { + let this_remote_duration = Duration::from_millis(*processing_time_ms as u64); + max_remote_duration = Duration::max(this_remote_duration, max_remote_duration); + estimated_total_hits += match hits_info { + HitsInfo::Pagination { total_hits: estimated_total_hits, .. } + | HitsInfo::OffsetLimit { estimated_total_hits, .. } => estimated_total_hits, + }; + // note that because `degraded` and `used_negative_operator` are #[serde(skip)], + // `degraded_for_host` and `host_used_negative_operator` will always be false. + degraded |= degraded_for_host; + used_negative_operator |= host_used_negative_operator; + } + (estimated_total_hits, degraded, used_negative_operator, facets, max_remote_duration) +} + +type LocalQueriesByIndex = BTreeMap>; +type RemoteQueriesByHost = BTreeMap)>; + +struct PartitionedQueries { + local_queries_by_index: LocalQueriesByIndex, + remote_queries_by_host: RemoteQueriesByHost, + has_remote: bool, +} + +impl PartitionedQueries { + fn new() -> PartitionedQueries { + PartitionedQueries { + local_queries_by_index: Default::default(), + remote_queries_by_host: Default::default(), + has_remote: false, + } + } + + fn partition( + &mut self, + federated_query: SearchQueryWithIndex, + query_index: usize, + network: &Network, + features: RoFeatures, + ) -> Result<(), ResponseError> { + if let Some(pagination_field) = federated_query.has_pagination() { + return Err(MeilisearchHttpError::PaginationInFederatedQuery( + query_index, + pagination_field, + ) + .into()); + } + + if let Some(facets) = federated_query.has_facets() { + let facets = facets.to_owned(); + return Err(MeilisearchHttpError::FacetsInFederatedQuery( + query_index, + federated_query.index_uid.into_inner(), + facets, + ) + .into()); + } + + let (index_uid, query, federation_options) = federated_query.into_index_query_federation(); + + let federation_options = federation_options.unwrap_or_default(); + + // local or remote node? 
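+ // a query is treated as local when it carries no `federationOptions.remote`, or when the
+ // requested remote name matches this instance's own name in the network configuration;
+ // any other remote name must be registered in `network.remotes` (otherwise the query is
+ // rejected), and queries for registered remotes are buffered per host so each host can be
+ // proxied with a single multi-search request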
+ 'local_query: { + let queries_by_index = match federation_options.remote { + None => self.local_queries_by_index.entry(index_uid.into_inner()).or_default(), + Some(remote_name) => { + self.has_remote = true; + features.check_network("Performing a remote federated search")?; + + match &network.local { + Some(local) if local == &remote_name => { + self.local_queries_by_index.entry(index_uid.into_inner()).or_default() + } + _ => { + // node from the network + let Some(remote) = network.remotes.get(&remote_name) else { + return Err(ResponseError::from_msg(format!("Invalid `queries[{query_index}].federation_options.remote`: remote `{remote_name}` is not registered"), + meilisearch_types::error::Code::InvalidMultiSearchRemote)); + }; + let query = SearchQueryWithIndex::from_index_query_federation( + index_uid, + query, + Some(FederationOptions { + weight: federation_options.weight, + // do not pass the `remote` to not require the remote instance to have itself has a local node + remote: None, + // pass an explicit query index + query_position: Some(query_index), + }), + ); + + self.remote_queries_by_host + .entry(remote_name) + .or_insert_with(|| (remote.clone(), Default::default())) + .1 + .push(query); + break 'local_query; + } + } + } + }; + + queries_by_index.push(QueryByIndex { + query, + weight: federation_options.weight, + // override query index here with the one in federation. + // this will fix-up error messages to refer to the global query index of the original request. + query_index: if let Some(query_index) = federation_options.query_position { + features.check_network("Using `federationOptions.queryPosition`")?; + query_index + } else { + query_index + }, + }) + } + Ok(()) + } +} + +struct RemoteSearch { + in_flight_remote_queries: + BTreeMap>>, +} + +impl RemoteSearch { + fn start(queries: RemoteQueriesByHost, federation: &Federation, deadline: Instant) -> Self { + let mut in_flight_remote_queries = BTreeMap::new(); + let client = reqwest::ClientBuilder::new() + .connect_timeout(std::time::Duration::from_millis(200)) + .build() + .unwrap(); + let params = + ProxySearchParams { deadline: Some(deadline), try_count: 3, client: client.clone() }; + for (node_name, (node, queries)) in queries { + // spawn one task per host + in_flight_remote_queries.insert( + node_name, + tokio::spawn({ + let mut proxy_federation = federation.clone(); + // fixup limit and offset to not apply them twice + proxy_federation.limit = federation.limit + federation.offset; + proxy_federation.offset = 0; + // never merge distant facets + proxy_federation.merge_facets = None; + let params = params.clone(); + async move { proxy_search(&node, queries, proxy_federation, ¶ms).await } + }), + ); + } + Self { in_flight_remote_queries } + } + + async fn finish(self) -> (Vec, BTreeMap) { + let mut remote_results = Vec::with_capacity(self.in_flight_remote_queries.len()); + let mut remote_errors: BTreeMap = BTreeMap::new(); + 'remote_queries: for (node_name, handle) in self.in_flight_remote_queries { + match handle.await { + Ok(Ok(mut res)) => { + for hit in &mut res.hits { + let Some(federation) = hit.document.get_mut(FEDERATION_HIT) else { + let error = ProxySearchError::MissingPathInResponse("._federation"); + remote_errors.insert(node_name, error.as_response_error()); + continue 'remote_queries; + }; + let Some(federation) = federation.as_object_mut() else { + let error = ProxySearchError::UnexpectedValueInPath { + path: "._federation", + expected_type: "map", + received_value: federation.to_string(), + }; + 
remote_errors.insert(node_name, error.as_response_error()); + continue 'remote_queries; + }; + if !federation.contains_key(WEIGHTED_SCORE_VALUES) { + let error = ProxySearchError::MissingPathInResponse( + "._federation.weightedScoreValues", + ); + remote_errors.insert(node_name, error.as_response_error()); + continue 'remote_queries; + } + + if !federation.contains_key(WEIGHTED_RANKING_SCORE) { + let error = ProxySearchError::MissingPathInResponse( + "._federation.weightedRankingScore", + ); + remote_errors.insert(node_name, error.as_response_error()); + continue 'remote_queries; + } + + federation.insert( + FEDERATION_REMOTE.to_string(), + serde_json::Value::String(node_name.clone()), + ); + } + + remote_results.push(res); + } + Ok(Err(error)) => { + remote_errors.insert(node_name, error.as_response_error()); + } + Err(panic) => match panic.try_into_panic() { + Ok(panic) => { + let msg = match panic.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match panic.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + remote_errors.insert( + node_name, + ResponseError::from_msg( + msg.to_string(), + meilisearch_types::error::Code::Internal, + ), + ); + } + Err(_) => tracing::error!("proxy search task was unexpectedly cancelled"), + }, + } + } + (remote_results, remote_errors) + } +} + +struct SearchByIndexParams<'a> { + index_scheduler: &'a IndexScheduler, + required_hit_count: usize, + features: RoFeatures, + is_proxy: bool, + has_remote: bool, + network: &'a Network, +} + +struct SearchByIndex { + federation: Federation, + // During search by index, semantic_hit_count will be set to Some(0) if any search kind uses semantic + // Then when merging, we'll update its value if there is any semantic hit + semantic_hit_count: Option, + results_by_index: Vec, + previous_query_data: Option<(RankingRules, usize, String)>, + // remember the order and name of first index for each facet when merging with index settings + // to detect if the order is inconsistent for a facet. + facet_order: FacetOrder, +} + +impl SearchByIndex { + fn new(federation: Federation, index_count: usize, has_remote: bool) -> Self { + SearchByIndex { + facet_order: match (federation.merge_facets, has_remote) { + (None, true) => FacetOrder::ByIndex(Default::default()), + (None, false) => FacetOrder::None, + (Some(_), _) => FacetOrder::ByFacet(Default::default()), + }, + federation, + semantic_hit_count: None, + results_by_index: Vec::with_capacity(index_count), + previous_query_data: None, + } + } + + fn execute( + &mut self, + index_uid: String, + queries: Vec, + params: &SearchByIndexParams<'_>, + ) -> Result<(), ResponseError> { + let first_query_index = queries.first().map(|query| query.query_index); + let index = match params.index_scheduler.index(&index_uid) { + Ok(index) => index, + Err(err) => { + let mut err = ResponseError::from(err); + // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but + // here the resource not found is not part of the URL. 
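+ // prefix the message with the position of the first query that referenced this index,
+ // so the caller can tell which entry of `queries` triggered the failure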
+ err.code = StatusCode::BAD_REQUEST; + if let Some(query_index) = first_query_index { + err.message = format!("Inside `.queries[{}]`: {}", query_index, err.message); + } + return Err(err); + } + }; + let rtxn = index.read_txn()?; + let criteria = index.criteria(&rtxn)?; + let dictionary = index.dictionary(&rtxn)?; + let dictionary: Option> = + dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let separators = index.allowed_separators(&rtxn)?; + let separators: Option> = + separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let cutoff = index.search_cutoff(&rtxn)?; + let mut degraded = false; + let mut used_negative_operator = false; + let mut candidates = RoaringBitmap::new(); + let facets_by_index = self.federation.facets_by_index.remove(&index_uid).flatten(); + if let Err(mut error) = + self.facet_order.check_facet_order(&index_uid, &facets_by_index, &index, &rtxn) + { + error.message = format!( + "Inside `.federation.facetsByIndex.{index_uid}`: {error}{}", + if let Some(query_index) = first_query_index { + format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`") + } else { + Default::default() + } + ); + return Err(error); + } + let mut results_by_query = Vec::with_capacity(queries.len()); + for QueryByIndex { query, weight, query_index } in queries { + // use an immediately invoked lambda to capture the result without returning from the function + + let res: Result<(), ResponseError> = (|| { + let search_kind = + search_kind(&query, params.index_scheduler, index_uid.to_string(), &index)?; + + let canonicalization_kind = match (&search_kind, &query.q) { + (SearchKind::SemanticOnly { .. }, _) => { + ranking_rules::CanonicalizationKind::Vector + } + (_, Some(q)) if !q.is_empty() => ranking_rules::CanonicalizationKind::Keyword, + _ => ranking_rules::CanonicalizationKind::Placeholder, + }; + + let sort = if let Some(sort) = &query.sort { + let sorts: Vec<_> = + match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { + Ok(sorts) => sorts, + Err(asc_desc_error) => { + return Err(milli::Error::from(milli::SortError::from( + asc_desc_error, + )) + .into()) + } + }; + Some(sorts) + } else { + None + }; + + let ranking_rules = ranking_rules::RankingRules::new( + criteria.clone(), + sort, + query.matching_strategy.into(), + canonicalization_kind, + ); + + if let Some((previous_ranking_rules, previous_query_index, previous_index_uid)) = + self.previous_query_data.take() + { + if let Err(error) = ranking_rules.is_compatible_with(&previous_ranking_rules) { + return Err(error.to_response_error( + &ranking_rules, + &previous_ranking_rules, + query_index, + previous_query_index, + &index_uid, + &previous_index_uid, + )); + } + self.previous_query_data = if previous_ranking_rules.constraint_count() + > ranking_rules.constraint_count() + { + Some((previous_ranking_rules, previous_query_index, previous_index_uid)) + } else { + Some((ranking_rules, query_index, index_uid.clone())) + }; + } else { + self.previous_query_data = + Some((ranking_rules, query_index, index_uid.clone())); + } + + match search_kind { + SearchKind::KeywordOnly => {} + _ => self.semantic_hit_count = Some(0), + } + + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors); + + let time_budget = match cutoff { + Some(cutoff) => TimeBudget::new(Duration::from_millis(cutoff)), + None => TimeBudget::default(), + }; + + let (mut search, _is_finite_pagination, _max_total_hits, _offset) = prepare_search( + &index, + &rtxn, + &query, + &search_kind, + 
time_budget, + params.features, + )?; + + search.scoring_strategy(milli::score_details::ScoringStrategy::Detailed); + search.offset(0); + search.limit(params.required_hit_count); + + let (result, _semantic_hit_count) = + super::super::search_from_kind(index_uid.to_string(), search_kind, search)?; + let format = AttributesFormat { + attributes_to_retrieve: query.attributes_to_retrieve, + retrieve_vectors, + attributes_to_highlight: query.attributes_to_highlight, + attributes_to_crop: query.attributes_to_crop, + crop_length: query.crop_length, + crop_marker: query.crop_marker, + highlight_pre_tag: query.highlight_pre_tag, + highlight_post_tag: query.highlight_post_tag, + show_matches_position: query.show_matches_position, + sort: query.sort, + show_ranking_score: query.show_ranking_score, + show_ranking_score_details: query.show_ranking_score_details, + locales: query.locales.map(|l| l.iter().copied().map(Into::into).collect()), + }; + + let milli::SearchResult { + matching_words, + candidates: query_candidates, + documents_ids, + document_scores, + degraded: query_degraded, + used_negative_operator: query_used_negative_operator, + } = result; + + candidates |= query_candidates; + degraded |= query_degraded; + used_negative_operator |= query_used_negative_operator; + + let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); + + let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); + + let hit_maker = + HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| { + MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())) + })?; + + results_by_query.push(SearchResultByQuery { + weight, + hit_maker, + query_index, + documents_ids, + document_scores, + }); + Ok(()) + })(); + + if let Err(mut error) = res { + error.message = format!("Inside `.queries[{query_index}]`: {}", error.message); + return Err(error); + } + } + let mut documents_seen = RoaringBitmap::new(); + let merged_result: Result, ResponseError> = + merge_index_local_results(results_by_query) + // skip documents we've already seen & mark that we saw the current document + .filter(|SearchResultByQueryIterItem { docid, .. 
}| documents_seen.insert(*docid)) + .take(params.required_hit_count) + // 2.3 make hits + .map( + |SearchResultByQueryIterItem { + docid, + score, + weight, + hit_maker, + query_index, + }| { + let mut hit = hit_maker.make_hit(docid, &score)?; + let weighted_score = ScoreDetails::global_score(score.iter()) * (*weight); + + let mut _federation = serde_json::json!( + { + INDEX_UID: index_uid, + QUERIES_POSITION: query_index, + WEIGHTED_RANKING_SCORE: weighted_score, + } + ); + if params.has_remote && !params.is_proxy { + _federation.as_object_mut().unwrap().insert( + FEDERATION_REMOTE.to_string(), + params.network.local.clone().into(), + ); + } + if params.is_proxy { + _federation.as_object_mut().unwrap().insert( + WEIGHTED_SCORE_VALUES.to_string(), + serde_json::json!(ScoreDetails::weighted_score_values( + score.iter(), + *weight + ) + .collect_vec()), + ); + } + hit.document.insert(FEDERATION_HIT.to_string(), _federation); + Ok(SearchHitByIndex { hit, score, weight, query_index }) + }, + ) + .collect(); + let merged_result = merged_result?; + let estimated_total_hits = candidates.len() as usize; + let facets = facets_by_index + .map(|facets_by_index| { + compute_facet_distribution_stats( + &facets_by_index, + &index, + &rtxn, + candidates, + super::super::Route::MultiSearch, + ) + }) + .transpose() + .map_err(|mut error| { + error.message = format!( + "Inside `.federation.facetsByIndex.{index_uid}`: {}{}", + error.message, + if let Some(query_index) = first_query_index { + format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`") + } else { + Default::default() + } + ); + error + })?; + self.results_by_index.push(SearchResultByIndex { + index: index_uid, + hits: merged_result, + estimated_total_hits, + degraded, + used_negative_operator, + facets, + }); + Ok(()) + } + + fn check_unused_facets( + &mut self, + index_scheduler: &IndexScheduler, + ) -> Result<(), ResponseError> { + for (index_uid, facets) in std::mem::take(&mut self.federation.facets_by_index) { + let index = match index_scheduler.index(&index_uid) { + Ok(index) => index, + Err(err) => { + let mut err = ResponseError::from(err); + // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but + // here the resource not found is not part of the URL. 
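+ // this index only appears in `federation.facetsByIndex`, so point the error at that
+ // field and note that no query actually targets the index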
+ err.code = StatusCode::BAD_REQUEST; + err.message = format!( + "Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", + err.message + ); + return Err(err); + } + }; + + // Important: this is the only transaction we'll use for this index during this federated search + let rtxn = index.read_txn()?; + + if let Err(mut error) = + self.facet_order.check_facet_order(&index_uid, &facets, &index, &rtxn) + { + error.message = format!( + "Inside `.federation.facetsByIndex.{index_uid}`: {error}\n - Note: index `{index_uid}` is not used in queries", + ); + return Err(error); + } + + if let Some(facets) = facets { + if let Err(mut error) = compute_facet_distribution_stats( + &facets, + &index, + &rtxn, + Default::default(), + super::super::Route::MultiSearch, + ) { + error.message = + format!("Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", error.message); + return Err(error); + } + } + } + Ok(()) + } +} + +enum FacetOrder { + /// The order is stored by facet to be able to merge facets regardless of index of origin + /// + /// - key: facet name + /// - value: (first_index_name, first_index_order) + /// + /// We store the name of the first index where the facet is present as well as its order, + /// so that if encountering the same facet in a different index we can compare the order and send + /// a readable error. + ByFacet(BTreeMap), + /// The order is stored by index to be able to merge facets regardless of the remote of origin. + /// + /// This variant is only used when `is_remote = true`, and always used in that case. + /// + /// - key: index name + /// - value: (order_by_map, max_values_per_facet) + /// + /// We store a map of the order per facet for that index, as well as the max values per facet. + /// Both are retrieved from the settings of the local version of the index. + /// + /// It is not possible to have an index only existing in the remotes, because as of now all indexes that appear + /// in `federation.facetsByIndex` must exist on all hosts. + ByIndex(BTreeMap), + /// Do not merge facets. Used when `federation.mergeFacets = null` and `!has_remote` + None, +} + +type FacetDistributions = BTreeMap>; +type FacetStats = BTreeMap; + +impl FacetOrder { + fn check_facet_order( + &mut self, + current_index: &str, + facets_by_index: &Option>, + index: &milli::Index, + rtxn: &milli::heed::RoTxn<'_>, + ) -> Result<(), ResponseError> { + match self { + FacetOrder::ByFacet(facet_order) => { + if let Some(facets_by_index) = facets_by_index { + let index_facet_order = index.sort_facet_values_by(rtxn)?; + for facet in facets_by_index { + let index_facet_order = index_facet_order.get(facet); + let (previous_index, previous_facet_order) = facet_order + .entry(facet.to_owned()) + .or_insert_with(|| (current_index.to_owned(), index_facet_order)); + if previous_facet_order != &index_facet_order { + return Err(MeilisearchHttpError::InconsistentFacetOrder { + facet: facet.clone(), + previous_facet_order: *previous_facet_order, + previous_uid: previous_index.clone(), + current_uid: current_index.to_owned(), + index_facet_order, + } + .into()); + } + } + } + } + FacetOrder::ByIndex(order_by_index) => { + let max_values_per_facet = index + .max_values_per_facet(rtxn)? 
+ .map(|x| x as usize) + .unwrap_or(DEFAULT_VALUES_PER_FACET); + order_by_index.insert( + current_index.to_owned(), + (index.sort_facet_values_by(rtxn)?, max_values_per_facet), + ); + } + FacetOrder::None => {} + } + Ok(()) + } + + fn merge( + self, + merge_facets: Option, + remote_results: Vec, + mut facets: FederatedFacets, + ) -> (Option, Option, FederatedFacets) { + let (facet_distribution, facet_stats, facets_by_index) = match (self, merge_facets) { + (FacetOrder::ByFacet(facet_order), Some(merge_facets)) => { + for remote_facets_by_index in + remote_results.into_iter().map(|result| result.facets_by_index) + { + facets.append(remote_facets_by_index); + } + let facets = facets.merge(merge_facets, facet_order); + + let (facet_distribution, facet_stats) = facets + .map(|ComputedFacets { distribution, stats }| (distribution, stats)) + .unzip(); + + (facet_distribution, facet_stats, FederatedFacets::default()) + } + (FacetOrder::ByIndex(facet_order), _) => { + for remote_facets_by_index in + remote_results.into_iter().map(|result| result.facets_by_index) + { + facets.append(remote_facets_by_index); + } + facets.sort_and_truncate(facet_order); + (None, None, facets) + } + _ => (None, None, facets), + }; + (facet_distribution, facet_stats, facets_by_index) + } +} diff --git a/crates/meilisearch/src/search/federated/proxy.rs b/crates/meilisearch/src/search/federated/proxy.rs new file mode 100644 index 000000000..bf954693c --- /dev/null +++ b/crates/meilisearch/src/search/federated/proxy.rs @@ -0,0 +1,267 @@ +pub use error::ProxySearchError; +use error::ReqwestErrorWithoutUrl; +use meilisearch_types::features::Remote; +use rand::Rng as _; +use reqwest::{Client, Response, StatusCode}; +use serde::de::DeserializeOwned; +use serde_json::Value; + +use super::types::{FederatedSearch, FederatedSearchResult, Federation}; +use crate::search::SearchQueryWithIndex; + +pub const PROXY_SEARCH_HEADER: &str = "Meili-Proxy-Search"; +pub const PROXY_SEARCH_HEADER_VALUE: &str = "true"; + +mod error { + use meilisearch_types::error::ResponseError; + use reqwest::StatusCode; + + #[derive(Debug, thiserror::Error)] + pub enum ProxySearchError { + #[error("{0}")] + CouldNotSendRequest(ReqwestErrorWithoutUrl), + #[error("could not authenticate against the remote host\n - hint: check that the remote instance was registered with a valid API key having the `search` action")] + AuthenticationError, + #[error( + "could not parse response from the remote host as a federated search response{}\n - hint: check that the remote instance is a Meilisearch instance running the same version", + response_from_remote(response) + )] + CouldNotParseResponse { response: Result }, + #[error("remote host responded with code {}{}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", status_code.as_u16(), response_from_remote(response))] + BadRequest { status_code: StatusCode, response: Result }, + #[error("remote host did not answer before the deadline")] + Timeout, + #[error("remote hit does not contain `{0}`\n - hint: check that the remote instance is a Meilisearch instance running the same version")] + MissingPathInResponse(&'static str), + #[error("remote host responded with code {}{}", status_code.as_u16(), response_from_remote(response))] + RemoteError { status_code: StatusCode, response: Result }, + #[error("remote hit contains an unexpected value at path `{path}`: expected {expected_type}, received 
`{received_value}`\n - hint: check that the remote instance is a Meilisearch instance running the same version")]
+        UnexpectedValueInPath {
+            path: &'static str,
+            expected_type: &'static str,
+            received_value: String,
+        },
+        #[error("could not parse weighted score values in the remote hit: {0}")]
+        CouldNotParseWeightedScoreValues(serde_json::Error),
+    }
+
+    impl ProxySearchError {
+        pub fn as_response_error(&self) -> ResponseError {
+            use meilisearch_types::error::Code;
+            let message = self.to_string();
+            let code = match self {
+                ProxySearchError::CouldNotSendRequest(_) => Code::RemoteCouldNotSendRequest,
+                ProxySearchError::AuthenticationError => Code::RemoteInvalidApiKey,
+                ProxySearchError::BadRequest { .. } => Code::RemoteBadRequest,
+                ProxySearchError::Timeout => Code::RemoteTimeout,
+                ProxySearchError::RemoteError { .. } => Code::RemoteRemoteError,
+                ProxySearchError::CouldNotParseResponse { .. }
+                | ProxySearchError::MissingPathInResponse(_)
+                | ProxySearchError::UnexpectedValueInPath { .. }
+                | ProxySearchError::CouldNotParseWeightedScoreValues(_) => Code::RemoteBadResponse,
+            };
+            ResponseError::from_msg(message, code)
+        }
+    }
+
+    #[derive(Debug, thiserror::Error)]
+    #[error(transparent)]
+    pub struct ReqwestErrorWithoutUrl(reqwest::Error);
+    impl ReqwestErrorWithoutUrl {
+        pub fn new(inner: reqwest::Error) -> Self {
+            Self(inner.without_url())
+        }
+    }
+
+    fn response_from_remote(response: &Result<String, ReqwestErrorWithoutUrl>) -> String {
+        match response {
+            Ok(response) => {
+                format!(":\n - response from remote: {}", response)
+            }
+            Err(error) => {
+                format!(":\n - additionally, could not retrieve response from remote: {error}")
+            }
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct ProxySearchParams {
+    pub deadline: Option<std::time::Instant>,
+    pub try_count: u32,
+    pub client: reqwest::Client,
+}
+
+/// Performs a federated search on a remote host and returns the results
+pub async fn proxy_search(
+    node: &Remote,
+    queries: Vec<SearchQueryWithIndex>,
+    federation: Federation,
+    params: &ProxySearchParams,
+) -> Result<FederatedSearchResult, ProxySearchError> {
+    let url = format!("{}/multi-search", node.url);
+
+    let federated = FederatedSearch { queries, federation: Some(federation) };
+
+    let search_api_key = node.search_api_key.as_deref();
+
+    let max_deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
+
+    let deadline = if let Some(deadline) = params.deadline {
+        std::time::Instant::min(deadline, max_deadline)
+    } else {
+        max_deadline
+    };
+
+    for i in 0..params.try_count {
+        match try_proxy_search(&url, search_api_key, &federated, &params.client, deadline).await {
+            Ok(response) => return Ok(response),
+            Err(retry) => {
+                let duration = retry.into_duration(i)?;
+                tokio::time::sleep(duration).await;
+            }
+        }
+    }
+    try_proxy_search(&url, search_api_key, &federated, &params.client, deadline)
+        .await
+        .map_err(Retry::into_error)
+}
+
+async fn try_proxy_search(
+    url: &str,
+    search_api_key: Option<&str>,
+    federated: &FederatedSearch,
+    client: &Client,
+    deadline: std::time::Instant,
+) -> Result<FederatedSearchResult, Retry> {
+    let timeout = deadline.saturating_duration_since(std::time::Instant::now());
+
+    let request = client.post(url).json(&federated).timeout(timeout);
+    let request = if let Some(search_api_key) = search_api_key {
+        request.bearer_auth(search_api_key)
+    } else {
+        request
+    };
+    let request = request.header(PROXY_SEARCH_HEADER, PROXY_SEARCH_HEADER_VALUE);
+
+    let response = request.send().await;
+    let response = match response {
+        Ok(response) => response,
+        Err(error) if error.is_timeout() => return Err(Retry::give_up(ProxySearchError::Timeout)),
+        Err(error) => {
+            return
Err(Retry::retry_later(ProxySearchError::CouldNotSendRequest( + ReqwestErrorWithoutUrl::new(error), + ))) + } + }; + + match response.status() { + status_code if status_code.is_success() => (), + StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { + return Err(Retry::give_up(ProxySearchError::AuthenticationError)) + } + status_code if status_code.is_client_error() => { + let response = parse_error(response).await; + return Err(Retry::give_up(ProxySearchError::BadRequest { status_code, response })); + } + status_code if status_code.is_server_error() => { + let response = parse_error(response).await; + return Err(Retry::retry_later(ProxySearchError::RemoteError { + status_code, + response, + })); + } + status_code => { + tracing::warn!( + status_code = status_code.as_u16(), + "remote replied with unexpected status code" + ); + } + } + + let response = match parse_response(response).await { + Ok(response) => response, + Err(response) => { + return Err(Retry::retry_later(ProxySearchError::CouldNotParseResponse { response })) + } + }; + + Ok(response) +} + +/// Always parse the body of the response of a failed request as JSON. +async fn parse_error(response: Response) -> Result { + let bytes = match response.bytes().await { + Ok(bytes) => bytes, + Err(error) => return Err(ReqwestErrorWithoutUrl::new(error)), + }; + + Ok(parse_bytes_as_error(&bytes)) +} + +fn parse_bytes_as_error(bytes: &[u8]) -> String { + match serde_json::from_slice::(bytes) { + Ok(value) => value.to_string(), + Err(_) => String::from_utf8_lossy(bytes).into_owned(), + } +} + +async fn parse_response( + response: Response, +) -> Result> { + let bytes = match response.bytes().await { + Ok(bytes) => bytes, + Err(error) => return Err(Err(ReqwestErrorWithoutUrl::new(error))), + }; + + match serde_json::from_slice::(&bytes) { + Ok(value) => Ok(value), + Err(_) => Err(Ok(parse_bytes_as_error(&bytes))), + } +} + +pub struct Retry { + error: ProxySearchError, + strategy: RetryStrategy, +} + +pub enum RetryStrategy { + GiveUp, + Retry, +} + +impl Retry { + pub fn give_up(error: ProxySearchError) -> Self { + Self { error, strategy: RetryStrategy::GiveUp } + } + + pub fn retry_later(error: ProxySearchError) -> Self { + Self { error, strategy: RetryStrategy::Retry } + } + + pub fn into_duration(self, attempt: u32) -> Result { + match self.strategy { + RetryStrategy::GiveUp => Err(self.error), + RetryStrategy::Retry => { + let retry_duration = std::time::Duration::from_nanos((10u64).pow(attempt)); + let retry_duration = retry_duration.min(std::time::Duration::from_millis(100)); // don't wait more than 100ms + + // randomly up to double the retry duration + let retry_duration = retry_duration + + rand::thread_rng().gen_range(std::time::Duration::ZERO..retry_duration); + + tracing::warn!( + "Attempt #{}, failed with {}, retrying after {}ms.", + attempt, + self.error, + retry_duration.as_millis() + ); + Ok(retry_duration) + } + } + } + + pub fn into_error(self) -> ProxySearchError { + self.error + } +} diff --git a/crates/meilisearch/src/search/federated/types.rs b/crates/meilisearch/src/search/federated/types.rs new file mode 100644 index 000000000..804df8d31 --- /dev/null +++ b/crates/meilisearch/src/search/federated/types.rs @@ -0,0 +1,322 @@ +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::fmt; +use std::vec::Vec; + +use indexmap::IndexMap; +use meilisearch_types::deserr::DeserrJsonError; +use meilisearch_types::error::deserr_codes::{ + InvalidMultiSearchFacetsByIndex, 
InvalidMultiSearchMaxValuesPerFacet, + InvalidMultiSearchMergeFacets, InvalidMultiSearchQueryPosition, InvalidMultiSearchRemote, + InvalidMultiSearchWeight, InvalidSearchLimit, InvalidSearchOffset, +}; +use meilisearch_types::error::ResponseError; +use meilisearch_types::index_uid::IndexUid; +use meilisearch_types::milli::order_by_map::OrderByMap; +use meilisearch_types::milli::OrderBy; +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +use super::super::{ComputedFacets, FacetStats, HitsInfo, SearchHit, SearchQueryWithIndex}; + +pub const DEFAULT_FEDERATED_WEIGHT: f64 = 1.0; + +// fields in the response +pub const FEDERATION_HIT: &str = "_federation"; +pub const INDEX_UID: &str = "indexUid"; +pub const QUERIES_POSITION: &str = "queriesPosition"; +pub const WEIGHTED_RANKING_SCORE: &str = "weightedRankingScore"; +pub const WEIGHTED_SCORE_VALUES: &str = "weightedScoreValues"; +pub const FEDERATION_REMOTE: &str = "remote"; + +#[derive(Debug, Default, Clone, PartialEq, Serialize, deserr::Deserr, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] + +pub struct FederationOptions { + #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = f64)] + pub weight: Weight, + + #[deserr(default, error = DeserrJsonError)] + pub remote: Option, + + #[deserr(default, error = DeserrJsonError)] + pub query_position: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Serialize, deserr::Deserr)] +#[deserr(try_from(f64) = TryFrom::try_from -> InvalidMultiSearchWeight)] +pub struct Weight(f64); + +impl Default for Weight { + fn default() -> Self { + Weight(DEFAULT_FEDERATED_WEIGHT) + } +} + +impl std::convert::TryFrom for Weight { + type Error = InvalidMultiSearchWeight; + + fn try_from(f: f64) -> Result { + if f < 0.0 { + Err(InvalidMultiSearchWeight) + } else { + Ok(Weight(f)) + } + } +} + +impl std::ops::Deref for Weight { + type Target = f64; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[derive(Debug, Clone, deserr::Deserr, Serialize, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] +#[serde(rename_all = "camelCase")] +pub struct Federation { + #[deserr(default = super::super::DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError)] + pub limit: usize, + #[deserr(default = super::super::DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] + pub offset: usize, + #[deserr(default, error = DeserrJsonError)] + pub facets_by_index: BTreeMap>>, + #[deserr(default, error = DeserrJsonError)] + pub merge_facets: Option, +} + +#[derive(Copy, Clone, Debug, deserr::Deserr, Serialize, Default, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] +#[serde(rename_all = "camelCase")] +pub struct MergeFacets { + #[deserr(default, error = DeserrJsonError)] + pub max_values_per_facet: Option, +} + +#[derive(Debug, deserr::Deserr, Serialize, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] +#[serde(rename_all = "camelCase")] +pub struct FederatedSearch { + pub queries: Vec, + #[deserr(default)] + pub federation: Option, +} + +#[derive(Serialize, Deserialize, Clone, ToSchema)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct FederatedSearchResult { + pub hits: Vec, + pub processing_time_ms: u128, + #[serde(flatten)] + pub hits_info: HitsInfo, + + 
#[serde(default, skip_serializing_if = "Option::is_none")] + pub semantic_hit_count: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + #[schema(value_type = Option>>)] + pub facet_distribution: Option>>, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub facet_stats: Option>, + #[serde(default, skip_serializing_if = "FederatedFacets::is_empty")] + pub facets_by_index: FederatedFacets, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub remote_errors: Option>, + + // These fields are only used for analytics purposes + #[serde(skip)] + pub degraded: bool, + #[serde(skip)] + pub used_negative_operator: bool, +} + +impl fmt::Debug for FederatedSearchResult { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let FederatedSearchResult { + hits, + processing_time_ms, + hits_info, + semantic_hit_count, + degraded, + used_negative_operator, + facet_distribution, + facet_stats, + facets_by_index, + remote_errors, + } = self; + + let mut debug = f.debug_struct("SearchResult"); + // The most important thing when looking at a search result is the time it took to process + debug.field("processing_time_ms", &processing_time_ms); + debug.field("hits", &format!("[{} hits returned]", hits.len())); + debug.field("hits_info", &hits_info); + if *used_negative_operator { + debug.field("used_negative_operator", used_negative_operator); + } + if *degraded { + debug.field("degraded", degraded); + } + if let Some(facet_distribution) = facet_distribution { + debug.field("facet_distribution", &facet_distribution); + } + if let Some(facet_stats) = facet_stats { + debug.field("facet_stats", &facet_stats); + } + if let Some(semantic_hit_count) = semantic_hit_count { + debug.field("semantic_hit_count", &semantic_hit_count); + } + if !facets_by_index.is_empty() { + debug.field("facets_by_index", &facets_by_index); + } + if let Some(remote_errors) = remote_errors { + debug.field("remote_errors", &remote_errors); + } + + debug.finish() + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)] +pub struct FederatedFacets(pub BTreeMap); + +impl FederatedFacets { + pub fn insert(&mut self, index: String, facets: Option) { + if let Some(facets) = facets { + self.0.insert(index, facets); + } + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn merge( + self, + MergeFacets { max_values_per_facet }: MergeFacets, + facet_order: BTreeMap, + ) -> Option { + if self.is_empty() { + return None; + } + + let mut distribution: BTreeMap = Default::default(); + let mut stats: BTreeMap = Default::default(); + + for facets_by_index in self.0.into_values() { + for (facet, index_distribution) in facets_by_index.distribution { + match distribution.entry(facet) { + Entry::Vacant(entry) => { + entry.insert(index_distribution); + } + Entry::Occupied(mut entry) => { + let distribution = entry.get_mut(); + + for (value, index_count) in index_distribution { + distribution + .entry(value) + .and_modify(|count| *count += index_count) + .or_insert(index_count); + } + } + } + } + + for (facet, index_stats) in facets_by_index.stats { + match stats.entry(facet) { + Entry::Vacant(entry) => { + entry.insert(index_stats); + } + Entry::Occupied(mut entry) => { + let stats = entry.get_mut(); + + stats.min = f64::min(stats.min, index_stats.min); + stats.max = f64::max(stats.max, index_stats.max); + } + } + } + } + + // fixup order + for (facet, values) in &mut distribution { + let order_by = facet_order.get(facet).map(|(_, order)| 
*order).unwrap_or_default(); + + match order_by { + OrderBy::Lexicographic => { + values.sort_unstable_by(|left, _, right, _| left.cmp(right)) + } + OrderBy::Count => { + values.sort_unstable_by(|_, left, _, right| { + left.cmp(right) + // biggest first + .reverse() + }) + } + } + + if let Some(max_values_per_facet) = max_values_per_facet { + values.truncate(max_values_per_facet) + }; + } + + Some(ComputedFacets { distribution, stats }) + } + + pub(crate) fn append(&mut self, FederatedFacets(remote_facets_by_index): FederatedFacets) { + for (index, remote_facets) in remote_facets_by_index { + let merged_facets = self.0.entry(index).or_default(); + + for (remote_facet, remote_stats) in remote_facets.stats { + match merged_facets.stats.entry(remote_facet) { + Entry::Vacant(vacant_entry) => { + vacant_entry.insert(remote_stats); + } + Entry::Occupied(mut occupied_entry) => { + let stats = occupied_entry.get_mut(); + stats.min = f64::min(stats.min, remote_stats.min); + stats.max = f64::max(stats.max, remote_stats.max); + } + } + } + + for (remote_facet, remote_values) in remote_facets.distribution { + let merged_facet = merged_facets.distribution.entry(remote_facet).or_default(); + for (remote_value, remote_count) in remote_values { + let count = merged_facet.entry(remote_value).or_default(); + *count += remote_count; + } + } + } + } + + pub fn sort_and_truncate(&mut self, facet_order: BTreeMap) { + for (index, facets) in &mut self.0 { + let Some((order_by, max_values_per_facet)) = facet_order.get(index) else { + continue; + }; + for (facet, values) in &mut facets.distribution { + match order_by.get(facet) { + OrderBy::Lexicographic => { + values.sort_unstable_by(|left, _, right, _| left.cmp(right)) + } + OrderBy::Count => { + values.sort_unstable_by(|_, left, _, right| { + left.cmp(right) + // biggest first + .reverse() + }) + } + } + values.truncate(*max_values_per_facet); + } + } + } +} diff --git a/crates/meilisearch/src/search/federated/weighted_scores.rs b/crates/meilisearch/src/search/federated/weighted_scores.rs new file mode 100644 index 000000000..899940a31 --- /dev/null +++ b/crates/meilisearch/src/search/federated/weighted_scores.rs @@ -0,0 +1,88 @@ +use std::cmp::Ordering; + +use meilisearch_types::milli::score_details::{self, WeightedScoreValue}; + +pub fn compare( + mut left_it: impl Iterator, + left_weighted_global_score: f64, + mut right_it: impl Iterator, + right_weighted_global_score: f64, +) -> Ordering { + loop { + let left = left_it.next(); + let right = right_it.next(); + + match (left, right) { + (None, None) => return Ordering::Equal, + (None, Some(_)) => return Ordering::Less, + (Some(_), None) => return Ordering::Greater, + ( + Some( + WeightedScoreValue::WeightedScore(left) | WeightedScoreValue::VectorSort(left), + ), + Some( + WeightedScoreValue::WeightedScore(right) + | WeightedScoreValue::VectorSort(right), + ), + ) => { + if (left - right).abs() <= f64::EPSILON { + continue; + } + return left.partial_cmp(&right).unwrap(); + } + ( + Some(WeightedScoreValue::Sort { asc: left_asc, value: left }), + Some(WeightedScoreValue::Sort { asc: right_asc, value: right }), + ) => { + if left_asc != right_asc { + return left_weighted_global_score + .partial_cmp(&right_weighted_global_score) + .unwrap(); + } + match score_details::compare_sort_values(left_asc, &left, &right) { + Ordering::Equal => continue, + order => return order, + } + } + ( + Some(WeightedScoreValue::GeoSort { asc: left_asc, distance: left }), + Some(WeightedScoreValue::GeoSort { asc: right_asc, distance: 
right }), + ) => { + if left_asc != right_asc { + continue; + } + match (left, right) { + (None, None) => continue, + (None, Some(_)) => return Ordering::Less, + (Some(_), None) => return Ordering::Greater, + (Some(left), Some(right)) => { + if (left - right).abs() <= f64::EPSILON { + continue; + } + return left.partial_cmp(&right).unwrap(); + } + } + } + // not comparable details, use global + (Some(WeightedScoreValue::WeightedScore(_)), Some(_)) + | (Some(_), Some(WeightedScoreValue::WeightedScore(_))) + | (Some(WeightedScoreValue::VectorSort(_)), Some(_)) + | (Some(_), Some(WeightedScoreValue::VectorSort(_))) + | (Some(WeightedScoreValue::GeoSort { .. }), Some(WeightedScoreValue::Sort { .. })) + | (Some(WeightedScoreValue::Sort { .. }), Some(WeightedScoreValue::GeoSort { .. })) => { + let left_count = left_it.count(); + let right_count = right_it.count(); + // compare how many remaining groups of rules each side has. + // the group with the most remaining groups wins. + return left_count + .cmp(&right_count) + // breaks ties with the global ranking score + .then_with(|| { + left_weighted_global_score + .partial_cmp(&right_weighted_global_score) + .unwrap() + }); + } + } + } +} diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index aab8ae919..2091047fc 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -30,7 +30,7 @@ use milli::{ MatchBounds, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; use regex::Regex; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; #[cfg(test)] mod mod_test; @@ -41,7 +41,7 @@ use crate::error::MeilisearchHttpError; mod federated; pub use federated::{ perform_federated_search, FederatedSearch, FederatedSearchResult, Federation, - FederationOptions, MergeFacets, + FederationOptions, MergeFacets, PROXY_SEARCH_HEADER, PROXY_SEARCH_HEADER_VALUE, }; mod ranking_rules; @@ -119,7 +119,7 @@ pub struct SearchQuery { pub locales: Option>, } -#[derive(Debug, Clone, Copy, PartialEq, Deserr, ToSchema)] +#[derive(Debug, Clone, Copy, PartialEq, Deserr, ToSchema, Serialize)] #[deserr(try_from(f64) = TryFrom::try_from -> InvalidSearchRankingScoreThreshold)] pub struct RankingScoreThreshold(f64); impl std::convert::TryFrom for RankingScoreThreshold { @@ -275,11 +275,13 @@ impl fmt::Debug for SearchQuery { } } -#[derive(Debug, Clone, Default, PartialEq, Deserr, ToSchema)] +#[derive(Debug, Clone, Default, PartialEq, Deserr, ToSchema, Serialize)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] pub struct HybridQuery { #[deserr(default, error = DeserrJsonError, default)] #[schema(value_type = f32, default)] + #[serde(default)] pub semantic_ratio: SemanticRatio, #[deserr(error = DeserrJsonError)] pub embedder: String, @@ -369,7 +371,7 @@ impl SearchKind { } } -#[derive(Debug, Clone, Copy, PartialEq, Deserr)] +#[derive(Debug, Clone, Copy, PartialEq, Deserr, Serialize)] #[deserr(try_from(f32) = TryFrom::try_from -> InvalidSearchSemanticRatio)] pub struct SemanticRatio(f32); @@ -411,8 +413,9 @@ impl SearchQuery { // This struct contains the fields of `SearchQuery` inline. // This is because neither deserr nor serde support `flatten` when using `deny_unknown_fields. // The `From` implementation ensures both structs remain up to date. 
-#[derive(Debug, Clone, PartialEq, Deserr, ToSchema)] +#[derive(Debug, Clone, Serialize, PartialEq, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] #[schema(rename_all = "camelCase")] pub struct SearchQueryWithIndex { #[deserr(error = DeserrJsonError, missing_field_error = DeserrJsonError::missing_index_uid)] @@ -493,6 +496,72 @@ impl SearchQueryWithIndex { self.facets.as_deref().filter(|v| !v.is_empty()) } + pub fn from_index_query_federation( + index_uid: IndexUid, + query: SearchQuery, + federation_options: Option, + ) -> Self { + let SearchQuery { + q, + vector, + hybrid, + offset, + limit, + page, + hits_per_page, + attributes_to_retrieve, + retrieve_vectors, + attributes_to_crop, + crop_length, + attributes_to_highlight, + show_matches_position, + show_ranking_score, + show_ranking_score_details, + filter, + sort, + distinct, + facets, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + ranking_score_threshold, + locales, + } = query; + + SearchQueryWithIndex { + index_uid, + q, + vector, + hybrid, + offset: if offset == DEFAULT_SEARCH_OFFSET() { None } else { Some(offset) }, + limit: if limit == DEFAULT_SEARCH_LIMIT() { None } else { Some(limit) }, + page, + hits_per_page, + attributes_to_retrieve, + retrieve_vectors, + attributes_to_crop, + crop_length, + attributes_to_highlight, + show_ranking_score, + show_ranking_score_details, + show_matches_position, + filter, + sort, + distinct, + facets, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + ranking_score_threshold, + locales, + federation_options, + } + } + pub fn into_index_query_federation(self) -> (IndexUid, SearchQuery, Option) { let SearchQueryWithIndex { index_uid, @@ -620,8 +689,9 @@ impl TryFrom for ExternalDocumentId { } } -#[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr, ToSchema)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr, ToSchema, Serialize)] #[deserr(rename_all = camelCase)] +#[serde(rename_all = "camelCase")] pub enum MatchingStrategy { /// Remove query words from last to first Last, @@ -667,19 +737,19 @@ impl From for OrderBy { } } -#[derive(Debug, Clone, Serialize, PartialEq, ToSchema)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)] pub struct SearchHit { #[serde(flatten)] #[schema(additional_properties, inline, value_type = HashMap)] pub document: Document, - #[serde(rename = "_formatted", skip_serializing_if = "Document::is_empty")] + #[serde(default, rename = "_formatted", skip_serializing_if = "Document::is_empty")] #[schema(additional_properties, value_type = HashMap)] pub formatted: Document, - #[serde(rename = "_matchesPosition", skip_serializing_if = "Option::is_none")] + #[serde(default, rename = "_matchesPosition", skip_serializing_if = "Option::is_none")] pub matches_position: Option, - #[serde(rename = "_rankingScore", skip_serializing_if = "Option::is_none")] + #[serde(default, rename = "_rankingScore", skip_serializing_if = "Option::is_none")] pub ranking_score: Option, - #[serde(rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")] + #[serde(default, rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")] pub ranking_score_details: Option>, } @@ -767,7 +837,7 @@ pub struct SearchResultWithIndex { pub result: SearchResult, } -#[derive(Serialize, Debug, Clone, PartialEq, Eq, ToSchema)] +#[derive(Serialize, Deserialize, Debug, 
Clone, PartialEq, Eq, ToSchema)] #[serde(untagged)] pub enum HitsInfo { #[serde(rename_all = "camelCase")] @@ -778,7 +848,7 @@ pub enum HitsInfo { OffsetLimit { limit: usize, offset: usize, estimated_total_hits: usize }, } -#[derive(Serialize, Debug, Clone, PartialEq, ToSchema)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, ToSchema)] pub struct FacetStats { pub min: f64, pub max: f64, @@ -1061,7 +1131,7 @@ pub fn perform_search( Ok(result) } -#[derive(Debug, Clone, Default, Serialize, ToSchema)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)] pub struct ComputedFacets { #[schema(value_type = BTreeMap>)] pub distribution: BTreeMap>, diff --git a/crates/meilisearch/tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump b/crates/meilisearch/tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump new file mode 100644 index 000000000..a1816b79a Binary files /dev/null and b/crates/meilisearch/tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump differ diff --git a/crates/meilisearch/tests/auth/api_keys.rs b/crates/meilisearch/tests/auth/api_keys.rs index 253929428..0aea7d722 100644 --- a/crates/meilisearch/tests/auth/api_keys.rs +++ b/crates/meilisearch/tests/auth/api_keys.rs @@ -421,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###" { - "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`", + "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/auth/authorization.rs b/crates/meilisearch/tests/auth/authorization.rs index 609b7d01b..277911fb8 100644 --- a/crates/meilisearch/tests/auth/authorization.rs +++ b/crates/meilisearch/tests/auth/authorization.rs @@ -68,6 +68,8 @@ pub static AUTHORIZATIONS: Lazy hashset!{"keys.get", "*"}, ("GET", "/experimental-features") => hashset!{"experimental.get", "*"}, ("PATCH", "/experimental-features") => hashset!{"experimental.update", "*"}, + ("GET", "/network") => hashset!{"network.get", "*"}, + ("PATCH", "/network") => hashset!{"network.update", "*"}, }; authorizations diff --git a/crates/meilisearch/tests/auth/errors.rs b/crates/meilisearch/tests/auth/errors.rs index c063b2aac..0e8968ef0 100644 
--- a/crates/meilisearch/tests/auth/errors.rs +++ b/crates/meilisearch/tests/auth/errors.rs @@ -93,7 +93,7 @@ async fn create_api_key_bad_actions() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`", + "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index 70307ac25..6ef40be8e 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -41,9 +41,8 @@ async fn list_batches() { let index = server.index("test"); let (task, _status_code) = index.create(None).await; index.wait_task(task.uid()).await.succeeded(); - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.failed(); let (response, code) = index.list_batches().await; assert_eq!(code, 200); assert_eq!( @@ -96,11 +95,12 @@ async fn list_batches_pagination_and_reverse() { async fn list_batches_with_star_filters() { let server = Server::new().await; let index = server.index("test"); - let (batch, _code) = index.create(None).await; - index.wait_task(batch.uid()).await.succeeded(); - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let index = server.index("test"); + let (task, _code) = index.create(None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.service.get("/batches?indexUids=test").await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 2); @@ -187,9 +187,6 @@ async fn list_batches_invalid_canceled_by_filter() { let index = server.index("test"); let (task, _status_code) = index.create(None).await; index.wait_task(task.uid()).await.succeeded(); - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; let (response, code) = index.filtered_batches(&[], &[], &["0"]).await; assert_eq!(code, 200, "{}", response); @@ -202,9 +199,8 @@ async fn list_batches_status_and_type_filtered() { let index = 
server.index("test"); let (task, _status_code) = index.create(None).await; index.wait_task(task.uid()).await.succeeded(); - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _status_code) = index.update(Some("id")).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.filtered_batches(&["indexCreation"], &["failed"], &[]).await; assert_eq!(code, 200, "{}", response); @@ -212,7 +208,7 @@ async fn list_batches_status_and_type_filtered() { let (response, code) = index .filtered_batches( - &["indexCreation", "documentAdditionOrUpdate"], + &["indexCreation", "IndexUpdate"], &["succeeded", "processing", "enqueued"], &[], ) diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index 1f42ed2ae..f78542db1 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -88,6 +88,10 @@ impl Server { self.service.api_key = Some(api_key.as_ref().to_string()); } + pub fn clear_api_key(&mut self) { + self.service.api_key = None; + } + /// Fetch and use the default admin key for nexts http requests. pub async fn use_admin_key(&mut self, master_key: impl AsRef) { self.use_api_key(master_key); @@ -159,10 +163,18 @@ impl Server { self.service.get("/tasks").await } + pub async fn batches(&self) -> (Value, StatusCode) { + self.service.get("/batches").await + } + pub async fn set_features(&self, value: Value) -> (Value, StatusCode) { self.service.patch("/experimental-features", value).await } + pub async fn set_network(&self, value: Value) -> (Value, StatusCode) { + self.service.patch("/network", value).await + } + pub async fn get_metrics(&self) -> (Value, StatusCode) { self.service.get("/metrics").await } @@ -408,6 +420,10 @@ impl Server { pub async fn get_features(&self) -> (Value, StatusCode) { self.service.get("/experimental-features").await } + + pub async fn get_network(&self) -> (Value, StatusCode) { + self.service.get("/network").await + } } pub fn default_settings(dir: impl AsRef) -> Opt { diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index 67dc87ad3..ad8bae19f 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -1803,6 +1803,275 @@ async fn add_documents_with_geo_field() { "finishedAt": "[date]" } "###); + + let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; + + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "results": [ + { + "id": "1" + }, + { + "id": "2", + "_geo": null + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + } + }, + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) + .await; + snapshot!(code, @"200 OK"); + // we are expecting docs 4 and 3 first as they have geo + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), + @r###" + { + "hits": [ + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + }, + "_geoDistance": 5522018 + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + }, + "_geoDistance": 5522018 + }, + { + "id": "1" + }, + { + "id": "2", 
+ "_geo": null + } + ], + "query": "", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "###); +} + +#[actix_rt::test] +async fn update_documents_with_geo_field() { + let server = Server::new().await; + let index = server.index("doggo"); + index.update_settings(json!({"sortableAttributes": ["_geo"]})).await; + + let documents = json!([ + { + "id": "1", + }, + { + "id": "2", + "_geo": null, + }, + { + "id": "3", + "_geo": { "lat": 1, "lng": 1 }, + }, + { + "id": "4", + "_geo": { "lat": "1", "lng": "1" }, + }, + ]); + + let (task, _status_code) = index.add_documents(documents, None).await; + let response = index.wait_task(task.uid()).await; + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 1, + "batchUid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) + .await; + snapshot!(code, @"200 OK"); + // we are expecting docs 4 and 3 first as they have geo + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), + @r###" + { + "hits": [ + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + }, + "_geoDistance": 5522018 + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + }, + "_geoDistance": 5522018 + }, + { + "id": "1" + }, + { + "id": "2", + "_geo": null + } + ], + "query": "", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "###); + + let updated_documents = json!([{ + "id": "3", + "doggo": "kefir", + }]); + let (task, _status_code) = index.update_documents(updated_documents, None).await; + let response = index.wait_task(task.uid()).await; + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 2, + "batchUid": 2, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; + + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "results": [ + { + "id": "1" + }, + { + "id": "2", + "_geo": null + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + }, + "doggo": "kefir" + }, + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) + .await; + snapshot!(code, @"200 OK"); + // the search response should not have changed: we are expecting docs 4 and 3 first as they have geo + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), + @r###" + { + "hits": [ + { + "id": "4", + "_geo": { + 
"lat": "1", + "lng": "1" + }, + "_geoDistance": 5522018 + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + }, + "doggo": "kefir", + "_geoDistance": 5522018 + }, + { + "id": "1" + }, + { + "id": "2", + "_geo": null + } + ], + "query": "", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "###); } #[actix_rt::test] diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 918343f94..62cc51f29 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -161,6 +161,8 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 4, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 3, "id": 4 @@ -208,6 +210,8 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 2, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 1, "id": 2 @@ -274,6 +278,8 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 1, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 1, "id": 1 diff --git a/crates/meilisearch/tests/dumps/data.rs b/crates/meilisearch/tests/dumps/data.rs index d353aaf1d..cb46aa41f 100644 --- a/crates/meilisearch/tests/dumps/data.rs +++ b/crates/meilisearch/tests/dumps/data.rs @@ -22,6 +22,7 @@ pub enum GetDump { TestV5, TestV6WithExperimental, + TestV6WithBatchesAndEnqueuedTasks, } impl GetDump { @@ -74,6 +75,10 @@ impl GetDump { "tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump" ) .into(), + GetDump::TestV6WithBatchesAndEnqueuedTasks => { + exist_relative_path!("tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump") + .into() + } } } } diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index a2b008fe3..1b07afdfd 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -27,9 +27,24 @@ async fn import_dump_v1_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -173,6 +188,8 @@ async fn import_dump_v1_movie_with_settings() { { "numberOfDocuments": 53, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "genres": 53, "id": 53, @@ -333,9 +350,24 @@ async fn import_dump_v1_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, 
+ "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -483,9 +515,24 @@ async fn import_dump_v2_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -623,9 +670,24 @@ async fn import_dump_v2_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -773,9 +835,24 @@ async fn import_dump_v2_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -920,9 +997,24 @@ async fn import_dump_v3_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1060,9 +1152,24 @@ async fn import_dump_v3_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1210,9 +1317,24 @@ async fn 
import_dump_v3_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1357,9 +1479,24 @@ async fn import_dump_v4_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1497,9 +1634,24 @@ async fn import_dump_v4_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1647,9 +1799,24 @@ async fn import_dump_v4_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1798,33 +1965,35 @@ async fn import_dump_v5() { server.wait_task(task["uid"].as_u64().unwrap()).await; } - let expected_stats = json!({ - "numberOfDocuments": 10, - "isIndexing": false, - "fieldDistribution": { - "cast": 10, - "director": 10, - "genres": 10, - "id": 10, - "overview": 10, - "popularity": 10, - "poster_path": 10, - "producer": 10, - "production_companies": 10, - "release_date": 10, - "tagline": 10, - "title": 10, - "vote_average": 10, - "vote_count": 10 - } - }); - let index1 = server.index("test"); let index2 = server.index("test2"); let (stats, code) = index1.stats().await; snapshot!(code, @"200 OK"); - assert_eq!(stats, expected_stats); + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 10, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { 
+ "cast": 10, + "director": 10, + "genres": 10, + "id": 10, + "overview": 10, + "popularity": 10, + "poster_path": 10, + "producer": 10, + "production_companies": 10, + "release_date": 10, + "tagline": 10, + "title": 10, + "vote_average": 10, + "vote_count": 10 + } + } + "###); let (docs, code) = index2.get_all_documents(GetAllDocumentsOptions::default()).await; snapshot!(code, @"200 OK"); @@ -1835,7 +2004,32 @@ async fn import_dump_v5() { let (stats, code) = index2.stats().await; snapshot!(code, @"200 OK"); - assert_eq!(stats, expected_stats); + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 10, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "cast": 10, + "director": 10, + "genres": 10, + "id": 10, + "overview": 10, + "popularity": 10, + "poster_path": 10, + "producer": 10, + "production_companies": 10, + "release_date": 10, + "tagline": 10, + "title": 10, + "vote_average": 10, + "vote_count": 10 + } + } + "###); let (keys, code) = server.list_api_keys("").await; snapshot!(code, @"200 OK"); @@ -1908,7 +2102,9 @@ async fn import_dump_v6_containing_experimental_features() { "metrics": false, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false } "###); @@ -1992,6 +2188,63 @@ async fn import_dump_v6_containing_experimental_features() { .await; } +#[actix_rt::test] +async fn import_dump_v6_containing_batches_and_enqueued_tasks() { + let temp = tempfile::tempdir().unwrap(); + + let options = Opt { + import_dump: Some(GetDump::TestV6WithBatchesAndEnqueuedTasks.path()), + ..default_settings(temp.path()) + }; + let mut server = Server::new_auth_with_options(options, temp).await; + server.use_api_key("MASTER_KEY"); + server.wait_task(2).await.succeeded(); + let (tasks, _) = server.tasks().await; + snapshot!(json_string!(tasks, { ".results[1].startedAt" => "[date]", ".results[1].finishedAt" => "[date]", ".results[1].duration" => "[date]" }), name: "tasks"); + let (batches, _) = server.batches().await; + snapshot!(json_string!(batches, { ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].duration" => "[date]" }), name: "batches"); + + let (indexes, code) = server.list_indexes(None, None).await; + assert_eq!(code, 200, "{indexes}"); + + assert_eq!(indexes["results"].as_array().unwrap().len(), 1); + assert_eq!(indexes["results"][0]["uid"], json!("kefir")); + assert_eq!(indexes["results"][0]["primaryKey"], json!("id")); + + let (response, code) = server.get_features().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "metrics": false, + "logsRoute": false, + "editDocumentsByFunction": false, + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false + } + "###); + + let index = server.index("kefir"); + let (documents, _) = index.get_all_documents_raw("").await; + snapshot!(documents, @r#" + { + "results": [ + { + "id": 1, + "dog": "kefir" + }, + { + "id": 2, + "dog": "intel" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "#); +} + // In this test we must generate the dump ourselves to ensure the // `user provided` vectors are well set #[actix_rt::test] @@ -2069,7 +2322,9 @@ async fn generate_and_import_dump_containing_vectors() { "metrics": false, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + 
"getTaskDocumentsRoute": false } "###); diff --git a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap new file mode 100644 index 000000000..aeac6cf55 --- /dev/null +++ b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap @@ -0,0 +1,78 @@ +--- +source: crates/meilisearch/tests/dumps/mod.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 2, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 1, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.144827890S", + "startedAt": "2025-02-04T10:15:21.275640274Z", + "finishedAt": "2025-02-04T10:15:21.420468164Z" + }, + { + "uid": 0, + "progress": null, + "details": {}, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexCreation": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.032902186S", + "startedAt": "2025-02-04T10:14:43.559526162Z", + "finishedAt": "2025-02-04T10:14:43.592428348Z" + } + ], + "total": 3, + "limit": 20, + "from": 2, + "next": null +} diff --git a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/tasks.snap b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/tasks.snap new file mode 100644 index 000000000..99dc06f24 --- /dev/null +++ b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/tasks.snap @@ -0,0 +1,78 @@ +--- +source: crates/meilisearch/tests/dumps/mod.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 3, + "batchUid": null, + "indexUid": null, + "status": "succeeded", + "type": "dumpCreation", + "canceledBy": null, + "details": { + "dumpUid": null + }, + "error": null, + "duration": "PT0.000629059S", + "enqueuedAt": "2025-02-04T10:22:31.318175268Z", + "startedAt": "2025-02-04T10:22:31.331701375Z", + "finishedAt": "2025-02-04T10:22:31.332330434Z" + }, + { + "uid": 2, + "batchUid": 2, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[date]", + "enqueuedAt": "2025-02-04T10:15:49.212484063Z", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 1, + "batchUid": null, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.144827890S", + "enqueuedAt": "2025-02-04T10:15:21.258630973Z", + "startedAt": "2025-02-04T10:15:21.275640274Z", + "finishedAt": "2025-02-04T10:15:21.420468164Z" + }, + { + "uid": 0, + "batchUid": null, + "indexUid": "kefir", + "status": "succeeded", + "type": "indexCreation", + "canceledBy": null, + "details": { + 
"primaryKey": null + }, + "error": null, + "duration": "PT0.032902186S", + "enqueuedAt": "2025-02-04T10:14:43.550379968Z", + "startedAt": "2025-02-04T10:14:43.559526162Z", + "finishedAt": "2025-02-04T10:14:43.592428348Z" + } + ], + "total": 4, + "limit": 20, + "from": 3, + "next": null +} diff --git a/crates/meilisearch/tests/features/mod.rs b/crates/meilisearch/tests/features/mod.rs index 8e1ac921d..d417efa4c 100644 --- a/crates/meilisearch/tests/features/mod.rs +++ b/crates/meilisearch/tests/features/mod.rs @@ -21,7 +21,9 @@ async fn experimental_features() { "metrics": false, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false } "###); @@ -33,7 +35,9 @@ async fn experimental_features() { "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false } "###); @@ -45,7 +49,9 @@ async fn experimental_features() { "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false } "###); @@ -58,7 +64,9 @@ async fn experimental_features() { "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false } "###); @@ -71,7 +79,9 @@ async fn experimental_features() { "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false } "###); } @@ -91,7 +101,9 @@ async fn experimental_feature_metrics() { "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false } "###); @@ -146,7 +158,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`", + "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" diff --git a/crates/meilisearch/tests/integration.rs b/crates/meilisearch/tests/integration.rs index 7c3b8affe..927eb4617 100644 --- a/crates/meilisearch/tests/integration.rs +++ b/crates/meilisearch/tests/integration.rs @@ -7,6 +7,7 @@ mod dumps; mod features; mod index; mod logs; +mod network; mod search; mod settings; mod similar; diff --git a/crates/meilisearch/tests/network/mod.rs b/crates/meilisearch/tests/network/mod.rs new file mode 100644 index 000000000..1c3661a06 --- /dev/null +++ b/crates/meilisearch/tests/network/mod.rs @@ -0,0 +1,606 @@ +use serde_json::Value::Null; + +use crate::common::Server; +use crate::json; + +#[actix_rt::test] +async fn error_network_not_enabled() { + let server = Server::new().await; + + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Using the /network route requires enabling the `network` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/805", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + let (response, code) = server.set_network(json!({"self": "myself"})).await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Using the /network route requires enabling the `network` experimental feature. See https://github.com/orgs/meilisearch/discussions/805", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); +} + +#[actix_rt::test] +async fn errors_on_param() { + let server = Server::new().await; + + let (response, code) = server.set_features(json!({"network": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["network"]), @r#"true"#); + + // non-existing param + let (response, code) = server.set_network(json!({"selfie": "myself"})).await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Unknown field `selfie`: expected one of `remotes`, `self`", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + } + "###); + + // self not a string + let (response, code) = server.set_network(json!({"self": 42})).await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Invalid value type at `.self`: expected a string, but found a positive integer: `42`", + "code": "invalid_network_self", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_self" + } + "###); + + // remotes not an object + let (response, code) = server.set_network(json!({"remotes": 42})).await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Invalid value type at `.remotes`: expected an object, but found a positive integer: `42`", + "code": "invalid_network_remotes", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_remotes" + } + "###); + + // new remote without url + let (response, code) = server + .set_network(json!({"remotes": { + "new": { + "searchApiKey": "http://localhost:7700" + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Missing field `.remotes.new.url`", + "code": "missing_network_url", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#missing_network_url" + } + "###); + + // remote with url not a string + let (response, code) = server + .set_network(json!({"remotes": { + "new": { + "url": 7700 + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Invalid value type at `.remotes.new.url`: expected a string, but found a positive integer: `7700`", + "code": "invalid_network_url", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_url" + } + "###); + + // remote with non-existing param + let (response, code) = server + .set_network(json!({"remotes": { + "new": { + "url": "http://localhost:7700", + "doggo": 
"Intel the Beagle" + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Unknown field `doggo` inside `.remotes.new`: expected one of `url`, `searchApiKey`", + "code": "invalid_network_remotes", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_remotes" + } + "###); + + // remote with non-string searchApiKey + let (response, code) = server + .set_network(json!({"remotes": { + "new": { + "url": "http://localhost:7700", + "searchApiKey": 1204664602099962445u64, + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Invalid value type at `.remotes.new.searchApiKey`: expected a string, but found a positive integer: `1204664602099962445`", + "code": "invalid_network_search_api_key", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_search_api_key" + } + "###); + + // setting `null` on URL a posteriori + let (response, code) = server + .set_network(json!({"remotes": { + "kefir": { + "url": "http://localhost:7700", + } + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": null, + "remotes": { + "kefir": { + "url": "http://localhost:7700", + "searchApiKey": null + } + } + } + "###); + let (response, code) = server + .set_network(json!({"remotes": { + "kefir": { + "url": Null, + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Field `.remotes.kefir.url` cannot be set to `null`", + "code": "invalid_network_url", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_url" + } + "###); +} + +#[actix_rt::test] +async fn auth() { + let mut server = Server::new_auth().await; + server.use_api_key("MASTER_KEY"); + + let (response, code) = server.set_features(json!({"network": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["network"]), @r#"true"#); + + let (get_network_key, code) = server + .add_api_key(json!({ + "actions": ["network.get"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let get_network_key = get_network_key["key"].clone(); + + let (update_network_key, code) = server + .add_api_key(json!({ + "actions": ["network.update"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let update_network_key = update_network_key["key"].clone(); + + let (search_api_key, code) = server + .add_api_key(json!({ + "actions": ["search"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let search_api_key = search_api_key["key"].clone(); + + // try with master key + let (response, code) = server + .set_network(json!({ + "self": "master" + })) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "master", + "remotes": {} + } + "###); + + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" +{ + "self": "master", + 
"remotes": {} +} +"###); + + // try get with get permission + server.use_api_key(get_network_key.as_str().unwrap()); + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" +{ + "self": "master", + "remotes": {} +} +"###); + + // try update with update permission + server.use_api_key(update_network_key.as_str().unwrap()); + + let (response, code) = server + .set_network(json!({ + "self": "api_key" + })) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" +{ + "self": "api_key", + "remotes": {} +} +"###); + + // try with the other's permission + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"403 Forbidden"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "The provided API key is invalid.", + "code": "invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#invalid_api_key" + } + "###); + + server.use_api_key(get_network_key.as_str().unwrap()); + let (response, code) = server + .set_network(json!({ + "self": "get_api_key" + })) + .await; + + meili_snap::snapshot!(code, @"403 Forbidden"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "The provided API key is invalid.", + "code": "invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#invalid_api_key" + } + "###); + // try either with bad permission + server.use_api_key(search_api_key.as_str().unwrap()); + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"403 Forbidden"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "The provided API key is invalid.", + "code": "invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#invalid_api_key" + } + "###); + + let (response, code) = server + .set_network(json!({ + "self": "get_api_key" + })) + .await; + + meili_snap::snapshot!(code, @"403 Forbidden"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "The provided API key is invalid.", + "code": "invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#invalid_api_key" + } + "###); +} + +#[actix_rt::test] +async fn get_and_set_network() { + let server = Server::new().await; + + let (response, code) = server.set_features(json!({"network": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["network"]), @r#"true"#); + + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": null, + "remotes": {} + } + "###); + + // adding self + let (response, code) = server.set_network(json!({"self": "myself"})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": {} + } + "###); + + // adding remotes + let (response, code) = server + .set_network(json!({"remotes": { + "myself": { + "url": "http://localhost:7700" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "foo" + } + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": { + "myself": { + "url": 
"http://localhost:7700", + "searchApiKey": null + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "foo" + } + } + } + "###); + + // partially updating one remote + let (response, code) = server + .set_network(json!({"remotes": { + "thy": { + "searchApiKey": "bar" + } + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": { + "myself": { + "url": "http://localhost:7700", + "searchApiKey": null + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // adding one remote + let (response, code) = server + .set_network(json!({"remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + } + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": { + "myself": { + "url": "http://localhost:7700", + "searchApiKey": null + }, + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // deleting one remote + let (response, code) = server + .set_network(json!({"remotes": { + "myself": Null, + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // removing self + let (response, code) = server.set_network(json!({"self": Null})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": null, + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // setting self again + let (response, code) = server.set_network(json!({"self": "thy"})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // doing nothing + let (response, code) = server.set_network(json!({})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // still doing nothing + let (response, code) = server.set_network(json!({"remotes": {}})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // good time to check GET + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": { + "them": { + "url": "http://localhost:7702", + 
"searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // deleting everything + let (response, code) = server + .set_network(json!({ + "remotes": Null, + })) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": {} + } + "###); +} diff --git a/crates/meilisearch/tests/search/multi.rs b/crates/meilisearch/tests/search/multi/mod.rs similarity index 99% rename from crates/meilisearch/tests/search/multi.rs rename to crates/meilisearch/tests/search/multi/mod.rs index 4fc0aed7f..2a95a5dd2 100644 --- a/crates/meilisearch/tests/search/multi.rs +++ b/crates/meilisearch/tests/search/multi/mod.rs @@ -5,6 +5,8 @@ use crate::common::Server; use crate::json; use crate::search::{SCORE_DOCUMENTS, VECTOR_DOCUMENTS}; +mod proxy; + #[actix_rt::test] async fn search_empty_list() { let server = Server::new().await; diff --git a/crates/meilisearch/tests/search/multi/proxy.rs b/crates/meilisearch/tests/search/multi/proxy.rs new file mode 100644 index 000000000..2c3b31bf1 --- /dev/null +++ b/crates/meilisearch/tests/search/multi/proxy.rs @@ -0,0 +1,2591 @@ +use std::sync::Arc; + +use actix_http::StatusCode; +use meili_snap::{json_string, snapshot}; +use wiremock::matchers::AnyMatcher; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use crate::common::{Server, Value, SCORE_DOCUMENTS}; +use crate::json; + +#[actix_rt::test] +async fn error_feature() { + let server = Server::new().await; + + let (response, code) = server + .multi_search(json!({ + "federation": {}, + "queries": [ + { + "indexUid": "test", + "federationOptions": { + "remote": "toto" + } + } + ]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Performing a remote federated search requires enabling the `network` experimental feature. See https://github.com/orgs/meilisearch/discussions/805", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + let (response, code) = server + .multi_search(json!({ + "federation": {}, + "queries": [ + { + "indexUid": "test", + "federationOptions": { + "queryPosition": 42, + } + } + ]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Using `federationOptions.queryPosition` requires enabling the `network` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/805", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); +} + +#[actix_rt::test] +async fn error_params() { + let server = Server::new().await; + + let (response, code) = server + .multi_search(json!({ + "federation": {}, + "queries": [ + { + "indexUid": "test", + "federationOptions": { + "remote": 42 + } + } + ]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.queries[0].federationOptions.remote`: expected a string, but found a positive integer: `42`", + "code": "invalid_multi_search_remote", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_multi_search_remote" + } + "###); + + let (response, code) = server + .multi_search(json!({ + "federation": {}, + "queries": [ + { + "indexUid": "test", + "federationOptions": { + "queryPosition": "toto", + } + } + ]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.queries[0].federationOptions.queryPosition`: expected a positive integer, but found a string: `\"toto\"`", + "code": "invalid_multi_search_query_position", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_multi_search_query_position" + } + "###); +} + +#[actix_rt::test] +async fn remote_sharding() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + let ms2 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms2.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + let (response, code) = ms2.set_network(json!({"self": "ms2"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms2", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let index2 = ms2.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index2.add_documents(json!(documents[3..5]), None).await; + index2.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + let ms2 = Arc::new(ms2); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + let rms2 
= LocalMeili::new(ms2.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + "ms2": { + "url": rms2.url() + } + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms2.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms2" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Badman", + "id": "E", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.5, + "remote": "ms2" + } + }, + { + "title": "Batman", + "id": "D", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.23106060606060605, + "remote": "ms2" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 5, + "remoteErrors": {} + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Badman", + "id": "E", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.5, + "remote": "ms2" + } + }, + { + "title": "Batman", + "id": "D", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.23106060606060605, + "remote": "ms2" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 5, + "remoteErrors": {} + } + "###); + let 
(response, _status_code) = ms2.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Badman", + "id": "E", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.5, + "remote": "ms2" + } + }, + { + "title": "Batman", + "id": "D", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.23106060606060605, + "remote": "ms2" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 5, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_unregistered_remote() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + { + "q": query, + "indexUid": "test", + 
"federationOptions": { + "remote": "ms2" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "message": "Invalid `queries[2].federation_options.remote`: remote `ms2` is not registered", + "code": "invalid_multi_search_remote", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_multi_search_remote" + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "message": "Invalid `queries[2].federation_options.remote`: remote `ms2` is not registered", + "code": "invalid_multi_search_remote", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_multi_search_remote" + } + "###); +} + +#[actix_rt::test] +async fn error_no_weighted_score() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { gobble_headers: true, ..Default::default() }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, 
+ "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote hit does not contain `._federation.weightedScoreValues`\n - hint: check that the remote instance is a Meilisearch instance running the same version", + "code": "remote_bad_response", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_bad_response" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_bad_response() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { + override_response_body: Some("Returning an HTML page".into()), + ..Default::default() + }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } 
+ } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "could not parse response from the remote host as a federated search response:\n - response from remote: Returning an HTML page\n - hint: check that the remote instance is a Meilisearch instance running the same version", + "code": "remote_bad_response", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_bad_response" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_bad_request() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "nottest", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 400:\n - response from remote: {\"message\":\"Inside `.queries[1]`: Index `nottest` not 
found.\",\"code\":\"index_not_found\",\"type\":\"invalid_request\",\"link\":\"https://docs.meilisearch.com/errors#index_not_found\"}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", + "code": "remote_bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#remote_bad_request" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_bad_request_facets_by_index() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test0"); + let index1 = ms1.index("test1"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": { + "facetsByIndex": { + "test0": [] + } + }, + "queries": [ + { + "q": query, + "indexUid": "test0", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test1", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test0", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test0", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "facetsByIndex": { + "test0": { + "distribution": {}, + "stats": {} + } + }, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 400:\n - 
response from remote: {\"message\":\"Inside `.federation.facetsByIndex.test0`: Index `test0` not found.\\n - Note: index `test0` is not used in queries\",\"code\":\"index_not_found\",\"type\":\"invalid_request\",\"link\":\"https://docs.meilisearch.com/errors#index_not_found\"}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", + "code": "remote_bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#remote_bad_request" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_bad_request_facets_by_index_facet() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + + let (task, _status_code) = index0.update_settings_filterable_attributes(json!(["id"])).await; + index0.wait_task(task.uid()).await.succeeded(); + + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": { + "facetsByIndex": { + "test": ["id"] + } + }, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 
0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "facetsByIndex": { + "test": { + "distribution": { + "id": { + "A": 1, + "B": 1 + } + }, + "stats": {} + } + }, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 400:\n - response from remote: {\"message\":\"Inside `.federation.facetsByIndex.test`: Invalid facet distribution, this index does not have configured filterable attributes.\\n - Note: index `test` used in `.queries[1]`\",\"code\":\"invalid_multi_search_facets\",\"type\":\"invalid_request\",\"link\":\"https://docs.meilisearch.com/errors#invalid_multi_search_facets\"}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", + "code": "remote_bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#remote_bad_request" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_remote_does_not_answer() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + "ms2": { + "url": "https://thiswebsitedoesnotexist.example" + } + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms2" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 
OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": { + "ms2": { + "message": "error sending request", + "code": "remote_could_not_send_request", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_could_not_send_request" + } + } + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": { + "ms2": { + "message": "error sending request", + "code": "remote_could_not_send_request", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_could_not_send_request" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_remote_404() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = 
Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": format!("{}/this-route-does-not-exists/", rms1.url()) + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 404:\n - response from remote: null\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", + "code": "remote_bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#remote_bad_request" + } + } + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_remote_sharding_auth() { + let ms0 = Server::new().await; + let mut ms1 = Server::new_auth().await; + ms1.use_api_key("MASTER_KEY"); + + let (search_api_key_not_enough_indexes, code) = ms1 + .add_api_key(json!({ + "actions": ["search"], + "indexes": ["nottest"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let search_api_key_not_enough_indexes = search_api_key_not_enough_indexes["key"].clone(); + + let (api_key_not_search, code) = ms1 + .add_api_key(json!({ + 
"actions": ["documents.*"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let api_key_not_search = api_key_not_search["key"].clone(); + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + ms1.clear_api_key(); + + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1-nottest": { + "url": rms1.url(), + "searchApiKey": search_api_key_not_enough_indexes + }, + "ms1-notsearch": { + "url": rms1.url(), + "searchApiKey": api_key_not_search + } + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1-nottest" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1-notsearch" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1-notsearch": { + "message": "could not authenticate against the remote host\n - hint: check that the remote instance was registered with a valid API key having the `search` action", + "code": "remote_invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#remote_invalid_api_key" + }, + 
"ms1-nottest": { + "message": "could not authenticate against the remote host\n - hint: check that the remote instance was registered with a valid API key having the `search` action", + "code": "remote_invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#remote_invalid_api_key" + } + } + } + "###); +} + +#[actix_rt::test] +async fn remote_sharding_auth() { + let ms0 = Server::new().await; + let mut ms1 = Server::new_auth().await; + ms1.use_api_key("MASTER_KEY"); + + let (search_api_key, code) = ms1 + .add_api_key(json!({ + "actions": ["search"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let search_api_key = search_api_key["key"].clone(); + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + ms1.clear_api_key(); + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url(), + "searchApiKey": "MASTER_KEY" + }, + "ms1-alias": { + "url": rms1.url(), + "searchApiKey": search_api_key + } + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1-alias" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.8317901234567902, + "remote": 
"ms1-alias" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_remote_500() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { fails: FailurePolicy::Always, ..Default::default() }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + } + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", 
+ "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 500:\n - response from remote: {\"error\":\"provoked error\",\"code\":\"test_error\",\"link\":\"https://docs.meilisearch.com/errors#test_error\"}", + "code": "remote_remote_error", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_remote_error" + } + } + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + // the response if full because we queried the instance that works + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_remote_500_once() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { fails: FailurePolicy::Once, ..Default::default() }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + 
"queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + } + ] + }); + + // Meilisearch is tolerant to a single failure + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_remote_timeout() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = 
LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { delay: Some(std::time::Duration::from_secs(6)), ..Default::default() }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + } + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote host did not answer before the deadline", + "code": "remote_timeout", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_timeout" + } + } + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); +} + +// test: try all the flattened structs in queries + +// working facet tests with and without merge + +#[derive(Default)] +pub enum FailurePolicy { + #[default] + Never, + Once, + Always, +} + +/// Parameters to change the behavior of the [`LocalMeili`] server. +#[derive(Default)] +pub struct LocalMeiliParams { + /// delay the response by the specified duration + pub delay: Option, + pub fails: FailurePolicy, + /// replace the reponse body with the provided String + pub override_response_body: Option, + pub gobble_headers: bool, +} + +/// A server that exploits [`MockServer`] to provide an URL for testing network and the network. 
+pub struct LocalMeili { + mock_server: MockServer, +} + +impl LocalMeili { + pub async fn new(server: Arc) -> Self { + Self::with_params(server, Default::default()).await + } + + pub async fn with_params(server: Arc, params: LocalMeiliParams) -> Self { + let mock_server = MockServer::start().await; + + // tokio won't let us execute asynchronous code from a sync function inside of an async test, + // so instead we spawn another thread that will call the service on a brand new tokio runtime + // and communicate via channels... + let (request_sender, request_receiver) = crossbeam_channel::bounded::(0); + let (response_sender, response_receiver) = + crossbeam_channel::bounded::<(Value, StatusCode)>(0); + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + while let Ok(req) = request_receiver.recv() { + let body = std::str::from_utf8(&req.body).unwrap(); + let headers: Vec<(&str, &str)> = if params.gobble_headers { + vec![("Content-Type", "application/json")] + } else { + req.headers + .iter() + .map(|(name, value)| (name.as_str(), value.to_str().unwrap())) + .collect() + }; + let (value, code) = rt.block_on(async { + match req.method.as_str() { + "POST" => server.service.post_str(&req.url, body, headers.clone()).await, + "PUT" => server.service.put_str(&req.url, body, headers).await, + "PATCH" => server.service.patch(&req.url, req.body_json().unwrap()).await, + "GET" => server.service.get(&req.url).await, + "DELETE" => server.service.delete(&req.url).await, + _ => unimplemented!(), + } + }); + if response_sender.send((value, code)).is_err() { + break; + } + } + println!("exiting mock thread") + }); + + let failed_already = std::sync::atomic::AtomicBool::new(false); + + Mock::given(AnyMatcher) + .respond_with(move |req: &wiremock::Request| { + if let Some(delay) = params.delay { + std::thread::sleep(delay); + } + match params.fails { + FailurePolicy::Never => {} + FailurePolicy::Once => { + let failed_already = + failed_already.fetch_or(true, std::sync::atomic::Ordering::AcqRel); + if !failed_already { + return fail(params.override_response_body.as_deref()); + } + } + FailurePolicy::Always => return fail(params.override_response_body.as_deref()), + } + request_sender.send(req.clone()).unwrap(); + let (value, code) = response_receiver.recv().unwrap(); + let response = ResponseTemplate::new(code.as_u16()); + if let Some(override_response_body) = params.override_response_body.as_deref() { + response.set_body_string(override_response_body) + } else { + response.set_body_json(value) + } + }) + .mount(&mock_server) + .await; + Self { mock_server } + } + + pub fn url(&self) -> String { + self.mock_server.uri() + } +} + +fn fail(override_response_body: Option<&str>) -> ResponseTemplate { + let response = ResponseTemplate::new(500); + if let Some(override_response_body) = override_response_body { + response.set_body_string(override_response_body) + } else { + response.set_body_json(json!({"error": "provoked error", "code": "test_error", "link": "https://docs.meilisearch.com/errors#test_error"})) + } +} diff --git a/crates/meilisearch/tests/stats/mod.rs b/crates/meilisearch/tests/stats/mod.rs index 1b4e458d3..bb10d2cd5 100644 --- a/crates/meilisearch/tests/stats/mod.rs +++ b/crates/meilisearch/tests/stats/mod.rs @@ -1,3 +1,4 @@ +use meili_snap::{json_string, snapshot}; use time::format_description::well_known::Rfc3339; use time::OffsetDateTime; @@ -74,3 +75,253 @@ async fn stats() { assert_eq!(response["indexes"]["test"]["fieldDistribution"]["name"], 
1); assert_eq!(response["indexes"]["test"]["fieldDistribution"]["age"], 1); } + +#[actix_rt::test] +async fn add_remove_embeddings() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + // 2 embedded documents for 5 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 5, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 2 embedded documents for 3 embeddings in total + let documents = json!([ + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 3, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 2 embedded documents for 2 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": null, "handcrafted": [0, 0, 0] }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 1 embedded documents for 2 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null, "handcrafted": null }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 1, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); +} + +#[actix_rt::test] +async fn add_remove_embedded_documents() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + 
server.wait_task(response.uid()).await.succeeded(); + + // 2 embedded documents for 5 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 5, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // delete one embedded document, remaining 1 embedded documents for 3 embeddings in total + let (response, code) = index.delete_document(0).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 1, + "isIndexing": false, + "numberOfEmbeddings": 3, + "numberOfEmbeddedDocuments": 1, + "fieldDistribution": { + "id": 1, + "name": 1 + } + } + "###); +} + +#[actix_rt::test] +async fn update_embedder_settings() { + let server = Server::new().await; + let index = server.index("doggo"); + + // 2 embedded documents for 3 embeddings in total + // but no embedders are added in the settings yet so we expect 0 embedded documents for 0 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // add embedders to the settings + // 2 embedded documents for 3 embeddings in total + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 3, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index da809de7f..6aab2861a 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -126,14 +126,17 @@ async fn check_the_index_scheduler(server: &Server) { "#); // And their metadata are still right let (stats, _) = server.stats().await; - snapshot!(stats, @r#" + snapshot!(stats, @r###" { "databaseSize": 438272, + "usedDatabaseSize": 196608, "lastUpdate": "2025-01-23T11:36:22.634859166Z", "indexes": { "kefir": { "numberOfDocuments": 1, "isIndexing": false, + "numberOfEmbeddings": 
0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "age": 1, "description": 1, @@ -144,7 +147,7 @@ async fn check_the_index_scheduler(server: &Server) { } } } - "#); + "###); // Wait until the upgrade has been applied to all indexes to avoid flakyness let (tasks, _) = server.tasks_filter("types=upgradeDatabase&limit=1").await; @@ -205,14 +208,17 @@ async fn check_the_index_scheduler(server: &Server) { snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41"); let (stats, _) = server.stats().await; - snapshot!(stats, @r#" + snapshot!(stats, @r###" { "databaseSize": 438272, + "usedDatabaseSize": 196608, "lastUpdate": "2025-01-23T11:36:22.634859166Z", "indexes": { "kefir": { "numberOfDocuments": 1, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "age": 1, "description": 1, @@ -223,13 +229,15 @@ async fn check_the_index_scheduler(server: &Server) { } } } - "#); + "###); let index = server.index("kefir"); let (stats, _) = index.stats().await; - snapshot!(stats, @r#" + snapshot!(stats, @r###" { "numberOfDocuments": 1, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "age": 1, "description": 1, @@ -238,7 +246,7 @@ async fn check_the_index_scheduler(server: &Server) { "surname": 1 } } - "#); + "###); // Delete all the tasks of a specific batch let (task, _) = server.delete_tasks("batchUids=10").await; diff --git a/crates/meilisearch/tests/vector/settings.rs b/crates/meilisearch/tests/vector/settings.rs index 2aae67ebf..97fa496b4 100644 --- a/crates/meilisearch/tests/vector/settings.rs +++ b/crates/meilisearch/tests/vector/settings.rs @@ -32,7 +32,7 @@ async fn field_unavailable_for_source() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `dimensions`, `distribution`, `url`, `binaryQuantized`", + "message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). 
Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `documentTemplateMaxBytes`, `dimensions`, `distribution`, `url`, `binaryQuantized`",
"code": "invalid_settings_embedders",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs
index c2444fab6..8a8b774b8 100644
--- a/crates/meilitool/src/main.rs
+++ b/crates/meilitool/src/main.rs
@@ -1,19 +1,26 @@
use std::fs::{read_dir, read_to_string, remove_file, File};
-use std::io::BufWriter;
+use std::io::{BufWriter, Write as _};
use std::path::PathBuf;
+use std::time::Instant;
-use anyhow::Context;
-use clap::{Parser, Subcommand};
+use anyhow::{bail, Context};
+use clap::{Parser, Subcommand, ValueEnum};
use dump::{DumpWriter, IndexMetadata};
use file_store::FileStore;
use meilisearch_auth::AuthController;
-use meilisearch_types::heed::types::{SerdeJson, Str};
-use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
+use meilisearch_types::batches::Batch;
+use meilisearch_types::heed::types::{Bytes, SerdeJson, Str};
+use meilisearch_types::heed::{
+    CompactionOption, Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
+};
+use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
+use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
use meilisearch_types::milli::{obkv_to_json, BEU32};
use meilisearch_types::tasks::{Status, Task};
use meilisearch_types::versioning::{get_version, parse_version};
use meilisearch_types::Index;
+use serde_json::Value::Object;
use time::macros::format_description;
use time::OffsetDateTime;
use upgrade::OfflineUpgrade;
@@ -65,6 +72,24 @@ enum Command {
        skip_enqueued_tasks: bool,
    },
+    /// Exports the documents of a Meilisearch index in NDJSON format to stdout.
+    ///
+    /// This command can be executed on a running Meilisearch database. However, please note that
+    /// it will maintain a read-only transaction for the duration of the extraction process.
+    ExportDocuments {
+        /// The index name to export the documents from.
+        #[arg(long)]
+        index_name: String,
+
+        /// Do not export vectors with the documents.
+        #[arg(long)]
+        ignore_vectors: bool,
+
+        /// The number of documents to skip.
+        #[arg(long)]
+        offset: Option,
+    },
+
    /// Attempts to upgrade from one major version to the next without a dump.
    ///
    /// Make sure to run this command when Meilisearch is not running!
@@ -78,6 +103,46 @@ enum Command {
        #[arg(long)]
        target_version: String,
    },
+
+    /// Compacts the index using LMDB.
+    ///
+    /// You must run this command while Meilisearch is off. The reason is that Meilisearch keeps the
+    /// indexes open and this compaction operation writes into another file. Meilisearch will not
+    /// switch to the new file.
+    ///
+    /// **Another possibility** is to keep Meilisearch running to serve search requests, run the
+    /// compaction and once done, close and immediately reopen Meilisearch. This way Meilisearch
+    /// will reopen the data.mdb file when rebooting and see the newly compacted file, ignoring
+    /// the previous non-compacted data.
+    ///
+    /// Note that the compaction will open the index, copy and compact the index into another file
+    /// **on the same disk as the index** and replace the previous index with the newly compacted
+    /// one. This means that the disk must have enough room for at most two times the index size.
+    ///
+    /// To make sure not to lose any data, this tool takes a mutable transaction on the index
+    /// before running the copy and compaction. This way the current indexation must finish before
+    /// the compaction operation can start. Once the compaction is done, the big index is replaced
+    /// by the compacted one and the mutable transaction is released.
+    CompactIndex { index_name: String },
+
+    /// Uses the hair dryer to make the dedicated pages hot in cache
+    ///
+    /// To make the index faster we must make sure it is hot in the DB cache: that's the curse of
+    /// memory-mapping but also its strength. This command is designed to make a specific part of
+    /// the index hot in cache.
+    HairDryer {
+        #[arg(long, value_delimiter = ',')]
+        index_name: Vec,
+
+        #[arg(long, value_delimiter = ',')]
+        index_part: Vec,
+    },
+}
+
+#[derive(Clone, ValueEnum)]
+enum IndexPart {
+    /// Will make the arroy index hot.
+    Arroy,
}

fn main() -> anyhow::Result<()> {
@@ -90,10 +155,17 @@ fn main() -> anyhow::Result<()> {
        Command::ExportADump { dump_dir, skip_enqueued_tasks } => {
            export_a_dump(db_path, dump_dir, skip_enqueued_tasks, detected_version)
        }
+        Command::ExportDocuments { index_name, ignore_vectors, offset } => {
+            export_documents(db_path, index_name, ignore_vectors, offset)
+        }
        Command::OfflineUpgrade { target_version } => {
            let target_version = parse_version(&target_version).context("While parsing `--target-version`. Make sure `--target-version` is in the format MAJOR.MINOR.PATCH")?;
            OfflineUpgrade { db_path, current_version: detected_version, target_version }.upgrade()
        }
+        Command::CompactIndex { index_name } => compact_index(db_path, &index_name),
+        Command::HairDryer { index_name, index_part } => {
+            hair_dryer(db_path, &index_name, &index_part)
+        }
    }
}
@@ -230,70 +302,86 @@ fn export_a_dump(
    eprintln!("Successfully dumped {count} keys!");
+    eprintln!("Dumping the queue");
    let rtxn = env.read_txn()?;
    let all_tasks: Database> = try_opening_database(&env, &rtxn, "all-tasks")?;
+    let all_batches: Database> =
+        try_opening_database(&env, &rtxn, "all-batches")?;
    let index_mapping: Database = try_opening_database(&env, &rtxn, "index-mapping")?;
-    if skip_enqueued_tasks {
-        eprintln!("Skip dumping the enqueued tasks...");
-    } else {
-        let mut dump_tasks = dump.create_tasks_queue()?;
-        let mut count = 0;
-        for ret in all_tasks.iter(&rtxn)? {
-            let (_, t) = ret?;
-            let status = t.status;
-            let content_file = t.content_uuid();
+    eprintln!("Dumping the tasks");
+    let mut dump_tasks = dump.create_tasks_queue()?;
+    let mut count_tasks = 0;
+    let mut count_enqueued_tasks = 0;
+    for ret in all_tasks.iter(&rtxn)? {
+        let (_, t) = ret?;
+        let status = t.status;
+        let content_file = t.content_uuid();
-            let mut dump_content_file = dump_tasks.push_task(&t.into())?;
+        if status == Status::Enqueued && skip_enqueued_tasks {
+            continue;
+        }
-            // 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
- if let Some(content_file_uuid) = content_file { - if status == Status::Enqueued { - let content_file = file_store.get_update(content_file_uuid)?; + let mut dump_content_file = dump_tasks.push_task(&t.into())?; - if (detected_version.0, detected_version.1, detected_version.2) < (1, 12, 0) { - eprintln!("Dumping the enqueued tasks reading them in obkv format..."); - let reader = - DocumentsBatchReader::from_reader(content_file).with_context(|| { - format!("While reading content file {:?}", content_file_uuid) - })?; - let (mut cursor, documents_batch_index) = - reader.into_cursor_and_fields_index(); - while let Some(doc) = cursor.next_document().with_context(|| { - format!("While iterating on content file {:?}", content_file_uuid) - })? { - dump_content_file - .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; - } - } else { - eprintln!( - "Dumping the enqueued tasks reading them in JSON stream format..." - ); - for document in - serde_json::de::Deserializer::from_reader(content_file).into_iter() - { - let document = document.with_context(|| { - format!("While reading content file {:?}", content_file_uuid) - })?; - dump_content_file.push_document(&document)?; - } + // 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. + if let Some(content_file_uuid) = content_file { + if status == Status::Enqueued { + let content_file = file_store.get_update(content_file_uuid)?; + + if (detected_version.0, detected_version.1, detected_version.2) < (1, 12, 0) { + eprintln!("Dumping the enqueued tasks reading them in obkv format..."); + let reader = + DocumentsBatchReader::from_reader(content_file).with_context(|| { + format!("While reading content file {:?}", content_file_uuid) + })?; + let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); + while let Some(doc) = cursor.next_document().with_context(|| { + format!("While iterating on content file {:?}", content_file_uuid) + })? { + dump_content_file + .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; + } + } else { + eprintln!("Dumping the enqueued tasks reading them in JSON stream format..."); + for document in + serde_json::de::Deserializer::from_reader(content_file).into_iter() + { + let document = document.with_context(|| { + format!("While reading content file {:?}", content_file_uuid) + })?; + dump_content_file.push_document(&document)?; } - - dump_content_file.flush()?; - count += 1; } + + dump_content_file.flush()?; + count_enqueued_tasks += 1; } } - dump_tasks.flush()?; - - eprintln!("Successfully dumped {count} enqueued tasks!"); + count_tasks += 1; } + dump_tasks.flush()?; + eprintln!( + "Successfully dumped {count_tasks} tasks including {count_enqueued_tasks} enqueued tasks!" + ); + // 4. dump the batches + eprintln!("Dumping the batches"); + let mut dump_batches = dump.create_batches_queue()?; + let mut count = 0; + + for ret in all_batches.iter(&rtxn)? { + let (_, b) = ret?; + dump_batches.push_batch(&b)?; + count += 1; + } + dump_batches.flush()?; + eprintln!("Successfully dumped {count} batches!"); + + // 5. Dump the indexes eprintln!("Dumping the indexes..."); - - // 4. Dump the indexes let mut count = 0; for result in index_mapping.iter(&rtxn)? { let (uid, uuid) = result?; @@ -314,14 +402,14 @@ fn export_a_dump( let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - // 4.1. Dump the documents + // 5.1. Dump the documents for ret in index.all_documents(&rtxn)? 
{ let (_id, doc) = ret?; let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?; index_dumper.push_document(&document)?; } - // 4.2. Dump the settings + // 5.2. Dump the settings let settings = meilisearch_types::settings::settings( &index, &rtxn, @@ -347,3 +435,241 @@ fn export_a_dump( Ok(()) } + +fn compact_index(db_path: PathBuf, index_name: &str) -> anyhow::Result<()> { + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let rtxn = env.read_txn()?; + let index_mapping: Database = + try_opening_database(&env, &rtxn, "index-mapping")?; + + for result in index_mapping.iter(&rtxn)? { + let (uid, uuid) = result?; + + if uid != index_name { + eprintln!("Found index {uid} and skipping it"); + continue; + } else { + eprintln!("Found index {uid} 🎉"); + } + + let index_path = db_path.join("indexes").join(uuid.to_string()); + let index = Index::new(EnvOpenOptions::new(), &index_path, false).with_context(|| { + format!("While trying to open the index at path {:?}", index_path.display()) + })?; + + eprintln!("Awaiting for a mutable transaction..."); + let _wtxn = index.write_txn().context("While awaiting for a write transaction")?; + + // We create and immediately drop the file because the + let non_compacted_index_file_path = index_path.join("data.mdb"); + let compacted_index_file_path = index_path.join("data.mdb.cpy"); + + eprintln!("Compacting the index..."); + let before_compaction = Instant::now(); + let new_file = index + .copy_to_file(&compacted_index_file_path, CompactionOption::Enabled) + .with_context(|| format!("While compacting {}", compacted_index_file_path.display()))?; + + let after_size = new_file.metadata()?.len(); + let before_size = std::fs::metadata(&non_compacted_index_file_path) + .with_context(|| { + format!( + "While retrieving the metadata of {}", + non_compacted_index_file_path.display(), + ) + })? + .len(); + + let reduction = before_size as f64 / after_size as f64; + println!("Compaction successful. Took around {:.2?}", before_compaction.elapsed()); + eprintln!("The index went from {before_size} bytes to {after_size} bytes ({reduction:.2}x reduction)"); + + eprintln!("Replacing the non-compacted index by the compacted one..."); + std::fs::rename(&compacted_index_file_path, &non_compacted_index_file_path).with_context( + || { + format!( + "While renaming {} into {}", + compacted_index_file_path.display(), + non_compacted_index_file_path.display(), + ) + }, + )?; + + drop(new_file); + + println!("Everything's done 🎉"); + return Ok(()); + } + + bail!("Target index {index_name} not found!") +} + +fn export_documents( + db_path: PathBuf, + index_name: String, + ignore_vectors: bool, + offset: Option, +) -> anyhow::Result<()> { + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let rtxn = env.read_txn()?; + let index_mapping: Database = + try_opening_database(&env, &rtxn, "index-mapping")?; + + for result in index_mapping.iter(&rtxn)? 
{ + let (uid, uuid) = result?; + if uid == index_name { + let index_path = db_path.join("indexes").join(uuid.to_string()); + let index = + Index::new(EnvOpenOptions::new(), &index_path, false).with_context(|| { + format!("While trying to open the index at path {:?}", index_path.display()) + })?; + + let rtxn = index.read_txn()?; + let fields_ids_map = index.fields_ids_map(&rtxn)?; + let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(&rtxn)?; + + if let Some(offset) = offset { + eprintln!("Skipping {offset} documents"); + } + + let mut stdout = BufWriter::new(std::io::stdout()); + let all_documents = index.documents_ids(&rtxn)?.into_iter().skip(offset.unwrap_or(0)); + for (i, ret) in index.iter_documents(&rtxn, all_documents)?.enumerate() { + let (id, doc) = ret?; + let mut document = obkv_to_json(&all_fields, &fields_ids_map, doc)?; + + if i % 10_000 == 0 { + eprintln!("Starting the {}th document", i + offset.unwrap_or(0)); + } + + if !ignore_vectors { + 'inject_vectors: { + let embeddings = index.embeddings(&rtxn, id)?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(Object(Default::default())); + + let Object(vectors) = vectors else { + return Err(meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&rtxn, std::iter::once(id)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={id}") + } + }, + value: vectors.clone(), + }, + ) + .into()); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(id)); + + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, + }; + vectors + .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); + } + } + } + + serde_json::to_writer(&mut stdout, &document)?; + } + + stdout.flush()?; + } else { + eprintln!("Found index {uid} but it's not the right index..."); + } + } + + Ok(()) +} + +fn hair_dryer( + db_path: PathBuf, + index_names: &[String], + index_parts: &[IndexPart], +) -> anyhow::Result<()> { + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + eprintln!("Trying to get a read transaction on the index scheduler..."); + + let rtxn = env.read_txn()?; + let index_mapping: Database = + try_opening_database(&env, &rtxn, "index-mapping")?; + + for result in index_mapping.iter(&rtxn)? 
{ + let (uid, uuid) = result?; + if index_names.iter().any(|i| i == uid) { + let index_path = db_path.join("indexes").join(uuid.to_string()); + let index = + Index::new(EnvOpenOptions::new(), &index_path, false).with_context(|| { + format!("While trying to open the index at path {:?}", index_path.display()) + })?; + + eprintln!("Trying to get a read transaction on the {uid} index..."); + + let rtxn = index.read_txn()?; + for part in index_parts { + match part { + IndexPart::Arroy => { + let mut count = 0; + let total = index.vector_arroy.len(&rtxn)?; + eprintln!("Hair drying arroy for {uid}..."); + for (i, result) in index + .vector_arroy + .remap_types::() + .iter(&rtxn)? + .enumerate() + { + let (key, value) = result?; + + // All of this just to avoid compiler optimizations 🤞 + // We must read all the bytes to make the pages hot in cache. + // + count += std::hint::black_box(key.iter().fold(0, |acc, _| acc + 1)); + count += std::hint::black_box(value.iter().fold(0, |acc, _| acc + 1)); + + if i % 10_000 == 0 { + let perc = (i as f64) / (total as f64) * 100.0; + eprintln!("Visited {i}/{total} ({perc:.2}%) keys") + } + } + eprintln!("Done hair drying a total of at least {count} bytes."); + } + } + } + } else { + eprintln!("Found index {uid} but it's not the right index..."); + } + } + + Ok(()) +} diff --git a/crates/milli/README.md b/crates/milli/README.md index 8f04d04dc..101da0684 100644 --- a/crates/milli/README.md +++ b/crates/milli/README.md @@ -1,5 +1,5 @@

- the milli logo + the milli logo

a concurrent indexer combined with fast and relevant search algorithms

diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 6c7534553..df1baed3c 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec; use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec}; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; -use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig}; +use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, @@ -1731,6 +1731,18 @@ impl Index { let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) } + + pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result { + let mut stats = ArroyStats::default(); + let embedding_configs = self.embedding_configs(rtxn)?; + for config in embedding_configs { + let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); + let reader = + ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); + reader.aggregate_stats(rtxn, &mut stats)?; + } + Ok(stats) + } } #[derive(Debug, Deserialize, Serialize)] diff --git a/crates/milli/src/score_details.rs b/crates/milli/src/score_details.rs index 1efa3b8e6..940e5f395 100644 --- a/crates/milli/src/score_details.rs +++ b/crates/milli/src/score_details.rs @@ -1,7 +1,7 @@ use std::cmp::Ordering; use itertools::Itertools; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use crate::distance_between_two_points; @@ -36,6 +36,15 @@ enum RankOrValue<'a> { Score(f64), } +#[derive(Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum WeightedScoreValue { + WeightedScore(f64), + Sort { asc: bool, value: serde_json::Value }, + GeoSort { asc: bool, distance: Option }, + VectorSort(f64), +} + impl ScoreDetails { pub fn local_score(&self) -> Option { self.rank().map(Rank::local_score) @@ -87,6 +96,30 @@ impl ScoreDetails { }) } + pub fn weighted_score_values<'a>( + details: impl Iterator + 'a, + weight: f64, + ) -> impl Iterator + 'a { + details + .map(ScoreDetails::rank_or_value) + .coalesce(|left, right| match (left, right) { + (RankOrValue::Rank(left), RankOrValue::Rank(right)) => { + Ok(RankOrValue::Rank(Rank::merge(left, right))) + } + (left, right) => Err((left, right)), + }) + .map(move |rank_or_value| match rank_or_value { + RankOrValue::Rank(r) => WeightedScoreValue::WeightedScore(r.local_score() * weight), + RankOrValue::Sort(s) => { + WeightedScoreValue::Sort { asc: s.ascending, value: s.value.clone() } + } + RankOrValue::GeoSort(g) => { + WeightedScoreValue::GeoSort { asc: g.ascending, distance: g.distance() } + } + RankOrValue::Score(s) => WeightedScoreValue::VectorSort(s * weight), + }) + } + fn rank_or_value(&self) -> RankOrValue<'_> { match self { ScoreDetails::Words(w) => RankOrValue::Rank(w.rank()), @@ -423,34 +456,58 @@ pub struct Sort { pub value: serde_json::Value, } +pub fn compare_sort_values( + ascending: bool, + left: &serde_json::Value, + right: &serde_json::Value, +) -> Ordering { + use serde_json::Value::*; + match (left, right) { + (Null, Null) => Ordering::Equal, + (Null, _) => Ordering::Less, + (_, Null) => Ordering::Greater, + // numbers are always before strings + (Number(_), String(_)) => Ordering::Greater, + (String(_), 
Number(_)) => Ordering::Less, + (Number(left), Number(right)) => { + // FIXME: unwrap permitted here? + let order = left + .as_f64() + .unwrap() + .partial_cmp(&right.as_f64().unwrap()) + .unwrap_or(Ordering::Equal); + // 12 < 42, and when ascending, we want to see 12 first, so the smallest. + // Hence, when ascending, smaller is better + if ascending { + order.reverse() + } else { + order + } + } + (String(left), String(right)) => { + let order = left.cmp(right); + // Taking e.g. "a" and "z" + // "a" < "z", and when ascending, we want to see "a" first, so the smallest. + // Hence, when ascending, smaller is better + if ascending { + order.reverse() + } else { + order + } + } + (left, right) => { + tracing::warn!(%left, %right, "sort values that are neither numbers, strings or null, handling as equal"); + Ordering::Equal + } + } +} + impl PartialOrd for Sort { fn partial_cmp(&self, other: &Self) -> Option { if self.ascending != other.ascending { return None; } - match (&self.value, &other.value) { - (serde_json::Value::Null, serde_json::Value::Null) => Some(Ordering::Equal), - (serde_json::Value::Null, _) => Some(Ordering::Less), - (_, serde_json::Value::Null) => Some(Ordering::Greater), - // numbers are always before strings - (serde_json::Value::Number(_), serde_json::Value::String(_)) => Some(Ordering::Greater), - (serde_json::Value::String(_), serde_json::Value::Number(_)) => Some(Ordering::Less), - (serde_json::Value::Number(left), serde_json::Value::Number(right)) => { - // FIXME: unwrap permitted here? - let order = left.as_f64().unwrap().partial_cmp(&right.as_f64().unwrap())?; - // 12 < 42, and when ascending, we want to see 12 first, so the smallest. - // Hence, when ascending, smaller is better - Some(if self.ascending { order.reverse() } else { order }) - } - (serde_json::Value::String(left), serde_json::Value::String(right)) => { - let order = left.cmp(right); - // Taking e.g. "a" and "z" - // "a" < "z", and when ascending, we want to see "a" first, so the smallest. 
- // Hence, when ascending, smaller is better - Some(if self.ascending { order.reverse() } else { order }) - } - _ => None, - } + Some(compare_sort_values(self.ascending, &self.value, &other.value)) } } diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 83d00caf0..7f333d548 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -11,7 +11,7 @@ use either::Either; pub use matching_words::MatchingWords; use matching_words::{MatchType, PartialMatch}; use r#match::{Match, MatchPosition}; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use simple_token_kind::SimpleTokenKind; use utoipa::ToSchema; @@ -101,11 +101,11 @@ impl FormatOptions { } } -#[derive(Serialize, Debug, Clone, PartialEq, Eq, ToSchema)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, ToSchema)] pub struct MatchBounds { pub start: usize, pub length: usize, - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(skip_serializing_if = "Option::is_none", default)] pub indices: Option>, } diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index 49f08b521..b9161b417 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -563,7 +563,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( Ok(()) } -#[tracing::instrument(level = "trace", skip_all, target = "search::universe")] +#[tracing::instrument(level = "debug", skip_all, target = "search::universe")] pub fn filtered_universe( index: &Index, txn: &RoTxn<'_>, diff --git a/crates/milli/src/thread_pool_no_abort.rs b/crates/milli/src/thread_pool_no_abort.rs index 14e5b0491..b57050a63 100644 --- a/crates/milli/src/thread_pool_no_abort.rs +++ b/crates/milli/src/thread_pool_no_abort.rs @@ -1,4 +1,4 @@ -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; use rayon::{ThreadPool, ThreadPoolBuilder}; @@ -9,6 +9,8 @@ use thiserror::Error; #[derive(Debug)] pub struct ThreadPoolNoAbort { thread_pool: ThreadPool, + /// The number of active operations. + active_operations: AtomicUsize, /// Set to true if the thread pool catched a panic. pool_catched_panic: Arc, } @@ -19,7 +21,9 @@ impl ThreadPoolNoAbort { OP: FnOnce() -> R + Send, R: Send, { + self.active_operations.fetch_add(1, Ordering::Relaxed); let output = self.thread_pool.install(op); + self.active_operations.fetch_sub(1, Ordering::Relaxed); // While reseting the pool panic catcher we return an error if we catched one. if self.pool_catched_panic.swap(false, Ordering::SeqCst) { Err(PanicCatched) @@ -31,6 +35,11 @@ impl ThreadPoolNoAbort { pub fn current_num_threads(&self) -> usize { self.thread_pool.current_num_threads() } + + /// The number of active operations. 
+ pub fn active_operations(&self) -> usize { + self.active_operations.load(Ordering::Relaxed) + } } #[derive(Error, Debug)] @@ -64,6 +73,10 @@ impl ThreadPoolNoAbortBuilder { let catched_panic = pool_catched_panic.clone(); move |_result| catched_panic.store(true, Ordering::SeqCst) }); - Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic }) + Ok(ThreadPoolNoAbort { + thread_pool: self.0.build()?, + active_operations: AtomicUsize::new(0), + pool_catched_panic, + }) } } diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7e2229950..4fff31a35 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -5,6 +5,8 @@ use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; use std::ops::Range; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::Arc; use std::time::Duration; use bbqueue::framed::{FrameGrantR, FrameProducer}; @@ -71,12 +73,23 @@ pub fn extractor_writer_bbqueue( consumer }); + let sent_messages_attempts = Arc::new(AtomicUsize::new(0)); + let blocking_sent_messages_attempts = Arc::new(AtomicUsize::new(0)); + let (sender, receiver) = flume::bounded(channel_capacity); - let sender = ExtractorBbqueueSender { sender, producers, max_grant }; + let sender = ExtractorBbqueueSender { + sender, + producers, + max_grant, + sent_messages_attempts: sent_messages_attempts.clone(), + blocking_sent_messages_attempts: blocking_sent_messages_attempts.clone(), + }; let receiver = WriterBbqueueReceiver { receiver, look_at_consumer: (0..consumers.len()).cycle(), consumers, + sent_messages_attempts, + blocking_sent_messages_attempts, }; (sender, receiver) } @@ -92,6 +105,12 @@ pub struct ExtractorBbqueueSender<'a> { /// It will never be able to store more than that as the /// buffer cannot split data into two parts. max_grant: usize, + /// The total number of attempts to send messages + /// over the bbqueue channel. + sent_messages_attempts: Arc, + /// The number of times an attempt to send a + /// messages failed and we had to pause for a bit. + blocking_sent_messages_attempts: Arc, } pub struct WriterBbqueueReceiver<'a> { @@ -104,6 +123,12 @@ pub struct WriterBbqueueReceiver<'a> { look_at_consumer: Cycle>, /// The BBQueue frames to read when waking-up. consumers: Vec>, + /// The total number of attempts to send messages + /// over the bbqueue channel. + sent_messages_attempts: Arc, + /// The number of times an attempt to send a + /// message failed and we had to pause for a bit. + blocking_sent_messages_attempts: Arc, } /// The action to perform on the receiver/writer side. @@ -169,6 +194,16 @@ impl<'a> WriterBbqueueReceiver<'a> { } None } + + /// Returns the total count of attempts to send messages through the BBQueue channel. + pub fn sent_messages_attempts(&self) -> usize { + self.sent_messages_attempts.load(atomic::Ordering::Relaxed) + } + + /// Returns the count of attempts to send messages that had to be paused due to BBQueue being full. + pub fn blocking_sent_messages_attempts(&self) -> usize { + self.blocking_sent_messages_attempts.load(atomic::Ordering::Relaxed) + } } pub struct FrameWithHeader<'a> { @@ -458,10 +493,17 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. 
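// The counter wiring above, reduced to a dependency-free sketch: both halves of the
// channel keep clones of the same Arc<AtomicUsize> values, so counts bumped by the
// extractor threads are visible to the writer side, which exposes them through the
// sent_messages_attempts() and blocking_sent_messages_attempts() getters. The type and
// function names below are illustrative only.
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

struct SenderMetrics {
    attempts: Arc<AtomicUsize>,
    blocked: Arc<AtomicUsize>,
}

struct ReceiverMetrics {
    attempts: Arc<AtomicUsize>,
    blocked: Arc<AtomicUsize>,
}

fn metrics_pair() -> (SenderMetrics, ReceiverMetrics) {
    let attempts = Arc::new(AtomicUsize::new(0));
    let blocked = Arc::new(AtomicUsize::new(0));
    (
        SenderMetrics { attempts: attempts.clone(), blocked: blocked.clone() },
        ReceiverMetrics { attempts, blocked },
    )
}

fn main() {
    let (tx, rx) = metrics_pair();
    // The sending side records every attempt, and additionally records a blocked
    // attempt whenever the queue was full and it had to wait.
    tx.attempts.fetch_add(1, Ordering::Relaxed);
    tx.blocked.fetch_add(1, Ordering::Relaxed);
    // The receiving side can read both counters at any time, e.g. to log congestion.
    assert_eq!(rx.attempts.load(Ordering::Relaxed), 1);
    assert_eq!(rx.blocked.load(Ordering::Relaxed), 1);
}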
- reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { - payload_header.serialize_into(grant); - Ok(()) - })?; + reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + payload_header.serialize_into(grant); + Ok(()) + }, + )?; Ok(()) } @@ -500,20 +542,28 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { - let header_size = payload_header.header_size(); - let (header_bytes, remaining) = grant.split_at_mut(header_size); - payload_header.serialize_into(header_bytes); + reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); - if dimensions != 0 { - let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); - for (embedding, output) in embeddings.iter().zip(output_iter) { - output.copy_from_slice(bytemuck::cast_slice(embedding)); + if dimensions != 0 { + let output_iter = + remaining.chunks_exact_mut(dimensions * mem::size_of::()); + for (embedding, output) in embeddings.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } } - } - Ok(()) - })?; + Ok(()) + }, + )?; Ok(()) } @@ -571,13 +621,20 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { - let header_size = payload_header.header_size(); - let (header_bytes, remaining) = grant.split_at_mut(header_size); - payload_header.serialize_into(header_bytes); - let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); - key_value_writer(key_buffer, value_buffer) - })?; + reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); + key_value_writer(key_buffer, value_buffer) + }, + )?; Ok(()) } @@ -619,12 +676,19 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. 
- reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { - let header_size = payload_header.header_size(); - let (header_bytes, remaining) = grant.split_at_mut(header_size); - payload_header.serialize_into(header_bytes); - key_writer(remaining) - })?; + reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + key_writer(remaining) + }, + )?; Ok(()) } @@ -637,12 +701,18 @@ fn reserve_and_write_grant( producer: &mut FrameProducer, total_length: usize, sender: &flume::Sender, + sent_messages_attempts: &AtomicUsize, + blocking_sent_messages_attempts: &AtomicUsize, f: F, ) -> crate::Result<()> where F: FnOnce(&mut [u8]) -> crate::Result<()>, { loop { + // An attempt means trying multiple times, + // whether it succeeded or not. + sent_messages_attempts.fetch_add(1, atomic::Ordering::Relaxed); + for _ in 0..10_000 { match producer.grant(total_length) { Ok(mut grant) => { @@ -666,6 +736,10 @@ where return Err(Error::InternalError(InternalError::AbortedIndexation)); } + // We made an attempt to send a message in the + // bbqueue channel but it didn't succeed. + blocking_sent_messages_attempts.fetch_add(1, atomic::Ordering::Relaxed); + // We prefer to yield and allow the writing thread // to do its job, especially beneficial when there // is only one CPU core available. diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 8a71d7295..2de9f384b 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -144,7 +144,7 @@ impl<'doc> Update<'doc> { )?) } - pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> { + pub fn only_changed_fields(&self) -> DocumentFromVersions<'_, 'doc> { DocumentFromVersions::new(&self.new) } @@ -182,7 +182,7 @@ impl<'doc> Update<'doc> { let mut cached_current = None; let mut updated_selected_field_count = 0; - for entry in self.updated().iter_top_level_fields() { + for entry in self.only_changed_fields().iter_top_level_fields() { let (key, updated_value) = entry?; if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { @@ -241,7 +241,7 @@ impl<'doc> Update<'doc> { Ok(has_deleted_fields) } - pub fn updated_vectors( + pub fn only_changed_vectors( &self, doc_alloc: &'doc Bump, embedders: &'doc EmbeddingConfigs, diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 42da7766e..f2af0b229 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -199,7 +199,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { .transpose()?; let updated_geo = update - .updated() + .merged(rtxn, index, db_fields_ids_map)? .geo_field()?
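// Context for the geo change just above: only_changed_fields() (the renamed updated())
// appears to expose only the fields present in the update payload, while merged(..)
// presumably overlays those changes on the stored document. An extractor that needs the
// document's current geo data therefore wants the merged view, because a partial update
// that never mentions _geo would otherwise look like a document without coordinates.
// A simplified illustration with plain JSON maps; the merged helper below is hypothetical:
use serde_json::{json, Map, Value};

fn merged(current: &Map<String, Value>, changes: &Map<String, Value>) -> Map<String, Value> {
    let mut doc = current.clone();
    for (key, value) in changes {
        doc.insert(key.clone(), value.clone());
    }
    doc
}

fn main() {
    let current = json!({ "id": 1, "_geo": { "lat": 48.8, "lng": 2.3 } });
    let changes = json!({ "id": 1, "title": "new title" });
    let current = current.as_object().unwrap().clone();
    let changes = changes.as_object().unwrap().clone();

    // The partial update does not mention _geo at all...
    assert!(changes.get("_geo").is_none());
    // ...but the merged view still exposes the stored coordinates.
    assert!(merged(&current, &changes).get("_geo").is_some());
}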
.map(|geo| extract_geo_coordinates(external_id, geo)) .transpose()?; diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 2a72a1650..b268647c2 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -99,7 +99,8 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { context.db_fields_ids_map, &context.doc_alloc, )?; - let new_vectors = update.updated_vectors(&context.doc_alloc, self.embedders)?; + let new_vectors = + update.only_changed_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { unused_vectors_distribution.append(new_vectors)?; diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 63536c559..53478f029 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -234,7 +234,7 @@ where ); let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors"); let _entered = span.enter(); extract( @@ -247,7 +247,7 @@ where )?; } { - let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); + let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); let _entered = span.enter(); for config in &mut index_embeddings { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index b65750030..890191323 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -1,5 +1,5 @@ use std::sync::atomic::AtomicBool; -use std::sync::RwLock; +use std::sync::{Once, RwLock}; use std::thread::{self, Builder}; use big_s::S; @@ -33,6 +33,8 @@ mod post_processing; mod update_by_function; mod write; +static LOG_MEMORY_METRICS_ONCE: Once = Once::new(); + /// This is the main function of this crate. /// /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. 
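// The hunk below guards a one-shot log line with the LOG_MEMORY_METRICS_ONCE static
// declared above: std::sync::Once runs its closure at most once per process, so the
// allocated-memory metrics are reported a single time even if indexing runs repeatedly.
// A standalone reduction of that pattern:
use std::sync::Once;

static LOG_ONCE: Once = Once::new();

fn run_indexing(run: usize) {
    LOG_ONCE.call_once(|| {
        // Only the very first call ever executes this closure.
        println!("allocated memory metrics (logged once, first run: {run})");
    });
    // ... the actual indexing work would happen here ...
}

fn main() {
    for run in 0..3 {
        run_indexing(run); // the message is printed for run 0 only
    }
}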
@@ -93,6 +95,15 @@ where }, ); + LOG_MEMORY_METRICS_ONCE.call_once(|| { + tracing::debug!( + "Indexation allocated memory metrics - \ + Total BBQueue size: {total_bbbuffer_capacity}, \ + Total extractor memory: {:?}", + grenad_parameters.max_memory, + ); + }); + let (extractor_sender, writer_receiver) = pool .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000)) .unwrap(); @@ -179,13 +190,16 @@ where indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); - build_vectors( - index, - wtxn, - index_embeddings, - &mut arroy_writers, - &indexing_context.must_stop_processing, - )?; + pool.install(|| { + build_vectors( + index, + wtxn, + index_embeddings, + &mut arroy_writers, + &indexing_context.must_stop_processing, + ) + }) + .unwrap()?; post_processing::post_process( indexing_context, diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index d1cc2038c..707599ba3 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -72,11 +72,23 @@ pub(super) fn write_to_db( &mut aligned_embedding, )?; } + write_from_bbqueue(&mut writer_receiver, index, wtxn, arroy_writers, &mut aligned_embedding)?; + + let direct_attempts = writer_receiver.sent_messages_attempts(); + let blocking_attempts = writer_receiver.blocking_sent_messages_attempts(); + let congestion_pct = (blocking_attempts as f64 / direct_attempts as f64) * 100.0; + tracing::debug!( + "Channel congestion metrics - \ + Attempts: {direct_attempts}, \ + Blocked attempts: {blocking_attempts} \ + ({congestion_pct:.1}% congestion)" + ); + Ok(()) } -#[tracing::instrument(level = "trace", skip_all, target = "indexing::vectors")] +#[tracing::instrument(level = "debug", skip_all, target = "indexing::vectors")] pub(super) fn build_vectors( index: &Index, wtxn: &mut RwTxn<'_>, diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index 5b7fda303..16f0eef7a 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -1,7 +1,9 @@ mod v1_12; +mod v1_13; use heed::RwTxn; -use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3}; +use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; +use v1_13::V1_13_0_To_Current; use crate::progress::{Progress, VariableNameStep}; use crate::{Index, InternalError, Result}; @@ -26,11 +28,13 @@ pub fn upgrade( progress: Progress, ) -> Result { let from = index.get_version(wtxn)?.unwrap_or(db_version); - let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()]; + let upgrade_functions: &[&dyn UpgradeIndex] = + &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()]; let start = match from { (1, 12, 0..=2) => 0, (1, 12, 3..) => 1, + (1, 13, 0) => 2, // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. 
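// A simplified model of the dispatch above: the version stored in the index selects a
// starting offset into the ordered list of upgrade steps, and the steps from that offset
// onward are then applied one after another, each declaring the version it upgrades to.
// The driver loop itself sits outside this hunk, so the code below is illustrative only.
fn start_index(from: (u32, u32, u32)) -> Option<usize> {
    match from {
        (1, 12, 0..=2) => Some(0),
        (1, 12, 3..) => Some(1),
        (1, 13, 0) => Some(2),
        (1, 13, _) => None, // already up to date, nothing to run
        _ => None,          // other versions are handled separately in the real code
    }
}

fn main() {
    // Step names are purely descriptive stand-ins for the UpgradeIndex implementations
    // (V1_12_To_V1_12_3, V1_12_3_To_V1_13_0, V1_13_0_To_Current).
    let steps = ["v1.12.x -> v1.12.3", "v1.12.3 -> v1.13.0", "v1.13.0 -> current"];
    if let Some(start) = start_index((1, 12, 5)) {
        for step in &steps[start..] {
            println!("applying upgrade step: {step}");
        }
    }
}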
(1, 13, _) => return Ok(false), (major, minor, patch) => { diff --git a/crates/milli/src/update/upgrade/v1_12.rs b/crates/milli/src/update/upgrade/v1_12.rs index e48ecfe36..f46e7f745 100644 --- a/crates/milli/src/update/upgrade/v1_12.rs +++ b/crates/milli/src/update/upgrade/v1_12.rs @@ -1,11 +1,9 @@ use heed::RwTxn; -use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; +use super::UpgradeIndex; use crate::progress::Progress; use crate::{make_enum_progress, Index, Result}; -use super::UpgradeIndex; - #[allow(non_camel_case_types)] pub(super) struct V1_12_To_V1_12_3 {} @@ -33,9 +31,9 @@ impl UpgradeIndex for V1_12_To_V1_12_3 { } #[allow(non_camel_case_types)] -pub(super) struct V1_12_3_To_Current(); +pub(super) struct V1_12_3_To_V1_13_0 {} -impl UpgradeIndex for V1_12_3_To_Current { +impl UpgradeIndex for V1_12_3_To_V1_13_0 { fn upgrade( &self, _wtxn: &mut RwTxn, @@ -43,14 +41,11 @@ impl UpgradeIndex for V1_12_3_To_Current { _original: (u32, u32, u32), _progress: Progress, ) -> Result { - Ok(false) + // recompute the indexes stats + Ok(true) } fn target_version(&self) -> (u32, u32, u32) { - ( - VERSION_MAJOR.parse().unwrap(), - VERSION_MINOR.parse().unwrap(), - VERSION_PATCH.parse().unwrap(), - ) + (1, 13, 0) } } diff --git a/crates/milli/src/update/upgrade/v1_13.rs b/crates/milli/src/update/upgrade/v1_13.rs new file mode 100644 index 000000000..52246a7f3 --- /dev/null +++ b/crates/milli/src/update/upgrade/v1_13.rs @@ -0,0 +1,29 @@ +use heed::RwTxn; + +use super::UpgradeIndex; +use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; +use crate::progress::Progress; +use crate::{Index, Result}; + +#[allow(non_camel_case_types)] +pub(super) struct V1_13_0_To_Current(); + +impl UpgradeIndex for V1_13_0_To_Current { + fn upgrade( + &self, + _wtxn: &mut RwTxn, + _index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + Ok(false) + } + + fn target_version(&self) -> (u32, u32, u32) { + ( + VERSION_MAJOR.parse().unwrap(), + VERSION_MINOR.parse().unwrap(), + VERSION_PATCH.parse().unwrap(), + ) + } +} diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 0be698027..74b52b1fe 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -410,8 +410,43 @@ impl ArroyWrapper { fn quantized_db(&self) -> arroy::Database { self.database.remap_data_type() } + + pub fn aggregate_stats( + &self, + rtxn: &RoTxn, + stats: &mut ArroyStats, + ) -> Result<(), arroy::Error> { + if self.quantized { + for reader in self.readers(rtxn, self.quantized_db()) { + let reader = reader?; + let documents = reader.item_ids(); + if documents.is_empty() { + break; + } + stats.documents |= documents; + stats.number_of_embeddings += documents.len(); + } + } else { + for reader in self.readers(rtxn, self.angular_db()) { + let reader = reader?; + let documents = reader.item_ids(); + if documents.is_empty() { + break; + } + stats.documents |= documents; + stats.number_of_embeddings += documents.len(); + } + } + + Ok(()) + } } +#[derive(Debug, Default, Clone)] +pub struct ArroyStats { + pub number_of_embeddings: u64, + pub documents: RoaringBitmap, +} /// One or multiple embeddings stored consecutively in a flat vector. 
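// What aggregate_stats above accumulates, shown on plain RoaringBitmaps: the documents
// field ends up as the union of every reader's item ids, while number_of_embeddings sums
// their cardinalities, so a document present in several readers is counted once in the
// bitmap but contributes to the total each time. The Stats struct below mirrors ArroyStats.
use roaring::RoaringBitmap;

#[derive(Default)]
struct Stats {
    number_of_embeddings: u64,
    documents: RoaringBitmap,
}

fn main() {
    // Two hypothetical arroy readers reporting the item ids they store.
    let reader_a: RoaringBitmap = (0u32..3).collect(); // documents 0, 1, 2
    let reader_b: RoaringBitmap = (2u32..5).collect(); // documents 2, 3, 4

    let mut stats = Stats::default();
    for items in [reader_a, reader_b] {
        stats.number_of_embeddings += items.len();
        stats.documents |= items;
    }

    assert_eq!(stats.number_of_embeddings, 6); // 3 + 3 embeddings in total
    assert_eq!(stats.documents.len(), 5); // but only 5 distinct documents
}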
pub struct Embeddings { data: Vec, @@ -611,6 +646,7 @@ impl Embedder { } } + #[tracing::instrument(level = "debug", skip_all, target = "search")] pub fn embed_one( &self, text: String, diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index cc70e2c47..d2a80d6b5 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -5,7 +5,7 @@ use rayon::slice::ParallelSlice as _; use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; -use super::DistributionShift; +use super::{DistributionShift, REQUEST_PARALLELISM}; use crate::error::FaultSource; use crate::vector::Embedding; use crate::ThreadPoolNoAbort; @@ -118,14 +118,20 @@ impl Embedder { text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None)).collect() + } else { + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } } pub(crate) fn embed_chunks_ref( @@ -133,20 +139,32 @@ impl Embedder { texts: &[&str], threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - let embeddings: Result>, _> = texts - .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None)) - .collect(); + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + let embeddings: Result>, _> = texts + .chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed(chunk, None)) + .collect(); - let embeddings = embeddings?; - Ok(embeddings.into_iter().flatten().collect()) - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + } else { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed(chunk, None)) + .collect(); + + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? 
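// The saturation guard used by the Ollama, OpenAI and REST embedders in this patch,
// shown in isolation: when the shared pool already reports at least REQUEST_PARALLELISM
// active operations, the chunks are embedded sequentially on the current thread instead
// of spawning more nested rayon jobs (the patch comments tie this to exhausting LMDB
// rtxns and stack overflows). Pool is a stand-in for ThreadPoolNoAbort, and the constant
// value below is arbitrary for the sketch.
use std::sync::atomic::{AtomicUsize, Ordering};

use rayon::prelude::*;

const REQUEST_PARALLELISM: usize = 10;

struct Pool {
    active_operations: AtomicUsize,
}

impl Pool {
    fn active_operations(&self) -> usize {
        self.active_operations.load(Ordering::Relaxed)
    }
}

// Stand-in for embedding a chunk of texts; returns how many texts it "embedded".
fn embed_chunk(chunk: &[String]) -> usize {
    chunk.len()
}

fn embed_all(pool: &Pool, chunks: &[Vec<String>]) -> Vec<usize> {
    if pool.active_operations() >= REQUEST_PARALLELISM {
        // The pool is saturated: stay sequential on the current thread.
        chunks.iter().map(|chunk| embed_chunk(chunk)).collect()
    } else {
        // Otherwise fan out; in the patch this runs inside ThreadPoolNoAbort::install,
        // which is what increments and decrements the active-operations counter.
        chunks.par_iter().map(|chunk| embed_chunk(chunk)).collect()
    }
}

fn main() {
    let pool = Pool { active_operations: AtomicUsize::new(REQUEST_PARALLELISM) };
    let chunks = vec![vec!["a".to_string(), "b".to_string()], vec!["c".to_string()]];
    assert_eq!(embed_all(&pool, &chunks), vec![2, 1]);
}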
+ } } pub fn chunk_count_hint(&self) -> usize { diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index 938c04fe3..c7aec5d93 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/openai.rs @@ -7,7 +7,7 @@ use rayon::slice::ParallelSlice as _; use super::error::{EmbedError, NewEmbedderError}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; -use super::DistributionShift; +use super::{DistributionShift, REQUEST_PARALLELISM}; use crate::error::FaultSource; use crate::vector::error::EmbedErrorKind; use crate::vector::Embedding; @@ -255,14 +255,20 @@ impl Embedder { text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None)).collect() + } else { + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } } pub(crate) fn embed_chunks_ref( @@ -270,20 +276,31 @@ impl Embedder { texts: &[&str], threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - let embeddings: Result>, _> = texts - .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None)) - .collect(); + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + let embeddings: Result>, _> = texts + .chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed(chunk, None)) + .collect(); + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + } else { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed(chunk, None)) + .collect(); - let embeddings = embeddings?; - Ok(embeddings.into_iter().flatten().collect()) - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? 
+ } } pub fn chunk_count_hint(&self) -> usize { diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index eb05bac64..467169d9c 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -130,6 +130,7 @@ impl Embedder { let client = ureq::AgentBuilder::new() .max_idle_connections(REQUEST_PARALLELISM * 2) .max_idle_connections_per_host(REQUEST_PARALLELISM * 2) + .timeout(std::time::Duration::from_secs(30)) .build(); let request = Request::new(options.request)?; @@ -188,14 +189,20 @@ impl Embedder { text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None)).collect() - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + text_chunks.into_iter().map(move |chunk| self.embed(chunk, None)).collect() + } else { + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } } pub(crate) fn embed_chunks_ref( @@ -203,20 +210,32 @@ impl Embedder { texts: &[&str], threads: &ThreadPoolNoAbort, ) -> Result, EmbedError> { - threads - .install(move || { - let embeddings: Result>, _> = texts - .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed_ref(chunk, None)) - .collect(); + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + let embeddings: Result>, _> = texts + .chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed_ref(chunk, None)) + .collect(); - let embeddings = embeddings?; - Ok(embeddings.into_iter().flatten().collect()) - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + } else { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed_ref(chunk, None)) + .collect(); + + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? 
+ } } pub fn chunk_count_hint(&self) -> usize { diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 4a1b1882c..86028c1c4 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -455,7 +455,7 @@ impl EmbeddingSettings { EmbedderSource::Ollama, EmbedderSource::Rest, ], - Self::DOCUMENT_TEMPLATE => &[ + Self::DOCUMENT_TEMPLATE | Self::DOCUMENT_TEMPLATE_MAX_BYTES => &[ EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::Ollama, @@ -490,6 +490,7 @@ impl EmbeddingSettings { Self::MODEL, Self::API_KEY, Self::DOCUMENT_TEMPLATE, + Self::DOCUMENT_TEMPLATE_MAX_BYTES, Self::DIMENSIONS, Self::DISTRIBUTION, Self::URL, @@ -500,6 +501,7 @@ impl EmbeddingSettings { Self::MODEL, Self::REVISION, Self::DOCUMENT_TEMPLATE, + Self::DOCUMENT_TEMPLATE_MAX_BYTES, Self::DISTRIBUTION, Self::BINARY_QUANTIZED, ], @@ -507,6 +509,7 @@ impl EmbeddingSettings { Self::SOURCE, Self::MODEL, Self::DOCUMENT_TEMPLATE, + Self::DOCUMENT_TEMPLATE_MAX_BYTES, Self::URL, Self::API_KEY, Self::DIMENSIONS, @@ -521,6 +524,7 @@ impl EmbeddingSettings { Self::API_KEY, Self::DIMENSIONS, Self::DOCUMENT_TEMPLATE, + Self::DOCUMENT_TEMPLATE_MAX_BYTES, Self::URL, Self::REQUEST, Self::RESPONSE, diff --git a/workloads/embeddings-movies-subset-hf.json b/workloads/embeddings-movies-subset-hf.json index d7672cf73..4f6c5be35 100644 --- a/workloads/embeddings-movies-subset-hf.json +++ b/workloads/embeddings-movies-subset-hf.json @@ -12,16 +12,6 @@ } }, "precommands": [ - { - "route": "experimental-features", - "method": "PATCH", - "body": { - "inline": { - "vectorStore": true - } - }, - "synchronous": "DontWait" - }, { "route": "indexes/movies/settings", "method": "PATCH", diff --git a/workloads/embeddings-settings-add.json b/workloads/embeddings-settings-add.json index 6ad50769a..67f9709db 100644 --- a/workloads/embeddings-settings-add.json +++ b/workloads/embeddings-settings-add.json @@ -12,16 +12,6 @@ } }, "precommands": [ - { - "route": "experimental-features", - "method": "PATCH", - "body": { - "inline": { - "vectorStore": true - } - }, - "synchronous": "DontWait" - }, { "route": "indexes/movies/settings", "method": "PATCH", diff --git a/workloads/hackernews-modify-facet-numbers.json b/workloads/hackernews-modify-facet-numbers.json index f4171442f..5c6acf626 100644 --- a/workloads/hackernews-modify-facet-numbers.json +++ b/workloads/hackernews-modify-facet-numbers.json @@ -28,10 +28,10 @@ "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" }, - "hackernews-02-modified-filters.ndjson": { + "hackernews-modified-number-filters.ndjson": { "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson", - "sha256": "7272cbfd41110d32d7fe168424a0000f07589bfe40f664652b34f4f20aaf3802" + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-filters.ndjson", + "sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c" } }, "precommands": [ @@ -102,7 +102,7 @@ "route": "indexes/movies/documents", "method": "POST", "body": { - "asset": "hackernews-02-modified-filters.ndjson" + "asset": "hackernews-modified-number-filters.ndjson" }, "synchronous": "WaitForTask" } diff --git a/workloads/hackernews-modify-facet-strings.json 
b/workloads/hackernews-modify-facet-strings.json index 7c5eb2e70..b5d4235a0 100644 --- a/workloads/hackernews-modify-facet-strings.json +++ b/workloads/hackernews-modify-facet-strings.json @@ -28,10 +28,10 @@ "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" }, - "hackernews-01-modified-filters.ndjson": { + "hackernews-modified-string-filters.ndjson": { "local_location": null, - "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-filters.ndjson", - "sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c" + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson", + "sha256": "7272cbfd41110d32d7fe168424a0000f07589bfe40f664652b34f4f20aaf3802" } }, "precommands": [ @@ -102,7 +102,7 @@ "route": "indexes/movies/documents", "method": "POST", "body": { - "asset": "hackernews-01-modified-filters.ndjson" + "asset": "hackernews-modified-string-filters.ndjson" }, "synchronous": "WaitForTask" } diff --git a/workloads/search/embeddings-movies-subset-hf.json b/workloads/search/embeddings-movies-subset-hf.json index 36f45cfb9..720d41790 100644 --- a/workloads/search/embeddings-movies-subset-hf.json +++ b/workloads/search/embeddings-movies-subset-hf.json @@ -13,16 +13,6 @@ } }, "precommands": [ - { - "route": "experimental-features", - "method": "PATCH", - "body": { - "inline": { - "vectorStore": true - } - }, - "synchronous": "DontWait" - }, { "route": "indexes/movies/settings", "method": "PATCH",