From 8a54f14b8e74ba4272367a83bb65ec22ddf95a70 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 Jan 2025 11:38:21 +0100 Subject: [PATCH 01/18] Demote panic to error log --- crates/milli/src/update/new/indexer/write.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 01748cf0d..92ffa19d5 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -153,10 +153,11 @@ pub fn write_from_bbqueue( } (key, None) => match database.delete(wtxn, key) { Ok(false) => { - unreachable!( - "We tried to delete an unknown key from {database_name}: {:?}", - key.as_bstr() - ) + tracing::error!( + database_name, + key = %key.as_bstr(), + "Attempt to delete an unknown key" + ); } Ok(_) => (), Err(error) => { From a6470a0c37821535a4799f7421b54e8862225c9a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 Jan 2025 14:33:53 +0100 Subject: [PATCH 02/18] Improve error log --- crates/milli/src/update/new/indexer/write.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 92ffa19d5..d1cc2038c 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -155,7 +155,8 @@ pub fn write_from_bbqueue( Ok(false) => { tracing::error!( database_name, - key = %key.as_bstr(), + key_bytes = ?key, + formatted_key = ?key.as_bstr(), "Attempt to delete an unknown key" ); } From 805531c90d78df85ca3d7a3be762bf545e5cf77a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 Jan 2025 16:54:44 +0100 Subject: [PATCH 03/18] Do not explode on missing content file if the task has no docs --- crates/index-scheduler/src/dump.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index 643255ac2..ed3bb7c12 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -39,6 +39,8 @@ impl<'a> Dump<'a> { task: TaskDump, content_file: Option>, ) -> Result { + let task_has_no_docs = matches!(task.kind, KindDump::DocumentImport { documents_count, .. } if documents_count == 0); + let content_uuid = match content_file { Some(content_file) if task.status == Status::Enqueued => { let (uuid, mut file) = self.index_scheduler.queue.create_update_file(false)?; @@ -54,6 +56,14 @@ impl<'a> Dump<'a> { // If the task isn't `Enqueued` then just generate a recognisable `Uuid` // in case we try to open it later. _ if task.status != Status::Enqueued => Some(Uuid::nil()), + None if task.status == Status::Enqueued && task_has_no_docs => { + let (uuid, mut file) = self.index_scheduler.queue.create_update_file(false)?; + let builder = DocumentsBatchBuilder::new(&mut file); + builder.into_inner()?; + file.persist()?; + + Some(uuid) + } _ => None, }; From 0e0e462f5bc8b8ca7ef0949b02accc8e031f97e4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 Jan 2025 16:55:04 +0100 Subject: [PATCH 04/18] Also fix dump import from meilitool --- crates/meilitool/src/main.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index 44eb4960e..053e83b96 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -254,18 +254,15 @@ fn export_a_dump( if status == Status::Enqueued { let content_file = file_store.get_update(content_file_uuid)?; - let reader = - DocumentsBatchReader::from_reader(content_file).with_context(|| { + for document in + serde_json::de::Deserializer::from_reader(content_file).into_iter() + { + let document = document.with_context(|| { format!("While reading content file {:?}", content_file_uuid) })?; - - let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); - while let Some(doc) = cursor.next_document().with_context(|| { - format!("While iterating on content file {:?}", content_file_uuid) - })? { - dump_content_file - .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; + dump_content_file.push_document(&document)?; } + dump_content_file.flush()?; count += 1; } From a8006a3750f7dc3153302a6ce0496383f837311a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 16 Jan 2025 18:05:29 +0100 Subject: [PATCH 05/18] Change format of update file when importing dump --- crates/index-scheduler/src/dump.rs | 13 +++++++++---- crates/meilitool/src/main.rs | 1 - 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index ed3bb7c12..aaf4d83e3 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -1,7 +1,9 @@ use std::collections::HashMap; +use std::io; use dump::{KindDump, TaskDump, UpdateFile}; use meilisearch_types::heed::RwTxn; +use meilisearch_types::milli; use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use roaring::RoaringBitmap; @@ -43,12 +45,15 @@ impl<'a> Dump<'a> { let content_uuid = match content_file { Some(content_file) if task.status == Status::Enqueued => { - let (uuid, mut file) = self.index_scheduler.queue.create_update_file(false)?; - let mut builder = DocumentsBatchBuilder::new(&mut file); + let (uuid, file) = self.index_scheduler.queue.create_update_file(false)?; + let mut writer = io::BufWriter::new(file); for doc in content_file { - builder.append_json_object(&doc?)?; + let doc = doc?; + serde_json::to_writer(&mut writer, &doc).map_err(|e| { + Error::from_milli(milli::InternalError::SerdeJson(e).into(), None) + })?; } - builder.into_inner()?; + let file = writer.into_inner().map_err(|e| e.into_error())?; file.persist()?; Some(uuid) diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index 053e83b96..0f7702f9d 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -9,7 +9,6 @@ use file_store::FileStore; use meilisearch_auth::AuthController; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified}; -use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; use meilisearch_types::versioning::{get_version, parse_version}; From 6a6212d4e1dce41e3794c2887bfb270b3cf28dac Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 Jan 2025 18:28:32 +0100 Subject: [PATCH 06/18] Fix warnings --- crates/index-scheduler/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 3c50283d9..04374e65f 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -51,8 +51,8 @@ use flate2::Compression; use meilisearch_types::batches::Batch; use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; use meilisearch_types::heed::byteorder::BE; -use meilisearch_types::heed::types::I128; -use meilisearch_types::heed::{self, Env, RoTxn}; +use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; +use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; From 59242b9c4f26dac33d08bf85c6bfb6f15534a003 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jan 2025 11:04:29 +0100 Subject: [PATCH 07/18] Fix warnings --- crates/index-scheduler/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 04374e65f..3c50283d9 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -51,8 +51,8 @@ use flate2::Compression; use meilisearch_types::batches::Batch; use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; use meilisearch_types::heed::byteorder::BE; -use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; -use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; +use meilisearch_types::heed::types::I128; +use meilisearch_types::heed::{self, Env, RoTxn}; use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; From 2cf57d584ed14408403da660a38c705c2efb3fff Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 Jan 2025 18:28:43 +0100 Subject: [PATCH 08/18] Handle empty payloads --- crates/index-scheduler/src/dump.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index aaf4d83e3..e1bad51d5 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -63,8 +63,6 @@ impl<'a> Dump<'a> { _ if task.status != Status::Enqueued => Some(Uuid::nil()), None if task.status == Status::Enqueued && task_has_no_docs => { let (uuid, mut file) = self.index_scheduler.queue.create_update_file(false)?; - let builder = DocumentsBatchBuilder::new(&mut file); - builder.into_inner()?; file.persist()?; Some(uuid) From 909d84447dca335675f92ed331900e5a9d2c1d5c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 20 Jan 2025 10:09:02 +0100 Subject: [PATCH 09/18] meilitool dumps old-style dump for older DBs, otherwise new-style --- crates/meilitool/src/main.rs | 43 +++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index 0f7702f9d..743fe552e 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -9,6 +9,7 @@ use file_store::FileStore; use meilisearch_auth::AuthController; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified}; +use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; use meilisearch_types::versioning::{get_version, parse_version}; @@ -87,7 +88,7 @@ fn main() -> anyhow::Result<()> { match command { Command::ClearTaskQueue => clear_task_queue(db_path), Command::ExportADump { dump_dir, skip_enqueued_tasks } => { - export_a_dump(db_path, dump_dir, skip_enqueued_tasks) + export_a_dump(db_path, dump_dir, skip_enqueued_tasks, detected_version) } Command::OfflineUpgrade { target_version } => { let target_version = parse_version(&target_version).context("While parsing `--target-version`. Make sure `--target-version` is in the format MAJOR.MINOR.PATCH")?; @@ -186,6 +187,7 @@ fn export_a_dump( db_path: PathBuf, dump_dir: PathBuf, skip_enqueued_tasks: bool, + detected_version: (String, String, String), ) -> Result<(), anyhow::Error> { let started_at = OffsetDateTime::now_utc(); @@ -237,9 +239,6 @@ fn export_a_dump( if skip_enqueued_tasks { eprintln!("Skip dumping the enqueued tasks..."); } else { - eprintln!("Dumping the enqueued tasks..."); - - // 3. dump the tasks let mut dump_tasks = dump.create_tasks_queue()?; let mut count = 0; for ret in all_tasks.iter(&rtxn)? { @@ -253,13 +252,37 @@ fn export_a_dump( if status == Status::Enqueued { let content_file = file_store.get_update(content_file_uuid)?; - for document in - serde_json::de::Deserializer::from_reader(content_file).into_iter() + if ( + detected_version.0.as_str(), + detected_version.1.as_str(), + detected_version.2.as_str(), + ) < ("1", "12", "0") { - let document = document.with_context(|| { - format!("While reading content file {:?}", content_file_uuid) - })?; - dump_content_file.push_document(&document)?; + eprintln!("Dumping the enqueued tasks reading them in obkv format..."); + let reader = + DocumentsBatchReader::from_reader(content_file).with_context(|| { + format!("While reading content file {:?}", content_file_uuid) + })?; + let (mut cursor, documents_batch_index) = + reader.into_cursor_and_fields_index(); + while let Some(doc) = cursor.next_document().with_context(|| { + format!("While iterating on content file {:?}", content_file_uuid) + })? { + dump_content_file + .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; + } + } else { + eprintln!( + "Dumping the enqueued tasks reading them in JSON stream format..." + ); + for document in + serde_json::de::Deserializer::from_reader(content_file).into_iter() + { + let document = document.with_context(|| { + format!("While reading content file {:?}", content_file_uuid) + })?; + dump_content_file.push_document(&document)?; + } } dump_content_file.flush()?; From c0690f5b9edba40c49cca6ebac4703319c908781 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 20 Jan 2025 10:32:20 +0100 Subject: [PATCH 10/18] Make offline upgrade more flexible --- crates/meilitool/src/upgrade/mod.rs | 49 +++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs index 47ca2cbd9..51cb5f454 100644 --- a/crates/meilitool/src/upgrade/mod.rs +++ b/crates/meilitool/src/upgrade/mod.rs @@ -20,6 +20,34 @@ pub struct OfflineUpgrade { impl OfflineUpgrade { pub fn upgrade(self) -> anyhow::Result<()> { + // Adding a version? + // + // 1. Update the LAST_SUPPORTED_UPGRADE_FROM_VERSION and LAST_SUPPORTED_UPGRADE_TO_VERSION. + // 2. Add new version to the upgrade list if necessary + // 3. Use `no_upgrade` as index for versions that are compatible. + + if self.current_version == self.target_version { + println!("Database is already at the target version. Exiting."); + return Ok(()); + } + + if self.current_version > self.target_version { + bail!( + "Cannot downgrade from {}.{}.{} to {}.{}.{}. Downgrade not supported", + self.current_version.0, + self.current_version.1, + self.current_version.2, + self.target_version.0, + self.target_version.1, + self.target_version.2 + ); + } + + const FIRST_SUPPORTED_UPGRADE_FROM_VERSION: &str = "1.9.0"; + const LAST_SUPPORTED_UPGRADE_FROM_VERSION: &str = "1.12.5"; + const FIRST_SUPPORTED_UPGRADE_TO_VERSION: &str = "1.10.0"; + const LAST_SUPPORTED_UPGRADE_TO_VERSION: &str = "1.12.5"; + let upgrade_list = [ ( v1_9_to_v1_10 as fn(&Path, &str, &str, &str) -> Result<(), anyhow::Error>, @@ -32,6 +60,8 @@ impl OfflineUpgrade { (v1_12_to_v1_12_3, "1", "12", "3"), ]; + let no_upgrade: usize = upgrade_list.len(); + let (current_major, current_minor, current_patch) = &self.current_version; let start_at = match ( @@ -43,8 +73,11 @@ impl OfflineUpgrade { ("1", "10", _) => 1, ("1", "11", _) => 2, ("1", "12", x) if x == "0" || x == "1" || x == "2" => 3, + ("1", "12", x) if x == "3" || x == "4" || x == "5" => no_upgrade, _ => { - bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10") + bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from versions in range [{}-{}]", + FIRST_SUPPORTED_UPGRADE_FROM_VERSION, + LAST_SUPPORTED_UPGRADE_FROM_VERSION); } }; @@ -54,17 +87,27 @@ impl OfflineUpgrade { ("1", "10", _) => 0, ("1", "11", _) => 1, ("1", "12", x) if x == "0" || x == "1" || x == "2" => 2, - ("1", "12", "3") => 3, + ("1", "12", x) if x == "3" || x == "4" || x == "5" => 3, (major, _, _) if major.starts_with('v') => { bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") } _ => { - bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 and v1.11") + bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to versions in range [{}-{}]", + FIRST_SUPPORTED_UPGRADE_TO_VERSION, + LAST_SUPPORTED_UPGRADE_TO_VERSION); } }; println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); + if start_at == no_upgrade { + println!("No upgrade operation to perform, writing VERSION file"); + create_version_file(&self.db_path, target_major, target_minor, target_patch) + .context("while writing VERSION file after the upgrade")?; + println!("Success"); + return Ok(()); + } + #[allow(clippy::needless_range_loop)] for index in start_at..=ends_at { let (func, major, minor, patch) = upgrade_list[index]; From d95384a636789b35224e70e16dcf84e5bb44f44e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 20 Jan 2025 11:16:18 +0100 Subject: [PATCH 11/18] Remove batch ids on export --- crates/index-scheduler/src/dump.rs | 3 +-- .../index-scheduler/src/scheduler/process_dump_creation.rs | 7 +++++++ crates/meilitool/src/main.rs | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index e1bad51d5..2dde6d623 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -4,7 +4,6 @@ use std::io; use dump::{KindDump, TaskDump, UpdateFile}; use meilisearch_types::heed::RwTxn; use meilisearch_types::milli; -use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use roaring::RoaringBitmap; use uuid::Uuid; @@ -62,7 +61,7 @@ impl<'a> Dump<'a> { // in case we try to open it later. _ if task.status != Status::Enqueued => Some(Uuid::nil()), None if task.status == Status::Enqueued && task_has_no_docs => { - let (uuid, mut file) = self.index_scheduler.queue.create_update_file(false)?; + let (uuid, file) = self.index_scheduler.queue.create_update_file(false)?; file.persist()?; Some(uuid) diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index 3fd5c795b..6b580abeb 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -72,6 +72,13 @@ impl IndexScheduler { t.started_at = Some(started_at); t.finished_at = Some(finished_at); } + + // Patch the task to remove the batch uid, because as of v1.12.5 batches are not persisted. + // This prevent from referencing *future* batches not actually associated with the task. + // + // See for details. + t.batch_uid = None; + let mut dump_content_file = dump_tasks.push_task(&t.into())?; // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index 743fe552e..599bc3274 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -245,6 +245,7 @@ fn export_a_dump( let (_, t) = ret?; let status = t.status; let content_file = t.content_uuid(); + let mut dump_content_file = dump_tasks.push_task(&t.into())?; // 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. From 2e04ab4737666c8baacd5e323043a5a4a2bbf2f1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 20 Jan 2025 11:45:03 +0100 Subject: [PATCH 12/18] Replace guards by OR patterns Co-authored-by: Tamo --- crates/meilitool/src/upgrade/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs index 51cb5f454..2d5230341 100644 --- a/crates/meilitool/src/upgrade/mod.rs +++ b/crates/meilitool/src/upgrade/mod.rs @@ -72,8 +72,8 @@ impl OfflineUpgrade { ("1", "9", _) => 0, ("1", "10", _) => 1, ("1", "11", _) => 2, - ("1", "12", x) if x == "0" || x == "1" || x == "2" => 3, - ("1", "12", x) if x == "3" || x == "4" || x == "5" => no_upgrade, + ("1", "12", "0" | "1" | "2") => 3, + ("1", "12", "3" | "4" | "5") => no_upgrade, _ => { bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from versions in range [{}-{}]", FIRST_SUPPORTED_UPGRADE_FROM_VERSION, @@ -86,8 +86,8 @@ impl OfflineUpgrade { let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { ("1", "10", _) => 0, ("1", "11", _) => 1, - ("1", "12", x) if x == "0" || x == "1" || x == "2" => 2, - ("1", "12", x) if x == "3" || x == "4" || x == "5" => 3, + ("1", "12", "0" | "1" | "2") => 2, + ("1", "12", "3" | "4" | "5") => 3, (major, _, _) if major.starts_with('v') => { bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") } From d6063079affb6e8b50364ff7ea4ac5fdbfae82dd Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 21 Jan 2025 00:11:50 +0100 Subject: [PATCH 13/18] Unify facet strings by their normalized value --- .../new/extract/faceted/extract_facets.rs | 86 +++++++++++-------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 7e0484e39..41b6a12a2 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -283,42 +283,60 @@ impl FacetedDocidsExtractor { } struct DelAddFacetValue<'doc> { - strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, + strings: HashMap< + (FieldId, &'doc str), + Option>, + hashbrown::DefaultHashBuilder, + &'doc Bump, + >, f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, + doc_alloc: &'doc Bump, } impl<'doc> DelAddFacetValue<'doc> { fn new(doc_alloc: &'doc Bump) -> Self { - Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc) } + Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc), doc_alloc } } fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { - let cache = match kind { - FacetKind::String => &mut self.strings, - FacetKind::Number => &mut self.f64s, - _ => return, - }; - - let key = (fid, value); - if let Some(DelAdd::Deletion) = cache.get(&key) { - cache.remove(&key); - } else { - cache.insert(key, DelAdd::Addition); + match kind { + FacetKind::Number => { + let key = (fid, value); + if let Some(DelAdd::Deletion) = self.f64s.get(&key) { + self.f64s.remove(&key); + } else { + self.f64s.insert(key, DelAdd::Addition); + } + } + FacetKind::String => { + if let Ok(s) = std::str::from_utf8(&value) { + let normalized = crate::normalize_facet(s); + let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized)); + self.strings.insert((fid, truncated), Some(value)); + } + } + _ => (), } } fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { - let cache = match kind { - FacetKind::String => &mut self.strings, - FacetKind::Number => &mut self.f64s, - _ => return, - }; - - let key = (fid, value); - if let Some(DelAdd::Addition) = cache.get(&key) { - cache.remove(&key); - } else { - cache.insert(key, DelAdd::Deletion); + match kind { + FacetKind::Number => { + let key = (fid, value); + if let Some(DelAdd::Addition) = self.f64s.get(&key) { + self.f64s.remove(&key); + } else { + self.f64s.insert(key, DelAdd::Deletion); + } + } + FacetKind::String => { + if let Ok(s) = std::str::from_utf8(&value) { + let normalized = crate::normalize_facet(s); + let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized)); + self.strings.insert((fid, truncated), None); + } + } + _ => (), } } @@ -329,18 +347,14 @@ impl<'doc> DelAddFacetValue<'doc> { doc_alloc: &Bump, ) -> crate::Result<()> { let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); - for ((fid, value), deladd) in self.strings { - if let Ok(s) = std::str::from_utf8(&value) { - buffer.clear(); - buffer.extend_from_slice(&fid.to_be_bytes()); - buffer.extend_from_slice(&docid.to_be_bytes()); - let normalized = crate::normalize_facet(s); - let truncated = truncate_str(&normalized); - buffer.extend_from_slice(truncated.as_bytes()); - match deladd { - DelAdd::Deletion => sender.delete_facet_string(&buffer)?, - DelAdd::Addition => sender.write_facet_string(&buffer, &value)?, - } + for ((fid, truncated), value) in self.strings { + buffer.clear(); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.extend_from_slice(&docid.to_be_bytes()); + buffer.extend_from_slice(truncated.as_bytes()); + match &value { + Some(value) => sender.write_facet_string(&buffer, value)?, + None => sender.delete_facet_string(&buffer)?, } } From 4d4683adb6195437dec8104e4ea7412a04fe20ec Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jan 2025 10:41:43 +0100 Subject: [PATCH 14/18] Add a test to check the facet casing is good --- crates/meilisearch/tests/search/mod.rs | 54 ++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index 8f8dabb78..a7ec35270 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -1524,3 +1524,57 @@ async fn change_attributes_settings() { ) .await; } + +/// Modifying facets with different casing should work correctly +#[actix_rt::test] +async fn change_facet_casing() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "filterableAttributes": ["dog"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index + .add_documents( + json!([ + { + "id": 1, + "dog": "Bouvier Bernois" + } + ]), + None, + ) + .await; + index.wait_task(response.uid()).await; + + let (response, _code) = index + .add_documents( + json!([ + { + "id": 1, + "dog": "bouvier bernois" + } + ]), + None, + ) + .await; + index.wait_task(response.uid()).await; + + index + .search(json!({ "facets": ["dog"] }), |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["facetDistribution"]), @r###" + { + "dog": { + "bouvier bernois": 1 + } + } + "###); + }) + .await; +} From d142c5e432ca944de62fe4eb0d3c26e674801143 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jan 2025 11:44:03 +0100 Subject: [PATCH 15/18] Do not panic when the facet string is not found --- .../src/search/facet/facet_distribution.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/crates/milli/src/search/facet/facet_distribution.rs b/crates/milli/src/search/facet/facet_distribution.rs index 027f8b176..ee0fad535 100644 --- a/crates/milli/src/search/facet/facet_distribution.rs +++ b/crates/milli/src/search/facet/facet_distribution.rs @@ -219,12 +219,19 @@ impl<'a> FacetDistribution<'a> { let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); let key: (FieldId, _, &str) = (field_id, any_docid, facet_key); - let original_string = self - .index - .field_id_docid_facet_strings - .get(self.rtxn, &key)? - .unwrap() - .to_owned(); + let optional_original_string = + self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?; + + let original_string = match optional_original_string { + Some(original_string) => original_string.to_owned(), + None => { + tracing::error!( + "Missing original facet string. Using the normalized facet {} instead", + facet_key + ); + facet_key.to_string() + } + }; distribution.insert(original_string, nbr_docids); if distribution.len() == self.max_values_per_facet { From b9d92c481b6ac9db797ca56bae6670ef08042c95 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jan 2025 12:22:56 +0000 Subject: [PATCH 16/18] Update version for the next release (v1.12.6) in Cargo.toml --- crates/meilisearch/Cargo.toml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 1d458af34..1baff114f 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -105,8 +105,16 @@ tracing-actix-web = "0.7.15" build-info = { version = "1.7.0", path = "../build-info" } roaring = "0.10.10" mopa-maintained = "0.2.3" -utoipa = { version = "5.3.1", features = ["actix_extras", "macros", "non_strict_integers", "preserve_order", "uuid", "time", "openapi_extensions"] } -utoipa-scalar = { version = "0.2.1", optional = true, features = ["actix-web"] } +utoipa = { version = "5.3.1", features = [ + "actix_extras", + "macros", + "non_strict_integers", + "preserve_order", + "uuid", + "time", + "openapi_extensions", +] } +utoipa-scalar = { version = "0.3.0", optional = true, features = ["actix-web"] } [dev-dependencies] actix-rt = "2.10.0" From 50fca8fc7063f053f7bc75d396e4e1852d08e929 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 16 Jan 2025 16:54:05 +0100 Subject: [PATCH 17/18] Create update files in new format --- .../src/scheduler/process_dump_creation.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index 6b580abeb..9691bdbef 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -4,7 +4,6 @@ use std::sync::atomic::Ordering; use dump::IndexMetadata; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; -use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::progress::Progress; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{self}; @@ -89,19 +88,15 @@ impl IndexScheduler { if status == Status::Enqueued { let content_file = self.queue.file_store.get_update(content_file)?; - let reader = DocumentsBatchReader::from_reader(content_file) - .map_err(|e| Error::from_milli(e.into(), None))?; - - let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); - - while let Some(doc) = - cursor.next_document().map_err(|e| Error::from_milli(e.into(), None))? + for document in + serde_json::de::Deserializer::from_reader(content_file).into_iter() { - dump_content_file.push_document( - &obkv_to_object(doc, &documents_batch_index) - .map_err(|e| Error::from_milli(e, None))?, - )?; + let document = document.map_err(|e| { + Error::from_milli(milli::InternalError::SerdeJson(e).into(), None) + })?; + dump_content_file.push_document(&document)?; } + dump_content_file.flush()?; } } From 2c099b7c23bd7074f47f4c28c19a9929f0a94380 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 22 Jan 2025 15:53:52 +0100 Subject: [PATCH 18/18] Update Cargo.lock again --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3200d8333..e251776c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6231,9 +6231,9 @@ dependencies = [ [[package]] name = "utoipa-scalar" -version = "0.2.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "088e93bf19f6bd06e0aacb02ca432b3c5a449c4aec2e4aa9fc333a667f2b2c55" +checksum = "59559e1509172f6b26c1cdbc7247c4ddd1ac6560fe94b584f81ee489b141f719" dependencies = [ "actix-web", "serde",