5262: Bring back changes from v1.12.4, v1.12.5, and v1.12.6 into main r=dureuill a=Kerollmops

This PR follows [this guideline to bring back changes after we worked on v1.12.4, v1.12.5, and v1.12.6](https://github.com/meilisearch/engine-team/blob/main/resources/meilisearch-release.md#after-the-release-bring-back-changes-to-main).

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
Co-authored-by: Kerollmops <Kerollmops@users.noreply.github.com>
Commit 6723700fb9 by meili-bors[bot] on 2025-01-22 14:55:02 +00:00, committed by GitHub.
10 changed files with 249 additions and 87 deletions

Cargo.lock (generated)

@@ -6231,9 +6231,9 @@ dependencies = [
[[package]]
name = "utoipa-scalar"
version = "0.2.1"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "088e93bf19f6bd06e0aacb02ca432b3c5a449c4aec2e4aa9fc333a667f2b2c55"
checksum = "59559e1509172f6b26c1cdbc7247c4ddd1ac6560fe94b584f81ee489b141f719"
dependencies = [
"actix-web",
"serde",


@@ -1,8 +1,9 @@
use std::collections::HashMap;
use std::io;
use dump::{KindDump, TaskDump, UpdateFile};
use meilisearch_types::heed::RwTxn;
use meilisearch_types::milli::documents::DocumentsBatchBuilder;
use meilisearch_types::milli;
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
use roaring::RoaringBitmap;
use uuid::Uuid;
@@ -39,14 +40,19 @@ impl<'a> Dump<'a> {
task: TaskDump,
content_file: Option<Box<UpdateFile>>,
) -> Result<Task> {
let task_has_no_docs = matches!(task.kind, KindDump::DocumentImport { documents_count, .. } if documents_count == 0);
let content_uuid = match content_file {
Some(content_file) if task.status == Status::Enqueued => {
let (uuid, mut file) = self.index_scheduler.queue.create_update_file(false)?;
let mut builder = DocumentsBatchBuilder::new(&mut file);
let (uuid, file) = self.index_scheduler.queue.create_update_file(false)?;
let mut writer = io::BufWriter::new(file);
for doc in content_file {
builder.append_json_object(&doc?)?;
let doc = doc?;
serde_json::to_writer(&mut writer, &doc).map_err(|e| {
Error::from_milli(milli::InternalError::SerdeJson(e).into(), None)
})?;
}
builder.into_inner()?;
let file = writer.into_inner().map_err(|e| e.into_error())?;
file.persist()?;
Some(uuid)
@@ -54,6 +60,12 @@ impl<'a> Dump<'a> {
// If the task isn't `Enqueued` then just generate a recognisable `Uuid`
// in case we try to open it later.
_ if task.status != Status::Enqueued => Some(Uuid::nil()),
None if task.status == Status::Enqueued && task_has_no_docs => {
let (uuid, file) = self.index_scheduler.queue.create_update_file(false)?;
file.persist()?;
Some(uuid)
}
_ => None,
};
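
The hunk above drops the obkv `DocumentsBatchBuilder` and instead streams each document as a bare JSON object into the update file. Below is a minimal sketch of that write path, assuming only `serde_json` and any `std::io::Write` target; the helper name and document type are illustrative, not the exact Meilisearch types.

```rust
use std::io::{self, Write};

use serde_json::{Map, Value};

// Write each document as its own JSON object, back to back, into the update file.
// This mirrors the new import path: one `serde_json::to_writer` call per document,
// going through a `BufWriter` that is flushed before the file is persisted.
fn write_documents<W: Write>(target: W, docs: &[Map<String, Value>]) -> io::Result<()> {
    let mut writer = io::BufWriter::new(target);
    for doc in docs {
        serde_json::to_writer(&mut writer, doc)?;
    }
    writer.flush()
}
```

Concatenated objects written this way can be read back with serde_json's stream deserializer, which is exactly what the dump-creation hunk below switches to.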


@@ -4,7 +4,6 @@ use std::sync::atomic::Ordering;
use dump::IndexMetadata;
use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
use meilisearch_types::milli::progress::Progress;
use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
use meilisearch_types::milli::{self};
@@ -72,6 +71,13 @@ impl IndexScheduler {
t.started_at = Some(started_at);
t.finished_at = Some(finished_at);
}
// Patch the task to remove the batch uid, because as of v1.12.5 batches are not persisted.
// This prevents referencing *future* batches not actually associated with the task.
//
// See <https://github.com/meilisearch/meilisearch/issues/5247> for details.
t.batch_uid = None;
let mut dump_content_file = dump_tasks.push_task(&t.into())?;
// 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
@@ -82,19 +88,15 @@ impl IndexScheduler {
if status == Status::Enqueued {
let content_file = self.queue.file_store.get_update(content_file)?;
let reader = DocumentsBatchReader::from_reader(content_file)
.map_err(|e| Error::from_milli(e.into(), None))?;
let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
while let Some(doc) =
cursor.next_document().map_err(|e| Error::from_milli(e.into(), None))?
for document in
serde_json::de::Deserializer::from_reader(content_file).into_iter()
{
dump_content_file.push_document(
&obkv_to_object(doc, &documents_batch_index)
.map_err(|e| Error::from_milli(e, None))?,
)?;
let document = document.map_err(|e| {
Error::from_milli(milli::InternalError::SerdeJson(e).into(), None)
})?;
dump_content_file.push_document(&document)?;
}
dump_content_file.flush()?;
}
}
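
On the dump-creation side, the same content file is now read back without the obkv cursor: `serde_json`'s stream deserializer yields one JSON object per document. A rough sketch of that loop, collecting into a `Vec` for simplicity and using plain `serde_json::Error` where the real code wraps errors with `Error::from_milli`:

```rust
use std::io::Read;

use serde_json::{Map, Value};

// Iterate over concatenated JSON objects in a content file, one document at a time.
// The stream deserializer stops at the first malformed object and surfaces the error.
fn read_documents<R: Read>(reader: R) -> Result<Vec<Map<String, Value>>, serde_json::Error> {
    serde_json::Deserializer::from_reader(reader)
        .into_iter::<Map<String, Value>>()
        .collect()
}
```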


@@ -105,8 +105,16 @@ tracing-actix-web = "0.7.15"
build-info = { version = "1.7.0", path = "../build-info" }
roaring = "0.10.10"
mopa-maintained = "0.2.3"
utoipa = { version = "5.3.1", features = ["actix_extras", "macros", "non_strict_integers", "preserve_order", "uuid", "time", "openapi_extensions"] }
utoipa-scalar = { version = "0.2.1", optional = true, features = ["actix-web"] }
utoipa = { version = "5.3.1", features = [
"actix_extras",
"macros",
"non_strict_integers",
"preserve_order",
"uuid",
"time",
"openapi_extensions",
] }
utoipa-scalar = { version = "0.3.0", optional = true, features = ["actix-web"] }
[dev-dependencies]
actix-rt = "2.10.0"


@@ -1524,3 +1524,57 @@ async fn change_attributes_settings() {
)
.await;
}
/// Modifying facets with different casing should work correctly
#[actix_rt::test]
async fn change_facet_casing() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index
.update_settings(json!({
"filterableAttributes": ["dog"],
}))
.await;
assert_eq!("202", code.as_str(), "{:?}", response);
index.wait_task(response.uid()).await;
let (response, _code) = index
.add_documents(
json!([
{
"id": 1,
"dog": "Bouvier Bernois"
}
]),
None,
)
.await;
index.wait_task(response.uid()).await;
let (response, _code) = index
.add_documents(
json!([
{
"id": 1,
"dog": "bouvier bernois"
}
]),
None,
)
.await;
index.wait_task(response.uid()).await;
index
.search(json!({ "facets": ["dog"] }), |response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["facetDistribution"]), @r###"
{
"dog": {
"bouvier bernois": 1
}
}
"###);
})
.await;
}
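
The regression test above exercises the facet-casing fix: both spellings of the breed must land in a single facet bucket once values are normalized. As a toy illustration, assuming a simple lowercasing normalizer (the real `normalize_facet` in milli may do more than this):

```rust
use std::collections::HashMap;

// Hypothetical stand-in for milli's facet normalization: different casings of the
// same value share one normalized key, so the distribution counts them together.
fn normalize_facet(value: &str) -> String {
    value.trim().to_lowercase()
}

fn main() {
    let mut distribution: HashMap<String, u64> = HashMap::new();
    for dog in ["Bouvier Bernois", "bouvier bernois"] {
        *distribution.entry(normalize_facet(dog)).or_insert(0) += 1;
    }
    assert_eq!(distribution.len(), 1); // a single "bouvier bernois" bucket
}
```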


@@ -88,7 +88,7 @@ fn main() -> anyhow::Result<()> {
match command {
Command::ClearTaskQueue => clear_task_queue(db_path),
Command::ExportADump { dump_dir, skip_enqueued_tasks } => {
export_a_dump(db_path, dump_dir, skip_enqueued_tasks)
export_a_dump(db_path, dump_dir, skip_enqueued_tasks, detected_version)
}
Command::OfflineUpgrade { target_version } => {
let target_version = parse_version(&target_version).context("While parsing `--target-version`. Make sure `--target-version` is in the format MAJOR.MINOR.PATCH")?;
@@ -187,6 +187,7 @@ fn export_a_dump(
db_path: PathBuf,
dump_dir: PathBuf,
skip_enqueued_tasks: bool,
detected_version: (String, String, String),
) -> Result<(), anyhow::Error> {
let started_at = OffsetDateTime::now_utc();
@@ -238,15 +239,13 @@ fn export_a_dump(
if skip_enqueued_tasks {
eprintln!("Skip dumping the enqueued tasks...");
} else {
eprintln!("Dumping the enqueued tasks...");
// 3. dump the tasks
let mut dump_tasks = dump.create_tasks_queue()?;
let mut count = 0;
for ret in all_tasks.iter(&rtxn)? {
let (_, t) = ret?;
let status = t.status;
let content_file = t.content_uuid();
let mut dump_content_file = dump_tasks.push_task(&t.into())?;
// 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
@@ -254,18 +253,39 @@ fn export_a_dump(
if status == Status::Enqueued {
let content_file = file_store.get_update(content_file_uuid)?;
let reader =
DocumentsBatchReader::from_reader(content_file).with_context(|| {
format!("While reading content file {:?}", content_file_uuid)
})?;
let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
while let Some(doc) = cursor.next_document().with_context(|| {
format!("While iterating on content file {:?}", content_file_uuid)
})? {
dump_content_file
.push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
if (
detected_version.0.as_str(),
detected_version.1.as_str(),
detected_version.2.as_str(),
) < ("1", "12", "0")
{
eprintln!("Dumping the enqueued tasks reading them in obkv format...");
let reader =
DocumentsBatchReader::from_reader(content_file).with_context(|| {
format!("While reading content file {:?}", content_file_uuid)
})?;
let (mut cursor, documents_batch_index) =
reader.into_cursor_and_fields_index();
while let Some(doc) = cursor.next_document().with_context(|| {
format!("While iterating on content file {:?}", content_file_uuid)
})? {
dump_content_file
.push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
}
} else {
eprintln!(
"Dumping the enqueued tasks reading them in JSON stream format..."
);
for document in
serde_json::de::Deserializer::from_reader(content_file).into_iter()
{
let document = document.with_context(|| {
format!("While reading content file {:?}", content_file_uuid)
})?;
dump_content_file.push_document(&document)?;
}
}
dump_content_file.flush()?;
count += 1;
}
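
meilitool now chooses the content-file decoder from the version detected in the database: files written before 1.12 are obkv document batches, while 1.12 and later store a raw JSON stream. A hedged sketch of that branching, using a hypothetical numeric version triple rather than the string tuple above:

```rust
/// Which on-disk format an enqueued task's content file uses.
enum ContentFileFormat {
    /// Written by Meilisearch before 1.12: an obkv documents batch.
    ObkvBatch,
    /// Written by Meilisearch 1.12 and later: concatenated JSON objects.
    JsonStream,
}

// Hypothetical helper: decide the decoder from a (major, minor, patch) triple.
fn content_file_format(detected_version: (u64, u64, u64)) -> ContentFileFormat {
    if detected_version < (1, 12, 0) {
        ContentFileFormat::ObkvBatch
    } else {
        ContentFileFormat::JsonStream
    }
}
```

Numeric components are used here so that the tuple comparison stays semantic; comparing the components as strings would order "9" after "12".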


@@ -20,6 +20,34 @@ pub struct OfflineUpgrade {
impl OfflineUpgrade {
pub fn upgrade(self) -> anyhow::Result<()> {
// Adding a version?
//
// 1. Update the LAST_SUPPORTED_UPGRADE_FROM_VERSION and LAST_SUPPORTED_UPGRADE_TO_VERSION.
// 2. Add new version to the upgrade list if necessary
// 3. Use `no_upgrade` as index for versions that are compatible.
if self.current_version == self.target_version {
println!("Database is already at the target version. Exiting.");
return Ok(());
}
if self.current_version > self.target_version {
bail!(
"Cannot downgrade from {}.{}.{} to {}.{}.{}. Downgrade not supported",
self.current_version.0,
self.current_version.1,
self.current_version.2,
self.target_version.0,
self.target_version.1,
self.target_version.2
);
}
const FIRST_SUPPORTED_UPGRADE_FROM_VERSION: &str = "1.9.0";
const LAST_SUPPORTED_UPGRADE_FROM_VERSION: &str = "1.12.5";
const FIRST_SUPPORTED_UPGRADE_TO_VERSION: &str = "1.10.0";
const LAST_SUPPORTED_UPGRADE_TO_VERSION: &str = "1.12.5";
let upgrade_list = [
(
v1_9_to_v1_10 as fn(&Path, &str, &str, &str) -> Result<(), anyhow::Error>,
@@ -32,6 +60,8 @@ impl OfflineUpgrade {
(v1_12_to_v1_12_3, "1", "12", "3"),
];
let no_upgrade: usize = upgrade_list.len();
let (current_major, current_minor, current_patch) = &self.current_version;
let start_at = match (
@@ -42,9 +72,12 @@ impl OfflineUpgrade {
("1", "9", _) => 0,
("1", "10", _) => 1,
("1", "11", _) => 2,
("1", "12", x) if x == "0" || x == "1" || x == "2" => 3,
("1", "12", "0" | "1" | "2") => 3,
("1", "12", "3" | "4" | "5") => no_upgrade,
_ => {
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10")
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from versions in range [{}-{}]",
FIRST_SUPPORTED_UPGRADE_FROM_VERSION,
LAST_SUPPORTED_UPGRADE_FROM_VERSION);
}
};
@@ -53,18 +86,28 @@ impl OfflineUpgrade {
let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
("1", "10", _) => 0,
("1", "11", _) => 1,
("1", "12", x) if x == "0" || x == "1" || x == "2" => 2,
("1", "12", "3") => 3,
("1", "12", "0" | "1" | "2") => 2,
("1", "12", "3" | "4" | "5") => 3,
(major, _, _) if major.starts_with('v') => {
bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.")
}
_ => {
bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 and v1.11")
bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to versions in range [{}-{}]",
FIRST_SUPPORTED_UPGRADE_TO_VERSION,
LAST_SUPPORTED_UPGRADE_TO_VERSION);
}
};
println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}");
if start_at == no_upgrade {
println!("No upgrade operation to perform, writing VERSION file");
create_version_file(&self.db_path, target_major, target_minor, target_patch)
.context("while writing VERSION file after the upgrade")?;
println!("Success");
return Ok(());
}
#[allow(clippy::needless_range_loop)]
for index in start_at..=ends_at {
let (func, major, minor, patch) = upgrade_list[index];
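
The offline upgrade dispatches over a fixed table of migration functions: the current version maps to a start index, the target version to an end index, and `no_upgrade` (one past the end of the table) means only the VERSION file needs rewriting. A condensed sketch of that pattern, with placeholder migration steps and `anyhow` assumed for error handling:

```rust
use std::path::Path;

// Placeholder migration steps standing in for v1_9_to_v1_10 and friends.
fn step_a(_db: &Path) -> anyhow::Result<()> { Ok(()) }
fn step_b(_db: &Path) -> anyhow::Result<()> { Ok(()) }

fn upgrade(db_path: &Path, start_at: usize, ends_at: usize) -> anyhow::Result<()> {
    let upgrade_list: [fn(&Path) -> anyhow::Result<()>; 2] = [step_a, step_b];
    let no_upgrade = upgrade_list.len();

    // Already on a compatible on-disk format: nothing to migrate, only the
    // VERSION file would be rewritten by the caller.
    if start_at == no_upgrade {
        return Ok(());
    }

    // Run every migration between the current and the target version, in order.
    for func in &upgrade_list[start_at..=ends_at] {
        func(db_path)?;
    }
    Ok(())
}
```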


@@ -219,12 +219,19 @@ impl<'a> FacetDistribution<'a> {
let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();
let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
let original_string = self
.index
.field_id_docid_facet_strings
.get(self.rtxn, &key)?
.unwrap()
.to_owned();
let optional_original_string =
self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?;
let original_string = match optional_original_string {
Some(original_string) => original_string.to_owned(),
None => {
tracing::error!(
"Missing original facet string. Using the normalized facet {} instead",
facet_key
);
facet_key.to_string()
}
};
distribution.insert(original_string, nbr_docids);
if distribution.len() == self.max_values_per_facet {


@@ -283,42 +283,60 @@ impl FacetedDocidsExtractor {
}
struct DelAddFacetValue<'doc> {
strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>,
strings: HashMap<
(FieldId, &'doc str),
Option<BVec<'doc, u8>>,
hashbrown::DefaultHashBuilder,
&'doc Bump,
>,
f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>,
doc_alloc: &'doc Bump,
}
impl<'doc> DelAddFacetValue<'doc> {
fn new(doc_alloc: &'doc Bump) -> Self {
Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc) }
Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc), doc_alloc }
}
fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) {
let cache = match kind {
FacetKind::String => &mut self.strings,
FacetKind::Number => &mut self.f64s,
_ => return,
};
let key = (fid, value);
if let Some(DelAdd::Deletion) = cache.get(&key) {
cache.remove(&key);
} else {
cache.insert(key, DelAdd::Addition);
match kind {
FacetKind::Number => {
let key = (fid, value);
if let Some(DelAdd::Deletion) = self.f64s.get(&key) {
self.f64s.remove(&key);
} else {
self.f64s.insert(key, DelAdd::Addition);
}
}
FacetKind::String => {
if let Ok(s) = std::str::from_utf8(&value) {
let normalized = crate::normalize_facet(s);
let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized));
self.strings.insert((fid, truncated), Some(value));
}
}
_ => (),
}
}
fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) {
let cache = match kind {
FacetKind::String => &mut self.strings,
FacetKind::Number => &mut self.f64s,
_ => return,
};
let key = (fid, value);
if let Some(DelAdd::Addition) = cache.get(&key) {
cache.remove(&key);
} else {
cache.insert(key, DelAdd::Deletion);
match kind {
FacetKind::Number => {
let key = (fid, value);
if let Some(DelAdd::Addition) = self.f64s.get(&key) {
self.f64s.remove(&key);
} else {
self.f64s.insert(key, DelAdd::Deletion);
}
}
FacetKind::String => {
if let Ok(s) = std::str::from_utf8(&value) {
let normalized = crate::normalize_facet(s);
let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized));
self.strings.insert((fid, truncated), None);
}
}
_ => (),
}
}
@@ -329,18 +347,14 @@ impl<'doc> DelAddFacetValue<'doc> {
doc_alloc: &Bump,
) -> crate::Result<()> {
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
for ((fid, value), deladd) in self.strings {
if let Ok(s) = std::str::from_utf8(&value) {
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.extend_from_slice(&docid.to_be_bytes());
let normalized = crate::normalize_facet(s);
let truncated = truncate_str(&normalized);
buffer.extend_from_slice(truncated.as_bytes());
match deladd {
DelAdd::Deletion => sender.delete_facet_string(&buffer)?,
DelAdd::Addition => sender.write_facet_string(&buffer, &value)?,
}
for ((fid, truncated), value) in self.strings {
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.extend_from_slice(&docid.to_be_bytes());
buffer.extend_from_slice(truncated.as_bytes());
match &value {
Some(value) => sender.write_facet_string(&buffer, value)?,
None => sender.delete_facet_string(&buffer)?,
}
}
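
For facet strings, the extractor's per-document cache is now keyed by the normalized, truncated facet value, and the map value records the pending operation: `Some(original_bytes)` for an addition, `None` for a deletion. A value re-added under a different casing therefore overwrites the pending entry for the same normalized key. A small standalone sketch of that map shape, with plain `std` types in place of the bump-allocated ones:

```rust
use std::collections::HashMap;

type FieldId = u16;

// Pending facet-string operations for one document, keyed by the normalized value:
// `Some(bytes)` means "write this original string", `None` means "delete it".
type PendingStrings = HashMap<(FieldId, String), Option<Vec<u8>>>;

fn main() {
    let mut pending: PendingStrings = HashMap::new();
    let fid: FieldId = 0;

    // The old document had "Bouvier Bernois"; the update removes it...
    pending.insert((fid, "bouvier bernois".into()), None);
    // ...and the new document re-adds it with a different casing: same normalized
    // key, so the addition overwrites the pending deletion instead of coexisting.
    pending.insert((fid, "bouvier bernois".into()), Some(b"bouvier bernois".to_vec()));

    assert_eq!(pending.len(), 1);
    assert!(pending[&(fid, "bouvier bernois".to_string())].is_some());
}
```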


@@ -153,10 +153,12 @@ pub fn write_from_bbqueue(
}
(key, None) => match database.delete(wtxn, key) {
Ok(false) => {
unreachable!(
"We tried to delete an unknown key from {database_name}: {:?}",
key.as_bstr()
)
tracing::error!(
database_name,
key_bytes = ?key,
formatted_key = ?key.as_bstr(),
"Attempt to delete an unknown key"
);
}
Ok(_) => (),
Err(error) => {