Merge #3942

3942: Normalize for the search the facets values r=ManyTheFish a=Kerollmops This PR improves and fixes the search for facet values feature. Searching for _bre_ wasn't returning facet values like _brévent_ or _brô_. The issue was related to the fact that facets are normalized but not in the same way as the `searchableAttributes` are. We decided to normalize them further and add another intermediate database where the key is the normalized facet value, and the value is a set of the non-normalized facets. We then use these non-normalized ones to get the correct counts by fetching the associated databases. ### What's missing in this PR? - [x] Apply the change to the whole set of `SearchForFacetValue::execute` conditions. - [x] Factorize the code that does an intermediate normalized value fetch in a function. - [x] Add or modify the search for facet value test. Co-authored-by: Clément Renault <clement@meilisearch.com> Co-authored-by: Kerollmops <clement@meilisearch.com>
2025-07-02 03:18:30 +02:00 · 2023-07-25 14:37:17 +00:00 · 2023-07-25 14:37:17 +00:00 · be72be7c0d
commit be72be7c0d
parent 9e3e69373e 59201a7852
13 changed files with 279 additions and 94 deletions
--- a/meilisearch/tests/search/facet_search.rs
+++ b/meilisearch/tests/search/facet_search.rs
@ -1,3 +1,4 @@
+use meili_snap::snapshot;
 use once_cell::sync::Lazy;
 use serde_json::{json, Value};

@ -56,6 +57,54 @@ async fn simple_facet_search() {
    assert_eq!(response["facetHits"].as_array().unwrap().len(), 1);
 }

+/// Checks facet search when typo tolerance is disabled for the whole index.
+#[actix_rt::test]
+async fn advanced_facet_search() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.update_settings_filterable_attributes(json!(["genres"])).await;
+    index.update_settings_typo_tolerance(json!({ "enabled": false })).await;
+    index.add_documents(DOCUMENTS.clone(), None).await;
+    index.wait_task(2).await;
+
+    // A misspelled query is expected to return no facet hit.
+    let misspelled = json!({"facetName": "genres", "facetQuery": "adventre"});
+    let (misspelled_response, misspelled_code) = index.facet_search(misspelled).await;
+
+    snapshot!(misspelled_code, @"200 OK");
+    snapshot!(misspelled_response["facetHits"].as_array().unwrap().len(), @"0");
+
+    // An accented spelling of the query is expected to return exactly one facet hit.
+    let accented = json!({"facetName": "genres", "facetQuery": "àdventure"});
+    let (accented_response, accented_code) = index.facet_search(accented).await;
+
+    snapshot!(accented_code, @"200 OK");
+    snapshot!(accented_response["facetHits"].as_array().unwrap().len(), @"1");
+}
+
+/// Checks facet search when typo tolerance is disabled on one specific word.
+#[actix_rt::test]
+async fn more_advanced_facet_search() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.update_settings_filterable_attributes(json!(["genres"])).await;
+    index.update_settings_typo_tolerance(json!({ "disableOnWords": ["adventre"] })).await;
+    index.add_documents(DOCUMENTS.clone(), None).await;
+    index.wait_task(2).await;
+
+    // The word listed in `disableOnWords` is expected to return no facet hit.
+    let misspelled = json!({"facetName": "genres", "facetQuery": "adventre"});
+    let (misspelled_response, misspelled_code) = index.facet_search(misspelled).await;
+
+    snapshot!(misspelled_code, @"200 OK");
+    snapshot!(misspelled_response["facetHits"].as_array().unwrap().len(), @"0");
+
+    // The correctly spelled query is expected to return exactly one facet hit.
+    let exact = json!({"facetName": "genres", "facetQuery": "adventure"});
+    let (exact_response, exact_code) = index.facet_search(exact).await;
+
+    snapshot!(exact_code, @"200 OK");
+    snapshot!(exact_response["facetHits"].as_array().unwrap().len(), @"1");
+}
+
 #[actix_rt::test]
 async fn non_filterable_facet_search_error() {
    let server = Server::new().await;
--- a/milli/src/heed_codec/beu16_str_codec.rs
+++ b/milli/src/heed_codec/beu16_str_codec.rs
@ -0,0 +1,27 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::str;
+
+/// A heed codec for values made of a big-endian `u16` followed by a UTF-8 string.
+pub struct BEU16StrCodec;
+
+impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
+    type DItem = (u16, &'a str);
+
+    /// Decodes a big-endian `u16` prefix followed by a UTF-8 string.
+    ///
+    /// Returns `None` when `bytes` is shorter than the two-byte prefix
+    /// or when the bytes following the prefix are not valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // `split_at` panics when the slice is shorter than the split index,
+        // so reject truncated input before splitting.
+        if bytes.len() < 2 {
+            return None;
+        }
+        let (n_bytes, str_bytes) = bytes.split_at(2);
+        let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?;
+        let s = str::from_utf8(str_bytes).ok()?;
+        Some((n, s))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
+    type EItem = (u16, &'a str);
+
+    /// Encodes the `u16` as two big-endian bytes followed by the string bytes.
+    fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
+        let prefix = n.to_be_bytes();
+        let mut encoded = Vec::with_capacity(s.len() + 2);
+        encoded.extend_from_slice(&prefix);
+        encoded.extend_from_slice(s.as_bytes());
+        Some(Cow::Owned(encoded))
+    }
+}
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@ -1,3 +1,4 @@
+mod beu16_str_codec;
 mod beu32_str_codec;
 mod byte_slice_ref;
 pub mod facet;
@ -14,6 +15,7 @@ mod str_str_u8_codec;
 pub use byte_slice_ref::ByteSliceRefCodec;
 pub use str_ref::StrRefCodec;

+pub use self::beu16_str_codec::BEU16StrCodec;
 pub use self::beu32_str_codec::BEU32StrCodec;
 pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
 pub use self::fst_set_codec::FstSetCodec;
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
@ -20,7 +20,9 @@ use crate::heed_codec::facet::{
    FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
    FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
+use crate::heed_codec::{
+    BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
+};
 use crate::readable_slices::ReadableSlices;
 use crate::{
    default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -95,6 +97,7 @@ pub mod db_name {
    pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
    pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
    pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
+    pub const FACET_ID_NORMALIZED_STRING_STRINGS: &str = "facet-id-normalized-string-strings";
    pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
    pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
    pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@ -156,6 +159,8 @@ pub struct Index {
    pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
    /// Maps the facet field id and ranges of strings with the docids that corresponds to them.
    pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
+    /// Maps the facet field id and a normalized-for-search facet string
+    /// with the set of original facet strings that normalize to it.
+    pub facet_id_normalized_string_strings: Database<BEU16StrCodec, SerdeJson<BTreeSet<String>>>,
    /// Maps the facet field id of the string facets with an FST containing all the facets values.
    pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,

@ -180,7 +185,7 @@ impl Index {
    ) -> Result<Index> {
        use db_name::*;

-        options.max_dbs(24);
+        options.max_dbs(25);
        unsafe { options.flag(Flags::MdbAlwaysFreePages) };

        let env = options.open(path)?;
@ -210,6 +215,8 @@ impl Index {
        let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
        let facet_id_string_docids =
            env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
+        let facet_id_normalized_string_strings =
+            env.create_database(&mut wtxn, Some(FACET_ID_NORMALIZED_STRING_STRINGS))?;
        let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
        let facet_id_exists_docids =
            env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
@ -245,6 +252,7 @@ impl Index {
            field_id_word_count_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
+            facet_id_normalized_string_strings,
            facet_id_string_fst,
            facet_id_exists_docids,
            facet_id_is_null_docids,
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -51,9 +51,10 @@ pub use self::error::{
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{
-    BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
-    CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
-    RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec,
+    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
+    RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
+    UncheckedU8StrStrCodec,
 };
 pub use self::index::Index;
 pub use self::search::{
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -1,5 +1,8 @@
 use std::fmt;
+use std::ops::ControlFlow;

+use charabia::normalizer::NormalizerOption;
+use charabia::Normalize;
 use fst::automaton::{Automaton, Str};
 use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
@ -14,8 +17,8 @@ use crate::error::UserError;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
 use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::{
-    execute_search, normalize_facet, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index,
-    Result, SearchContext, BEU16,
+    execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result,
+    SearchContext, BEU16,
 };

 // Building these factories is not free.
@ -301,29 +304,28 @@ impl<'a> SearchForFacetValues<'a> {

        match self.query.as_ref() {
            Some(query) => {
-                let query = normalize_facet(query);
-                let query = query.as_str();
+                let options = NormalizerOption { lossy: true, ..Default::default() };
+                let query = query.normalize(&options);
+                let query = query.as_ref();
+
                let authorize_typos = self.search_query.index.authorize_typos(rtxn)?;
                let field_authorizes_typos =
                    !self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid);

                if authorize_typos && field_authorizes_typos {
-                    let mut results = vec![];
-
                    let exact_words_fst = self.search_query.index.exact_words(rtxn)?;
                    if exact_words_fst.map_or(false, |fst| fst.contains(query)) {
-                        let key = FacetGroupKey { field_id: fid, level: 0, left_bound: query };
-                        if let Some(FacetGroupValue { bitmap, .. }) =
-                            index.facet_id_string_docids.get(rtxn, &key)?
-                        {
-                            let count = search_candidates.intersection_len(&bitmap);
-                            if count != 0 {
-                                let value = self
-                                    .one_original_value_of(fid, query, bitmap.min().unwrap())?
-                                    .unwrap_or_else(|| query.to_string());
-                                results.push(FacetValueHit { value, count });
-                            }
+                        let mut results = vec![];
+                        if fst.contains(query) {
+                            self.fetch_original_facets_using_normalized(
+                                fid,
+                                query,
+                                query,
+                                &search_candidates,
+                                &mut results,
+                            )?;
                        }
+                        Ok(results)
                    } else {
                        let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?;
                        let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?;
@ -338,60 +340,41 @@ impl<'a> SearchForFacetValues<'a> {
                        };

                        let mut stream = fst.search(automaton).into_stream();
-                        let mut length = 0;
+                        let mut results = vec![];
                        while let Some(facet_value) = stream.next() {
                            let value = std::str::from_utf8(facet_value)?;
-                            let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
-                            let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
-                                Some(FacetGroupValue { bitmap, .. }) => bitmap,
-                                None => {
-                                    error!(
-                                        "the facet value is missing from the facet database: {key:?}"
-                                    );
-                                    continue;
-                                }
-                            };
-                            let count = search_candidates.intersection_len(&docids);
-                            if count != 0 {
-                                let value = self
-                                    .one_original_value_of(fid, value, docids.min().unwrap())?
-                                    .unwrap_or_else(|| query.to_string());
-                                results.push(FacetValueHit { value, count });
-                                length += 1;
-                            }
-                            if length >= MAX_NUMBER_OF_FACETS {
+                            if self
+                                .fetch_original_facets_using_normalized(
+                                    fid,
+                                    value,
+                                    query,
+                                    &search_candidates,
+                                    &mut results,
+                                )?
+                                .is_break()
+                            {
                                break;
                            }
                        }
-                    }

-                    Ok(results)
+                        Ok(results)
+                    }
                } else {
                    let automaton = Str::new(query).starts_with();
                    let mut stream = fst.search(automaton).into_stream();
                    let mut results = vec![];
-                    let mut length = 0;
                    while let Some(facet_value) = stream.next() {
                        let value = std::str::from_utf8(facet_value)?;
-                        let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
-                        let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
-                            Some(FacetGroupValue { bitmap, .. }) => bitmap,
-                            None => {
-                                error!(
-                                    "the facet value is missing from the facet database: {key:?}"
-                                );
-                                continue;
-                            }
-                        };
-                        let count = search_candidates.intersection_len(&docids);
-                        if count != 0 {
-                            let value = self
-                                .one_original_value_of(fid, value, docids.min().unwrap())?
-                                .unwrap_or_else(|| query.to_string());
-                            results.push(FacetValueHit { value, count });
-                            length += 1;
-                        }
-                        if length >= MAX_NUMBER_OF_FACETS {
+                        if self
+                            .fetch_original_facets_using_normalized(
+                                fid,
+                                value,
+                                query,
+                                &search_candidates,
+                                &mut results,
+                            )?
+                            .is_break()
+                        {
                            break;
                        }
                    }
@ -401,7 +384,6 @@ impl<'a> SearchForFacetValues<'a> {
            }
            None => {
                let mut results = vec![];
-                let mut length = 0;
                let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" };
                for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
                    let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
@ -412,9 +394,8 @@ impl<'a> SearchForFacetValues<'a> {
                            .one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
                            .unwrap_or_else(|| left_bound.to_string());
                        results.push(FacetValueHit { value, count });
-                        length += 1;
                    }
-                    if length >= MAX_NUMBER_OF_FACETS {
+                    if results.len() >= MAX_NUMBER_OF_FACETS {
                        break;
                    }
                }
@ -422,6 +403,50 @@ impl<'a> SearchForFacetValues<'a> {
            }
        }
    }
+
+    /// Pushes into `results` one hit per original facet value that is stored
+    /// under the normalized facet `value` for the field `fid`.
+    ///
+    /// The original strings are read from `facet_id_normalized_string_strings`;
+    /// each of them is then looked up in `facet_id_string_docids` and its docids
+    /// are intersected with `search_candidates`. Only values with a non-zero
+    /// count are reported.
+    ///
+    /// Returns `ControlFlow::Break(())` as soon as `results` contains at least
+    /// `MAX_NUMBER_OF_FACETS` hits, and `ControlFlow::Continue(())` otherwise.
+    fn fetch_original_facets_using_normalized(
+        &self,
+        fid: FieldId,
+        value: &str,
+        query: &str,
+        search_candidates: &RoaringBitmap,
+        results: &mut Vec<FacetValueHit>,
+    ) -> Result<ControlFlow<()>> {
+        let index = self.search_query.index;
+        let rtxn = self.search_query.rtxn;
+
+        let database = index.facet_id_normalized_string_strings;
+        let key = (fid, value);
+        let original_strings = match database.get(rtxn, &key)? {
+            Some(original_strings) => original_strings,
+            None => {
+                error!("the facet value is missing from the facet database: {key:?}");
+                return Ok(ControlFlow::Continue(()));
+            }
+        };
+        for original in original_strings {
+            let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() };
+            let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
+                Some(FacetGroupValue { bitmap, .. }) => bitmap,
+                None => {
+                    error!("the facet value is missing from the facet database: {key:?}");
+                    // Only skip this entry: the other original strings that share
+                    // the same normalized form must still be considered.
+                    continue;
+                }
+            };
+            let count = search_candidates.intersection_len(&docids);
+            if count != 0 {
+                // `docids` is not empty here as its intersection with the candidates is not.
+                // NOTE(review): the fallback is `query`, not `original` — confirm this is intended.
+                let value = self
+                    .one_original_value_of(fid, &original, docids.min().unwrap())?
+                    .unwrap_or_else(|| query.to_string());
+                results.push(FacetValueHit { value, count });
+            }
+            if results.len() >= MAX_NUMBER_OF_FACETS {
+                return Ok(ControlFlow::Break(()));
+            }
+        }
+
+        Ok(ControlFlow::Continue(()))
+    }
 }

 #[derive(Debug, Clone, serde::Serialize, PartialEq)]
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
            script_language_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
+            facet_id_normalized_string_strings,
            facet_id_string_fst,
            facet_id_exists_docids,
            facet_id_is_null_docids,
@ -92,6 +93,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        word_prefix_fid_docids.clear(self.wtxn)?;
        script_language_docids.clear(self.wtxn)?;
        facet_id_f64_docids.clear(self.wtxn)?;
+        facet_id_normalized_string_strings.clear(self.wtxn)?;
        facet_id_string_fst.clear(self.wtxn)?;
        facet_id_exists_docids.clear(self.wtxn)?;
        facet_id_is_null_docids.clear(self.wtxn)?;
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@ -236,6 +236,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            word_prefix_fid_docids,
            facet_id_f64_docids: _,
            facet_id_string_docids: _,
+            facet_id_normalized_string_strings: _,
            facet_id_string_fst: _,
            field_id_docid_facet_f64s: _,
            field_id_docid_facet_strings: _,
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@ -76,9 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
 pub const FACET_GROUP_SIZE: u8 = 4;
 pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

+use std::collections::BTreeSet;
 use std::fs::File;
+use std::iter::FromIterator;

-use heed::types::DecodeIgnore;
+use charabia::normalizer::{Normalize, NormalizerOption};
+use grenad::{CompressionType, SortAlgorithm};
+use heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
+use heed::BytesEncode;
 use log::debug;
 use time::OffsetDateTime;

@ -87,7 +92,9 @@ use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::ByteSliceRefCodec;
-use crate::{Index, Result, BEU16};
+use crate::update::index_documents::create_sorter;
+use crate::update::merge_btreeset_string;
+use crate::{BEU16StrCodec, Index, Result, BEU16};

 pub mod bulk;
 pub mod delete;
@ -159,26 +166,69 @@ impl<'i> FacetsUpdate<'i> {
            incremental_update.execute(wtxn)?;
        }

+        // We clear the list of normalized-for-search facets
+        // and the previous FSTs to compute everything from scratch
+        self.index.facet_id_normalized_string_strings.clear(wtxn)?;
+        self.index.facet_id_string_fst.clear(wtxn)?;
+
+        // As we can't use the same write transaction to read and write in two different databases
+        // we must create a temporary sorter that we will write into LMDB afterward.
+        // As multiple unnormalized facet values can become the same normalized facet value
+        // we must merge them together.
+        let mut sorter = create_sorter(
+            SortAlgorithm::Unstable,
+            merge_btreeset_string,
+            CompressionType::None,
+            None,
+            None,
+            None,
+        );
+
+        // We iterate on the list of original, semi-normalized, facet values
+        // and normalize them for search, inserting them in the sorter in any given order.
+        let options = NormalizerOption { lossy: true, ..Default::default() };
+        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        for result in database.iter(wtxn)? {
+            let (facet_group_key, ()) = result?;
+            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
+                let normalized_facet = left_bound.normalize(&options);
+                let set = BTreeSet::from_iter(std::iter::once(left_bound));
+                let key = (field_id, normalized_facet.as_ref());
+                let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
+                let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?;
+                sorter.insert(key, val)?;
+            }
+        }
+
+        // In this loop we don't need to take care of merging the sets of original strings
+        // as the grenad sorter already merged them for us.
+        let mut merger_iter = sorter.into_stream_merger_iter()?;
+        while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
+            self.index
+                .facet_id_normalized_string_strings
+                .remap_types::<ByteSlice, ByteSlice>()
+                .put(wtxn, key_bytes, btreeset_bytes)?;
+        }
+
        // We compute one FST by string facet
        let mut text_fsts = vec![];
        let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
-        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        let database =
+            self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
        for result in database.iter(wtxn)? {
-            let (facet_group_key, _) = result?;
-            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
-                current_fst = match current_fst.take() {
-                    Some((fid, fst_builder)) if fid != field_id => {
-                        let fst = fst_builder.into_set();
-                        text_fsts.push((fid, fst));
-                        Some((field_id, fst::SetBuilder::memory()))
-                    }
-                    Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
-                    None => Some((field_id, fst::SetBuilder::memory())),
-                };
-
-                if let Some((_, fst_builder)) = current_fst.as_mut() {
-                    fst_builder.insert(left_bound)?;
+            let ((field_id, normalized_facet), _) = result?;
+            current_fst = match current_fst.take() {
+                Some((fid, fst_builder)) if fid != field_id => {
+                    let fst = fst_builder.into_set();
+                    text_fsts.push((fid, fst));
+                    Some((field_id, fst::SetBuilder::memory()))
                }
+                Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
+                None => Some((field_id, fst::SetBuilder::memory())),
+            };
+
+            if let Some((_, fst_builder)) = current_fst.as_mut() {
+                fst_builder.insert(normalized_facet)?;
            }
        }

@ -187,9 +237,6 @@ impl<'i> FacetsUpdate<'i> {
            text_fsts.push((field_id, fst));
        }

-        // We remove all of the previous FSTs that were in this database
-        self.index.facet_id_string_fst.clear(wtxn)?;
-
        // We write those FSTs in LMDB now
        for (field_id, fst) in text_fsts {
            self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::BTreeSet;
 use std::io;
 use std::result::Result as StdResult;

@ -44,6 +45,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
    }
 }

+/// Merges JSON-encoded `BTreeSet<String>` values into a single JSON-encoded set.
+///
+/// A lone value is returned as is, without being decoded.
+///
+/// # Panics
+///
+/// Panics if `values` is empty or if a value cannot be decoded as a JSON set of strings.
+pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+    if let [single] = values {
+        return Ok(single.clone());
+    }
+
+    // TODO improve the perf by using a `#[borrow] Cow<str>`.
+    let mut decoded_sets = values
+        .iter()
+        .map(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes.as_ref()).unwrap());
+    let mut merged = decoded_sets.next().unwrap();
+    for set in decoded_sets {
+        merged.extend(set);
+    }
+
+    Ok(Cow::Owned(serde_json::to_vec(&merged).unwrap()))
+}
+
 pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    Ok(values[0].clone())
 }
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@ -13,9 +13,9 @@ pub use grenad_helpers::{
    GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
-    concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
-    merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
-    MergeFn,
+    concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
+    merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
+    serialize_roaring_bitmap, MergeFn,
 };

 use crate::MAX_WORD_LENGTH;
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -26,7 +26,7 @@ pub use self::enrich::{
 };
 pub use self::helpers::{
    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
    sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@ -4,8 +4,9 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
 pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
 pub use self::index_documents::{
-    merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
-    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
+    merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
+    MergeFn,
 };
 pub use self::indexer_config::IndexerConfig;
 pub use self::prefix_word_pairs::{