Normalize the facet values for search

2025-06-18 12:47:35 +02:00 · 2023-07-20 17:57:07 +02:00 · 2023-07-20 17:57:07 +02:00 · df528b41d8
commit df528b41d8
parent 3070a20580
12 changed files with 178 additions and 46 deletions
--- a/milli/src/heed_codec/beu16_str_codec.rs
+++ b/milli/src/heed_codec/beu16_str_codec.rs
@ -0,0 +1,27 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::str;
+
+/// A heed codec for items made of a big-endian `u16` followed by a UTF-8 string.
+pub struct BEU16StrCodec;
+
+impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
+    type DItem = (u16, &'a str);
+
+    /// Splits `bytes` into a two-byte big-endian `u16` prefix and the UTF-8 string
+    /// that follows it.
+    ///
+    /// Returns `None` when `bytes` is shorter than the two-byte prefix or when the
+    /// remaining bytes are not valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // `split_at` panics when the index is out of bounds, so reject short inputs first.
+        if bytes.len() < 2 {
+            return None;
+        }
+        let (n_bytes, str_bytes) = bytes.split_at(2);
+        let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?;
+        let s = str::from_utf8(str_bytes).ok()?;
+        Some((n, s))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
+    type EItem = (u16, &'a str);
+
+    /// Encodes the pair as the big-endian bytes of the `u16` immediately followed
+    /// by the UTF-8 bytes of the string. This never returns `None`.
+    fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
+        let prefix = n.to_be_bytes();
+        let encoded = [&prefix[..], s.as_bytes()].concat();
+        Some(Cow::Owned(encoded))
+    }
+}
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@ -1,3 +1,4 @@
+mod beu16_str_codec;
 mod beu32_str_codec;
 mod byte_slice_ref;
 pub mod facet;
@ -14,6 +15,7 @@ mod str_str_u8_codec;
 pub use byte_slice_ref::ByteSliceRefCodec;
 pub use str_ref::StrRefCodec;

+pub use self::beu16_str_codec::BEU16StrCodec;
 pub use self::beu32_str_codec::BEU32StrCodec;
 pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
 pub use self::fst_set_codec::FstSetCodec;
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
@ -21,7 +21,9 @@ use crate::heed_codec::facet::{
    FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
    FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
+use crate::heed_codec::{
+    BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
+};
 use crate::readable_slices::ReadableSlices;
 use crate::{
    default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -96,6 +98,7 @@ pub mod db_name {
    pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
    pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
    pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
+    pub const FACET_ID_NORMALIZED_STRING_STRINGS: &str = "facet-id-normalized-string-strings";
    pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
    pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
    pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@ -157,6 +160,8 @@ pub struct Index {
    pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
    /// Maps the facet field id and ranges of strings with the docids that corresponds to them.
    pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
+    /// Maps the facet field id of the normalized-for-search string facets with their original versions.
+    pub facet_id_normalized_string_strings: Database<BEU16StrCodec, SerdeJson<BTreeSet<String>>>,
    /// Maps the facet field id of the string facets with an FST containing all the facets values.
    pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,

@ -181,7 +186,7 @@ impl Index {
    ) -> Result<Index> {
        use db_name::*;

-        options.max_dbs(24);
+        options.max_dbs(25);
        unsafe { options.flag(Flags::MdbAlwaysFreePages) };

        let env = options.open(path)?;
@ -211,6 +216,8 @@ impl Index {
        let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
        let facet_id_string_docids =
            env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
+        let facet_id_normalized_string_strings =
+            env.create_database(&mut wtxn, Some(FACET_ID_NORMALIZED_STRING_STRINGS))?;
        let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
        let facet_id_exists_docids =
            env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
@ -246,6 +253,7 @@ impl Index {
            field_id_word_count_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
+            facet_id_normalized_string_strings,
            facet_id_string_fst,
            facet_id_exists_docids,
            facet_id_is_null_docids,
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -51,9 +51,10 @@ pub use self::error::{
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{
-    BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
-    CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
-    RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec,
+    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
+    RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
+    UncheckedU8StrStrCodec,
 };
 pub use self::index::Index;
 pub use self::search::{
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -339,9 +339,25 @@ impl<'a> SearchForFacetValues<'a> {

                        let mut stream = fst.search(automaton).into_stream();
                        let mut length = 0;
-                        while let Some(facet_value) = stream.next() {
+                        'outer: while let Some(facet_value) = stream.next() {
                            let value = std::str::from_utf8(facet_value)?;
-                            let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
+                            let database = index.facet_id_normalized_string_strings;
+                            let key = (fid, value);
+                            let original_strings = match database.get(rtxn, &key)? {
+                                Some(original_strings) => original_strings,
+                                None => {
+                                    error!(
+                                        "the facet value is missing from the facet database: {key:?}"
+                                    );
+                                    continue;
+                                }
+                            };
+                            for original_string in original_strings {
+                                let key = FacetGroupKey {
+                                    field_id: fid,
+                                    level: 0,
+                                    left_bound: original_string.as_str(),
+                                };
                                let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
                                    Some(FacetGroupValue { bitmap, .. }) => bitmap,
                                    None => {
@ -354,13 +370,18 @@ impl<'a> SearchForFacetValues<'a> {
                                let count = search_candidates.intersection_len(&docids);
                                if count != 0 {
                                    let value = self
-                                    .one_original_value_of(fid, value, docids.min().unwrap())?
+                                        .one_original_value_of(
+                                            fid,
+                                            &original_string,
+                                            docids.min().unwrap(),
+                                        )?
                                        .unwrap_or_else(|| query.to_string());
                                    results.push(FacetValueHit { value, count });
                                    length += 1;
                                }
                                if length >= MAX_NUMBER_OF_FACETS {
-                                break;
+                                    break 'outer;
+                                }
                            }
                        }
                    }
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
            script_language_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
+            facet_id_normalized_string_strings,
            facet_id_string_fst,
            facet_id_exists_docids,
            facet_id_is_null_docids,
@ -92,6 +93,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        word_prefix_fid_docids.clear(self.wtxn)?;
        script_language_docids.clear(self.wtxn)?;
        facet_id_f64_docids.clear(self.wtxn)?;
+        facet_id_normalized_string_strings.clear(self.wtxn)?;
        facet_id_string_fst.clear(self.wtxn)?;
        facet_id_exists_docids.clear(self.wtxn)?;
        facet_id_is_null_docids.clear(self.wtxn)?;
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@ -237,6 +237,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            word_prefix_fid_docids,
            facet_id_f64_docids: _,
            facet_id_string_docids: _,
+            facet_id_normalized_string_strings: _,
            facet_id_string_fst: _,
            field_id_docid_facet_f64s: _,
            field_id_docid_facet_strings: _,
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@ -76,9 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
 pub const FACET_GROUP_SIZE: u8 = 4;
 pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

+use std::collections::BTreeSet;
 use std::fs::File;
+use std::iter::FromIterator;

-use heed::types::DecodeIgnore;
+use charabia::normalizer::{Normalize, NormalizerOption};
+use grenad::{CompressionType, SortAlgorithm};
+use heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
+use heed::BytesEncode;
 use log::debug;
 use time::OffsetDateTime;

@ -87,7 +92,9 @@ use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::ByteSliceRefCodec;
-use crate::{Index, Result, BEU16};
+use crate::update::index_documents::create_sorter;
+use crate::update::merge_btreeset_string;
+use crate::{BEU16StrCodec, Index, Result, BEU16};

 pub mod bulk;
 pub mod delete;
@ -159,13 +166,57 @@ impl<'i> FacetsUpdate<'i> {
            incremental_update.execute(wtxn)?;
        }

+        // We clear the list of normalized-for-search facets
+        // and the previous FSTs to compute everything from scratch
+        self.index.facet_id_normalized_string_strings.clear(wtxn)?;
+        self.index.facet_id_string_fst.clear(wtxn)?;
+
+        // As we can't use the same write transaction to read and write in two different databases
+        // we must create a temporary sorter that we will write into LMDB afterward.
+        // As multiple unnormalized facet values can become the same normalized facet value
+        // we must merge them together.
+        let mut sorter = create_sorter(
+            SortAlgorithm::Unstable,
+            merge_btreeset_string,
+            CompressionType::None,
+            None,
+            None,
+            None,
+        );
+
+        // We iterate on the list of original, semi-normalized, facet values
+        // and normalize them for search, inserting them in the sorter in any given order.
+        let options = NormalizerOption { lossy: true, ..Default::default() };
+        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        for result in database.iter(wtxn)? {
+            let (facet_group_key, ()) = result?;
+            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
+                let normalized_facet = left_bound.normalize(&options);
+                let set = BTreeSet::from_iter(std::iter::once(left_bound));
+                let key = (field_id, normalized_facet.as_ref());
+                let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
+                let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?;
+                sorter.insert(key, val)?;
+            }
+        }
+
+        // In this loop we don't need to take care of merging the sets of strings
+        // as the grenad sorter already merged them for us.
+        let mut merger_iter = sorter.into_stream_merger_iter()?;
+        while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
+            self.index
+                .facet_id_normalized_string_strings
+                .remap_types::<ByteSlice, ByteSlice>()
+                .put(wtxn, key_bytes, btreeset_bytes)?;
+        }
+
        // We compute one FST by string facet
        let mut text_fsts = vec![];
        let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
-        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        let database =
+            self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
        for result in database.iter(wtxn)? {
-            let (facet_group_key, _) = result?;
-            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
+            let ((field_id, normalized_facet), _) = result?;
            current_fst = match current_fst.take() {
                Some((fid, fst_builder)) if fid != field_id => {
                    let fst = fst_builder.into_set();
@ -177,8 +228,7 @@ impl<'i> FacetsUpdate<'i> {
            };

            if let Some((_, fst_builder)) = current_fst.as_mut() {
-                    fst_builder.insert(left_bound)?;
-                }
+                fst_builder.insert(normalized_facet)?;
            }
        }

@ -187,9 +237,6 @@ impl<'i> FacetsUpdate<'i> {
            text_fsts.push((field_id, fst));
        }

-        // We remove all of the previous FSTs that were in this database
-        self.index.facet_id_string_fst.clear(wtxn)?;
-
        // We write those FSTs in LMDB now
        for (field_id, fst) in text_fsts {
            self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::BTreeSet;
 use std::io;
 use std::result::Result as StdResult;

@ -44,6 +45,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
    }
 }

+/// Merges several JSON-serialized `BTreeSet<String>` values into a single JSON-serialized set.
+///
+/// A single value is returned as is, without being deserialized.
+///
+/// # Panics
+///
+/// Panics if `values` is empty or if one of the values to merge is not a valid
+/// JSON-encoded set of strings.
+pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+    if let [single] = values {
+        return Ok(single.clone());
+    }
+
+    // TODO improve the perf by using a `#[borrow] Cow<str>`.
+    let mut sets = values
+        .iter()
+        .map(|bytes| serde_json::from_slice::<BTreeSet<String>>(&bytes[..]).unwrap());
+    let mut merged = sets.next().unwrap();
+    for set in sets {
+        merged.extend(set);
+    }
+    Ok(Cow::Owned(serde_json::to_vec(&merged).unwrap()))
+}
+
 pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    Ok(values[0].clone())
 }
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@ -13,9 +13,9 @@ pub use grenad_helpers::{
    GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
-    concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
-    merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
-    MergeFn,
+    concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
+    merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
+    serialize_roaring_bitmap, MergeFn,
 };

 use crate::MAX_WORD_LENGTH;
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -26,7 +26,7 @@ pub use self::enrich::{
 };
 pub use self::helpers::{
    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
    sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@ -4,8 +4,9 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
 pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
 pub use self::index_documents::{
-    merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
-    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
+    merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
+    MergeFn,
 };
 pub use self::indexer_config::IndexerConfig;
 pub use self::prefix_word_pairs::{