Merge branch 'main' into settings-customizing-tokenization

2025-07-04 20:37:15 +02:00 · 2023-08-08 16:08:16 +02:00 · 2023-08-08 16:08:16 +02:00 · 4a21fecf67
commit 4a21fecf67
parent ae8e69c030 3dda93d50f
166 changed files with 2252 additions and 1072 deletions
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@ -36,6 +36,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
            script_language_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
+            facet_id_normalized_string_strings,
            facet_id_string_fst,
            facet_id_exists_docids,
            facet_id_is_null_docids,
@ -94,6 +95,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        word_prefix_fid_docids.clear(self.wtxn)?;
        script_language_docids.clear(self.wtxn)?;
        facet_id_f64_docids.clear(self.wtxn)?;
+        facet_id_normalized_string_strings.clear(self.wtxn)?;
        facet_id_string_fst.clear(self.wtxn)?;
        facet_id_exists_docids.clear(self.wtxn)?;
        facet_id_is_null_docids.clear(self.wtxn)?;
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@ -4,10 +4,9 @@ use std::collections::{BTreeSet, HashMap, HashSet};
 use fst::IntoStreamer;
 use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
 use heed::{BytesDecode, BytesEncode, Database, RwIter};
-use hnsw::Searcher;
+use instant_distance::PointId;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
-use space::KnnPoints;
 use time::OffsetDateTime;

 use super::facet::delete::FacetsDelete;
@ -239,6 +238,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            word_prefix_fid_docids,
            facet_id_f64_docids: _,
            facet_id_string_docids: _,
+            facet_id_normalized_string_strings: _,
            facet_id_string_fst: _,
            field_id_docid_facet_f64s: _,
            field_id_docid_facet_strings: _,
@ -438,24 +438,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {

        // An ugly and slow way to remove the vectors from the HNSW
        // It basically reconstructs the HNSW from scratch without editing the current one.
-        let current_hnsw = self.index.vector_hnsw(self.wtxn)?.unwrap_or_default();
-        if !current_hnsw.is_empty() {
-            let mut new_hnsw = Hnsw::default();
-            let mut searcher = Searcher::new();
-            let mut new_vector_id_docids = Vec::new();
-
+        if let Some(current_hnsw) = self.index.vector_hnsw(self.wtxn)? {
+            let mut points = Vec::new();
+            let mut docids = Vec::new();
            for result in vector_id_docid.iter(self.wtxn)? {
                let (vector_id, docid) = result?;
                if !self.to_delete_docids.contains(docid.get()) {
-                    let vector = current_hnsw.get_point(vector_id.get() as usize).clone();
-                    let vector_id = new_hnsw.insert(vector, &mut searcher);
-                    new_vector_id_docids.push((vector_id as u32, docid));
+                    let pid = PointId::from(vector_id.get());
+                    let vector = current_hnsw[pid].clone();
+                    points.push(vector);
+                    docids.push(docid);
                }
            }

+            let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
+
            vector_id_docid.clear(self.wtxn)?;
-            for (vector_id, docid) in new_vector_id_docids {
-                vector_id_docid.put(self.wtxn, &BEU32::new(vector_id), &docid)?;
+            for (pid, docid) in pids.into_iter().zip(docids) {
+                vector_id_docid.put(self.wtxn, &BEU32::new(pid.into_inner()), &docid)?;
            }
            self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?;
        }
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@ -76,9 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
 pub const FACET_GROUP_SIZE: u8 = 4;
 pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

+use std::collections::BTreeSet;
 use std::fs::File;
+use std::iter::FromIterator;

-use heed::types::DecodeIgnore;
+use charabia::normalizer::{Normalize, NormalizerOption};
+use grenad::{CompressionType, SortAlgorithm};
+use heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
+use heed::BytesEncode;
 use log::debug;
 use time::OffsetDateTime;

@ -87,7 +92,9 @@ use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::ByteSliceRefCodec;
-use crate::{Index, Result, BEU16};
+use crate::update::index_documents::create_sorter;
+use crate::update::merge_btreeset_string;
+use crate::{BEU16StrCodec, Index, Result, BEU16};

 pub mod bulk;
 pub mod delete;
@ -159,26 +166,69 @@ impl<'i> FacetsUpdate<'i> {
            incremental_update.execute(wtxn)?;
        }

+        // We clear the list of normalized-for-search facets
+        // and the previous FSTs to compute everything from scratch
+        self.index.facet_id_normalized_string_strings.clear(wtxn)?;
+        self.index.facet_id_string_fst.clear(wtxn)?;
+
+        // As we can't use the same write transaction to read and write in two different databases
+        // we must create a temporary sorter that we will write into LMDB afterward.
+        // As multiple unnormalized facet values can become the same normalized facet value
+        // we must merge them together.
+        let mut sorter = create_sorter(
+            SortAlgorithm::Unstable,
+            merge_btreeset_string,
+            CompressionType::None,
+            None,
+            None,
+            None,
+        );
+
+        // We iterate on the list of original, semi-normalized, facet values
+        // and normalize them for search, inserting them in the sorter in any given order.
+        let options = NormalizerOption { lossy: true, ..Default::default() };
+        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        for result in database.iter(wtxn)? {
+            let (facet_group_key, ()) = result?;
+            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
+                let normalized_facet = left_bound.normalize(&options);
+                let set = BTreeSet::from_iter(std::iter::once(left_bound));
+                let key = (field_id, normalized_facet.as_ref());
+                let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
+                let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?;
+                sorter.insert(key, val)?;
+            }
+        }
+
+        // In this loop we don't need to take care of merging the sets of facet strings
+        // as the grenad sorter already merged them for us.
+        let mut merger_iter = sorter.into_stream_merger_iter()?;
+        while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
+            self.index
+                .facet_id_normalized_string_strings
+                .remap_types::<ByteSlice, ByteSlice>()
+                .put(wtxn, key_bytes, btreeset_bytes)?;
+        }
+
        // We compute one FST by string facet
        let mut text_fsts = vec![];
        let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
-        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        let database =
+            self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
        for result in database.iter(wtxn)? {
-            let (facet_group_key, _) = result?;
-            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
-                current_fst = match current_fst.take() {
-                    Some((fid, fst_builder)) if fid != field_id => {
-                        let fst = fst_builder.into_set();
-                        text_fsts.push((fid, fst));
-                        Some((field_id, fst::SetBuilder::memory()))
-                    }
-                    Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
-                    None => Some((field_id, fst::SetBuilder::memory())),
-                };
-
-                if let Some((_, fst_builder)) = current_fst.as_mut() {
-                    fst_builder.insert(left_bound)?;
+            let ((field_id, normalized_facet), _) = result?;
+            current_fst = match current_fst.take() {
+                Some((fid, fst_builder)) if fid != field_id => {
+                    let fst = fst_builder.into_set();
+                    text_fsts.push((fid, fst));
+                    Some((field_id, fst::SetBuilder::memory()))
                }
+                Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
+                None => Some((field_id, fst::SetBuilder::memory())),
+            };
+
+            if let Some((_, fst_builder)) = current_fst.as_mut() {
+                fst_builder.insert(normalized_facet)?;
            }
        }

@ -187,9 +237,6 @@ impl<'i> FacetsUpdate<'i> {
            text_fsts.push((field_id, fst));
        }

-        // We remove all of the previous FSTs that were in this database
-        self.index.facet_id_string_fst.clear(wtxn)?;
-
        // We write those FSTs in LMDB now
        for (field_id, fst) in text_fsts {
            self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::BTreeSet;
 use std::io;
 use std::result::Result as StdResult;

@ -44,6 +45,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
    }
 }

+/// Merges JSON-encoded `BTreeSet<String>` values that share a key into one JSON-encoded set.
+///
+/// A lone value is handed back as-is, without being decoded. Panics when `values` is empty
+/// or when one of several values is not a valid JSON array of strings.
+pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+    if let [single] = values {
+        return Ok(single.clone());
+    }
+
+    // TODO improve the perf by using a `#[borrow] Cow<str>`.
+    let mut decoded_sets = values
+        .iter()
+        .map(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap());
+    // `values` must not be empty: there is nothing to merge otherwise.
+    let mut union: BTreeSet<String> = decoded_sets.next().unwrap();
+    for set in decoded_sets {
+        union.extend(set);
+    }
+    Ok(Cow::Owned(serde_json::to_vec(&union).unwrap()))
+}
+
 pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    Ok(values[0].clone())
 }
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@ -13,9 +13,9 @@ pub use grenad_helpers::{
    GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
-    concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
-    merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
-    MergeFn,
+    concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
+    merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
+    serialize_roaring_bitmap, MergeFn,
 };

 use crate::MAX_WORD_LENGTH;
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -26,7 +26,7 @@ pub use self::enrich::{
 };
 pub use self::helpers::{
    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
    sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@ -9,22 +9,19 @@ use charabia::{Language, Script};
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::RwTxn;
-use hnsw::Searcher;
 use roaring::RoaringBitmap;
-use space::KnnPoints;

 use super::helpers::{
    self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
 };
 use super::{ClonableMmap, MergeFn};
+use crate::distance::NDotProductPoint;
 use crate::error::UserError;
 use crate::facet::FacetType;
+use crate::index::Hnsw;
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
-use crate::{
-    lat_lng_to_xyz, normalize_vector, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result,
-    BEU32,
-};
+use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};

 pub(crate) enum TypedChunk {
    FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@ -292,17 +289,20 @@ pub(crate) fn write_typed_chunk_into_index(
            index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
        }
        TypedChunk::VectorPoints(vector_points) => {
-            let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
-            let mut searcher = Searcher::new();
-
-            let mut expected_dimensions = match index.vector_id_docid.iter(wtxn)?.next() {
-                Some(result) => {
-                    let (vector_id, _) = result?;
-                    Some(hnsw.get_point(vector_id.get() as usize).len())
-                }
-                None => None,
+            let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? {
+                Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(),
+                None => Default::default(),
            };

+            // Convert the PointIds into DocumentIds
+            let mut docids = Vec::new();
+            for pid in pids {
+                let docid =
+                    index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap();
+                docids.push(docid.get());
+            }
+
+            let mut expected_dimensions = points.get(0).map(|p| p.len());
            let mut cursor = vector_points.into_cursor()?;
            while let Some((key, value)) = cursor.move_on_next()? {
                // convert the key back to a u32 (4 bytes)
@ -318,12 +318,26 @@ pub(crate) fn write_typed_chunk_into_index(
                    return Err(UserError::InvalidVectorDimensions { expected, found })?;
                }

-                let vector = normalize_vector(vector);
-                let vector_id = hnsw.insert(vector, &mut searcher) as u32;
-                index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
+                points.push(NDotProductPoint::new(vector));
+                docids.push(docid);
            }
-            log::debug!("There are {} entries in the HNSW so far", hnsw.len());
-            index.put_vector_hnsw(wtxn, &hnsw)?;
+
+            assert_eq!(docids.len(), points.len());
+
+            let hnsw_length = points.len();
+            let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
+
+            index.vector_id_docid.clear(wtxn)?;
+            for (docid, pid) in docids.into_iter().zip(pids) {
+                index.vector_id_docid.put(
+                    wtxn,
+                    &BEU32::new(pid.into_inner()),
+                    &BEU32::new(docid),
+                )?;
+            }
+
+            log::debug!("There are {} entries in the HNSW so far", hnsw_length);
+            index.put_vector_hnsw(wtxn, &new_hnsw)?;
        }
        TypedChunk::ScriptLanguageDocids(hash_pair) => {
            let mut buffer = Vec::new();
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@ -4,8 +4,9 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
 pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
 pub use self::index_documents::{
-    merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
-    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
+    merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
+    MergeFn,
 };
 pub use self::indexer_config::IndexerConfig;
 pub use self::prefix_word_pairs::{
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@ -466,13 +466,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                let current = self.index.stop_words(self.wtxn)?;

                // Apply an unlossy normalization on stop_words
-                let stop_words = stop_words
+                let stop_words: BTreeSet<String> = stop_words
                    .iter()
-                    .map(|w| w.as_str().normalize(&Default::default()).into_owned());
+                    .map(|w| w.as_str().normalize(&Default::default()).into_owned())
+                    .collect();

                // since we can't compare a BTreeSet with an FST we are going to convert the
                // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
-                let fst = fst::Set::from_iter(stop_words)?;
+                let fst = fst::Set::from_iter(stop_words.into_iter())?;

                // Does the new FST differ from the previous one?
                if current