mirror of https://github.com/meilisearch/MeiliSearch (synced 2025-07-04 04:17:10 +02:00)
WIP arroy integration
This commit is contained in:
parent 13c2c6c16b
commit dde3a04679
10 changed files with 280 additions and 326 deletions
@@ -22,7 +22,6 @@ use crate::heed_codec::{
     BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
 };
 use crate::proximity::ProximityPrecision;
-use crate::readable_slices::ReadableSlices;
 use crate::vector::EmbeddingConfig;
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@@ -49,10 +48,6 @@ pub mod main_key {
     pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
     pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
     pub const GEO_RTREE_KEY: &str = "geo-rtree";
-    /// The prefix of the key that is used to store the, potential big, HNSW structure.
-    /// It is concatenated with a big-endian encoded number (non-human readable).
-    /// e.g. vector-hnsw0x0032.
-    pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
     pub const PRIMARY_KEY_KEY: &str = "primary-key";
     pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
     pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
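Editor's note: the removed doc comment's `vector-hnsw0x0032` example refers to the deleted chunked-key scheme, where the serialized HNSW was split across several LMDB entries keyed by the prefix plus a big-endian chunk counter. A minimal sketch of that key layout (illustration only, not part of the diff):

```rust
// "vector-hnsw" followed by a big-endian u32 chunk index. Big-endian bytes
// sort numerically under LMDB's lexicographic key order, so a prefix
// iterator visits the chunks in the order they were written.
fn hnsw_chunk_key(chunk_index: u32) -> Vec<u8> {
    let mut key = b"vector-hnsw".to_vec();
    key.extend_from_slice(&chunk_index.to_be_bytes());
    key
}
```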
@@ -75,6 +70,7 @@ pub mod main_key {
     pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by";
     pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
     pub const PROXIMITY_PRECISION: &str = "proximity-precision";
+    pub const VECTOR_UNAVAILABLE_VECTOR_IDS: &str = "vector-unavailable-vector-ids";
     pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
 }
@@ -102,6 +98,9 @@ pub mod db_name {
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
     pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
+    pub const VECTOR_DOCID_IDS: &str = "vector-docid-ids";
+    pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
+    pub const VECTOR_ARROY: &str = "vector-arroy";
     pub const DOCUMENTS: &str = "documents";
     pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
 }
@@ -168,8 +167,16 @@ pub struct Index {
     /// Maps the document id, the facet field id and the strings.
     pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,

-    /// Maps a vector id to the document id that have it.
+    /// Maps a vector id to its document id.
     pub vector_id_docid: Database<BEU32, BEU32>,
+    /// Maps a doc id to its vector ids.
+    pub docid_vector_ids: Database<BEU32, CboRoaringBitmapCodec>,
+
+    /// Maps an embedder name to its id in the arroy store.
+    pub embedder_category_id: Database<Str, BEU16>,
+
+    /// Vector store based on arroy™.
+    pub vector_arroy: arroy::Database<arroy::distances::DotProduct>,

     /// Maps the document id to the document as an obkv store.
     pub(crate) documents: Database<BEU32, ObkvCodec>,
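Editor's note: the new fields replace the single serialized HNSW with cooperating stores: arroy holds the vectors, `vector_id_docid` maps an arroy item id back to its document, `docid_vector_ids` is the reverse map (a bitmap of item ids per document), and `embedder_category_id` reserves an arroy index per embedder. A hypothetical lookup sketch showing how they compose, assuming `arroy::Reader::item_vector` mirrors the `Writer::item_vector` call used later in this diff:

```rust
// Hypothetical helper, not in the commit: docid -> bitmap of arroy item ids
// -> one stored vector per item id.
fn document_vectors(index: &Index, rtxn: &heed::RoTxn, docid: u32) -> crate::Result<Vec<Vec<f32>>> {
    let item_ids = index.docid_vector_ids.get(rtxn, &docid)?.unwrap_or_default();
    // Index 0 is hardcoded throughout this WIP commit; see the FIXMEs below.
    let reader = arroy::Reader::open(rtxn, 0, index.vector_arroy)?;
    let mut vectors = Vec::new();
    for item_id in item_ids {
        if let Some(vector) = reader.item_vector(rtxn, item_id)? {
            vectors.push(vector);
        }
    }
    Ok(vectors)
}
```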
@ -184,7 +191,7 @@ impl Index {
|
|||
) -> Result<Index> {
|
||||
use db_name::*;
|
||||
|
||||
options.max_dbs(24);
|
||||
options.max_dbs(27);
|
||||
|
||||
let env = options.open(path)?;
|
||||
let mut wtxn = env.write_txn()?;
|
||||
|
@ -224,7 +231,13 @@ impl Index {
|
|||
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||
let field_id_docid_facet_strings =
|
||||
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
||||
// vector stuff
|
||||
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
|
||||
let docid_vector_ids = env.create_database(&mut wtxn, Some(VECTOR_DOCID_IDS))?;
|
||||
let embedder_category_id =
|
||||
env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
|
||||
let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;
|
||||
|
||||
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
|
||||
wtxn.commit()?;
|
||||
|
||||
|
@ -255,6 +268,9 @@ impl Index {
|
|||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
vector_id_docid,
|
||||
vector_arroy,
|
||||
docid_vector_ids,
|
||||
embedder_category_id,
|
||||
documents,
|
||||
})
|
||||
}
|
||||
|
@ -477,63 +493,6 @@ impl Index {
|
|||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/* vector HNSW */
|
||||
|
||||
/// Writes the provided `hnsw`.
|
||||
pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
|
||||
// We must delete all the chunks before we write the new HNSW chunks.
|
||||
self.delete_vector_hnsw(wtxn)?;
|
||||
|
||||
let chunk_size = 1024 * 1024 * (1024 + 512); // 1.5 GiB
|
||||
let bytes = bincode::serialize(hnsw).map_err(Into::into).map_err(heed::Error::Encoding)?;
|
||||
for (i, chunk) in bytes.chunks(chunk_size).enumerate() {
|
||||
let i = i as u32;
|
||||
let mut key = main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes().to_vec();
|
||||
key.extend_from_slice(&i.to_be_bytes());
|
||||
self.main.remap_types::<Bytes, Bytes>().put(wtxn, &key, chunk)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete the `hnsw`.
|
||||
pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
|
||||
let mut iter = self
|
||||
.main
|
||||
.remap_types::<Bytes, DecodeIgnore>()
|
||||
.prefix_iter_mut(wtxn, main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes())?;
|
||||
let mut deleted = false;
|
||||
while iter.next().transpose()?.is_some() {
|
||||
// We do not keep a reference to the key or the value.
|
||||
unsafe { deleted |= iter.del_current()? };
|
||||
}
|
||||
Ok(deleted)
|
||||
}
|
||||
|
||||
/// Returns the `hnsw`.
|
||||
pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
|
||||
let mut slices = Vec::new();
|
||||
for result in self
|
||||
.main
|
||||
.remap_types::<Str, Bytes>()
|
||||
.prefix_iter(rtxn, main_key::VECTOR_HNSW_KEY_PREFIX)?
|
||||
{
|
||||
let (_, slice) = result?;
|
||||
slices.push(slice);
|
||||
}
|
||||
|
||||
if slices.is_empty() {
|
||||
Ok(None)
|
||||
} else {
|
||||
let readable_slices: ReadableSlices<_> = slices.into_iter().collect();
|
||||
Ok(Some(
|
||||
bincode::deserialize_from(readable_slices)
|
||||
.map_err(Into::into)
|
||||
.map_err(heed::Error::Decoding)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/* field distribution */
|
||||
|
||||
/// Writes the field distribution which associates every field name with
|
||||
|
@@ -1557,6 +1516,30 @@ impl Index {
             .get(rtxn, main_key::EMBEDDING_CONFIGS)?
             .unwrap_or_default())
     }

+    pub(crate) fn put_unavailable_vector_ids(
+        &self,
+        wtxn: &mut RwTxn<'_>,
+        unavailable_vector_ids: RoaringBitmap,
+    ) -> heed::Result<()> {
+        self.main.remap_types::<Str, CboRoaringBitmapCodec>().put(
+            wtxn,
+            main_key::VECTOR_UNAVAILABLE_VECTOR_IDS,
+            &unavailable_vector_ids,
+        )
+    }
+
+    pub(crate) fn delete_unavailable_vector_ids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(wtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)
+    }
+
+    pub fn unavailable_vector_ids(&self, rtxn: &RoTxn<'_>) -> Result<RoaringBitmap> {
+        Ok(self
+            .main
+            .remap_types::<Str, CboRoaringBitmapCodec>()
+            .get(rtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)?
+            .unwrap_or_default())
+    }
 }

 #[cfg(test)]
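Editor's note: these accessors persist a single roaring bitmap of arroy item ids that are already taken; the naming is inverted, so ids present in the bitmap are unavailable for allocation. The write path below hands out free ids by iterating the bitmap's complement via `AvailableDocumentsIds::from_documents_ids`. A simplified model of that allocation (not the actual implementation):

```rust
use roaring::RoaringBitmap;

// Simplified model: the next free id is the smallest u32 absent from the
// "unavailable" bitmap. AvailableDocumentsIds exposes this as an iterator.
fn next_free_id(unavailable: &RoaringBitmap) -> Option<u32> {
    (0..=u32::MAX).find(|id| !unavailable.contains(*id))
}
```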
@@ -19,7 +19,6 @@ pub mod heed_codec;
 pub mod index;
 pub mod prompt;
 pub mod proximity;
-mod readable_slices;
 pub mod score_details;
 mod search;
 pub mod update;
@@ -1,85 +0,0 @@
-use std::io::{self, Read};
-use std::iter::FromIterator;
-
-pub struct ReadableSlices<A> {
-    inner: Vec<A>,
-    pos: u64,
-}
-
-impl<A> FromIterator<A> for ReadableSlices<A> {
-    fn from_iter<T: IntoIterator<Item = A>>(iter: T) -> Self {
-        ReadableSlices { inner: iter.into_iter().collect(), pos: 0 }
-    }
-}
-
-impl<A: AsRef<[u8]>> Read for ReadableSlices<A> {
-    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
-        let original_buf_len = buf.len();
-
-        // We explore the list of slices to find the one where we must start reading.
-        let mut pos = self.pos;
-        let index = match self
-            .inner
-            .iter()
-            .map(|s| s.as_ref().len() as u64)
-            .position(|size| pos.checked_sub(size).map(|p| pos = p).is_none())
-        {
-            Some(index) => index,
-            None => return Ok(0),
-        };
-
-        let mut inner_pos = pos as usize;
-        for slice in &self.inner[index..] {
-            let slice = &slice.as_ref()[inner_pos..];
-
-            if buf.len() > slice.len() {
-                // We must exhaust the current slice and go to the next one there is not enough here.
-                buf[..slice.len()].copy_from_slice(slice);
-                buf = &mut buf[slice.len()..];
-                inner_pos = 0;
-            } else {
-                // There is enough in this slice to fill the remaining bytes of the buffer.
-                // Let's break just after filling it.
-                buf.copy_from_slice(&slice[..buf.len()]);
-                buf = &mut [];
-                break;
-            }
-        }
-
-        let written = original_buf_len - buf.len();
-        self.pos += written as u64;
-        Ok(written)
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::io::Read;
-
-    use super::ReadableSlices;
-
-    #[test]
-    fn basic() {
-        let data: Vec<_> = (0..100).collect();
-        let splits: Vec<_> = data.chunks(3).collect();
-        let mut rdslices: ReadableSlices<_> = splits.into_iter().collect();
-
-        let mut output = Vec::new();
-        let length = rdslices.read_to_end(&mut output).unwrap();
-        assert_eq!(length, data.len());
-        assert_eq!(output, data);
-    }
-
-    #[test]
-    fn small_reads() {
-        let data: Vec<_> = (0..u8::MAX).collect();
-        let splits: Vec<_> = data.chunks(27).collect();
-        let mut rdslices: ReadableSlices<_> = splits.into_iter().collect();
-
-        let buffer = &mut [0; 45];
-        let length = rdslices.read(buffer).unwrap();
-        let expected: Vec<_> = (0..buffer.len() as u8).collect();
-        assert_eq!(length, buffer.len());
-        assert_eq!(buffer, &expected[..]);
-    }
-}
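Editor's note: `ReadableSlices` existed only to expose the HNSW chunks fetched from LMDB as one contiguous `Read` for `bincode::deserialize_from`; with the HNSW gone, nothing needs it. For two slices, std's `Read::chain` achieves the same effect (illustration only):

```rust
use std::io::Read;

// Two byte slices read as one contiguous stream, roughly what
// ReadableSlices generalized to N slices.
fn chained() -> std::io::Result<Vec<u8>> {
    let (a, b): (&[u8], &[u8]) = (&[1, 2, 3], &[4, 5]);
    let mut out = Vec::new();
    a.chain(b).read_to_end(&mut out)?;
    Ok(out) // [1, 2, 3, 4, 5]
}
```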
@@ -11,64 +11,31 @@ use crate::index::Hnsw;
 use crate::score_details::{self, ScoreDetails};
 use crate::{Result, SearchContext, SearchLogger, UserError};

-pub struct VectorSort<Q: RankingRuleQueryTrait> {
+pub struct VectorSort<'ctx, Q: RankingRuleQueryTrait> {
     query: Option<Q>,
     target: Vec<f32>,
     vector_candidates: RoaringBitmap,
-    scope: nolife::DynBoxScope<SearchFamily>,
+    reader: arroy::Reader<'ctx, arroy::distances::DotProduct>,
+    limit: usize,
 }

-type Item<'a> = instant_distance::Item<'a, NDotProductPoint>;
-type SearchFut = Pin<Box<dyn Future<Output = nolife::Never>>>;
-
-struct SearchFamily;
-impl<'a> nolife::Family<'a> for SearchFamily {
-    type Family = Box<dyn Iterator<Item = Item<'a>> + 'a>;
-}
-
-async fn search_scope(
-    mut time_capsule: nolife::TimeCapsule<SearchFamily>,
-    hnsw: Hnsw,
-    target: Vec<f32>,
-) -> nolife::Never {
-    let mut search = instant_distance::Search::default();
-    let it = Box::new(hnsw.search(&NDotProductPoint::new(target), &mut search));
-    let mut it: Box<dyn Iterator<Item = Item>> = it;
-    loop {
-        time_capsule.freeze(&mut it).await;
-    }
-}
-
-impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
+impl<'ctx, Q: RankingRuleQueryTrait> VectorSort<'ctx, Q> {
     pub fn new(
-        ctx: &SearchContext,
+        ctx: &'ctx SearchContext,
         target: Vec<f32>,
         vector_candidates: RoaringBitmap,
+        limit: usize,
     ) -> Result<Self> {
-        let hnsw =
-            ctx.index.vector_hnsw(ctx.txn)?.unwrap_or(Hnsw::builder().build_hnsw(Vec::default()).0);
-
-        if let Some(expected_size) = hnsw.iter().map(|(_, point)| point.len()).next() {
-            if target.len() != expected_size {
-                return Err(UserError::InvalidVectorDimensions {
-                    expected: expected_size,
-                    found: target.len(),
-                }
-                .into());
-            }
-        }
+        /// FIXME? what to do in case of missing metadata
+        let reader = arroy::Reader::open(ctx.txn, 0, ctx.index.vector_arroy)?;

-        let target_clone = target.clone();
-        let producer = move |time_capsule| -> SearchFut {
-            Box::pin(search_scope(time_capsule, hnsw, target_clone))
-        };
-        let scope = DynBoxScope::new(producer);
-
-        Ok(Self { query: None, target, vector_candidates, scope })
+        Ok(Self { query: None, target, vector_candidates, reader, limit })
     }
 }

-impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<Q> {
+impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q> {
     fn id(&self) -> String {
         "vector_sort".to_owned()
     }
@@ -108,11 +75,11 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<Q> {
             }),
         }));
     }

-        let scope = &mut self.scope;
         let target = &self.target;
         let vector_candidates = &self.vector_candidates;

+        let result = self.reader.nns_by_vector(ctx.txn, &target, count, search_k, candidates)

-        scope.enter(|it| {
-            for item in it.by_ref() {
-                let item: Item = item;
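Editor's note: the ranking rule now asks arroy for neighbors directly instead of stepping a frozen HNSW iterator through a `nolife` scope. A condensed sketch of the new search path, built only from calls visible in this diff (the `Option` types for `search_k` and `candidates` are assumptions about the arroy API):

```rust
// Condensed sketch, not the full ranking rule: open a reader on index 0 and
// fetch the `count` nearest (item_id, distance) pairs to `target`.
fn nearest_vector_ids(
    rtxn: &heed::RoTxn,
    db: arroy::Database<arroy::distances::DotProduct>,
    target: &[f32],
    count: usize,
) -> arroy::Result<Vec<(u32, f32)>> {
    let reader = arroy::Reader::open(rtxn, 0, db)?;
    // None, None: default search_k and no candidate filter.
    reader.nns_by_vector(rtxn, target, count, None, None)
}
```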
@@ -43,6 +43,9 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
             vector_id_docid,
+            vector_arroy,
+            docid_vector_ids,
+            embedder_category_id: _,
             documents,
         } = self.index;
@@ -58,7 +61,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
         self.index.delete_geo_rtree(self.wtxn)?;
         self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
-        self.index.delete_vector_hnsw(self.wtxn)?;

         // Clear the other databases.
         external_documents_ids.clear(self.wtxn)?;
@@ -82,7 +84,11 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
         facet_id_string_docids.clear(self.wtxn)?;
         field_id_docid_facet_f64s.clear(self.wtxn)?;
         field_id_docid_facet_strings.clear(self.wtxn)?;
+        // vector
+        vector_arroy.clear(self.wtxn)?;
+        vector_id_docid.clear(self.wtxn)?;
+        docid_vector_ids.clear(self.wtxn)?;

         documents.clear(self.wtxn)?;

         Ok(number_of_documents)
@@ -312,7 +312,8 @@ fn send_original_documents_data(
         lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
             remove_vectors,
             embeddings,
-            expected_dimension,
+            /// FIXME: compute an expected dimension from the manual vectors if any
+            expected_dimension: expected_dimension.unwrap(),
             manual_vectors,
         }))
     }
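Editor's note: the new `.unwrap()` will panic whenever a batch carries only manually supplied vectors, because `expected_dimension` is `None` in that case; the FIXME above acknowledges it. One possible shape for the fix, with `first_manual_vector_dimension` as a hypothetical helper (not in the codebase):

```rust
// Hypothetical resolution of the FIXME: fall back to the dimension of the
// first manual vector when no embedder reported a dimension.
let expected_dimension = expected_dimension
    .or_else(|| first_manual_vector_dimension(&manual_vectors))
    .expect("either generated embeddings or manual vectors must exist");
```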
@@ -15,6 +15,7 @@ use crossbeam_channel::{Receiver, Sender};
 use heed::types::Str;
 use heed::Database;
 use log::debug;
+use rand::SeedableRng;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use slice_group_by::GroupBy;
@@ -489,6 +490,9 @@ where
             }
         }

+        let writer = arroy::Writer::prepare(self.wtxn, self.index.vector_arroy, 0, 0)?;
+        writer.build(self.wtxn, &mut rand::rngs::StdRng::from_entropy(), None)?;
+
         // We write the field distribution into the main database
         self.index.put_field_distribution(self.wtxn, &field_distribution)?;
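Editor's note: `Writer::prepare` is called here with 0 for both the arroy index and the dimension, and `build` reseeds a fresh `StdRng`, which is why `rand::SeedableRng` is imported above. Pulled together, the write cycle this commit uses looks like the following sketch (signatures as invoked in the diff; the hardcoded index 0 carries the FIXMEs noted elsewhere):

```rust
use rand::SeedableRng;

// The arroy write cycle as used across this commit: prepare a writer for an
// (index, dimension) pair, mutate items, then build the trees once.
fn rebuild_vector_store(
    wtxn: &mut heed::RwTxn,
    db: arroy::Database<arroy::distances::DotProduct>,
    dimension: usize,
    items: &[(u32, Vec<f32>)],
) -> arroy::Result<()> {
    let writer = arroy::Writer::prepare(wtxn, db, 0, dimension)?;
    for (item_id, vector) in items {
        writer.add_item(wtxn, *item_id, vector)?;
    }
    writer.build(wtxn, &mut rand::rngs::StdRng::from_entropy(), None)
}
```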
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
@@ -27,6 +27,7 @@ use crate::index::Hnsw;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
+use crate::update::{available_documents_ids, AvailableDocumentsIds};
 use crate::{lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError};

 pub(crate) enum TypedChunk {
@@ -50,7 +51,7 @@ pub(crate) enum TypedChunk {
     VectorPoints {
         remove_vectors: grenad::Reader<BufReader<File>>,
         embeddings: Option<grenad::Reader<BufReader<File>>>,
-        expected_dimension: Option<usize>,
+        expected_dimension: usize,
         manual_vectors: grenad::Reader<BufReader<File>>,
     },
     ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
@@ -106,7 +107,7 @@ impl TypedChunk {
             format!("GeoPoints {{ number_of_entries: {} }}", grenad.len())
         }
         TypedChunk::VectorPoints{ remove_vectors, manual_vectors, embeddings, expected_dimension } => {
-            format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension.unwrap_or_default())
+            format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension)
         }
         TypedChunk::ScriptLanguageDocids(sl_map) => {
             format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
@@ -373,46 +374,53 @@ pub(crate) fn write_typed_chunk_into_index(
                 return Ok((RoaringBitmap::new(), is_merged_database));
             }

-            let mut docid_vectors_map: HashMap<DocumentId, HashSet<Vec<OrderedFloat<f32>>>> =
-                HashMap::new();
-
-            // We extract and store the previous vectors
-            if let Some(hnsw) = index.vector_hnsw(wtxn)? {
-                for (pid, point) in hnsw.iter() {
-                    let pid_key = pid.into_inner();
-                    let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap();
-                    let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect();
-                    docid_vectors_map.entry(docid).or_default().insert(vector);
-                }
-            }
+            let mut unavailable_vector_ids = index.unavailable_vector_ids(&wtxn)?;
+            /// FIXME: allow customizing distance
+            /// FIXME: allow customizing index
+            let writer = arroy::Writer::prepare(wtxn, index.vector_arroy, 0, expected_dimension)?;

             // remove vectors for docids we want them removed
             let mut cursor = remove_vectors.into_cursor()?;
             while let Some((key, _)) = cursor.move_on_next()? {
                 let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();

-                docid_vectors_map.remove(&docid);
+                let Some(to_remove_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)? else {
+                    continue;
+                };
+                unavailable_vector_ids -= to_remove_vector_ids;
+
+                for item in to_remove_vector_ids {
+                    writer.del_item(wtxn, item)?;
+                }
             }

+            let mut available_vector_ids =
+                AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids);
             // add generated embeddings
-            if let Some((embeddings, expected_dimension)) = embeddings.zip(expected_dimension) {
+            if let Some(embeddings) = embeddings {
                 let mut cursor = embeddings.into_cursor()?;
                 while let Some((key, value)) = cursor.move_on_next()? {
                     let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
-                    let data: Vec<OrderedFloat<_>> =
-                        pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
+                    let data = pod_collect_to_vec(value);
+                    // it is a code error to have embeddings and not expected_dimension
+                    let embeddings =
+                        crate::vector::Embeddings::from_inner(data, expected_dimension)
+                            // code error if we somehow got the wrong dimension
+                            .unwrap();

-                    let mut set = HashSet::new();
+                    let mut new_vector_ids = RoaringBitmap::new();
                     for embedding in embeddings.iter() {
-                        set.insert(embedding.to_vec());
-                    }
+                        /// FIXME: error when you get over 9000
+                        let next_vector_id = available_vector_ids.next().unwrap();
+                        unavailable_vector_ids.insert(next_vector_id);

-                    docid_vectors_map.insert(docid, set);
+                        new_vector_ids.insert(next_vector_id);
+
+                        index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?;
+
+                        writer.add_item(wtxn, next_vector_id, embedding)?;
+                    }
+                    index.docid_vector_ids.put(wtxn, &docid, &new_vector_ids)?;
                 }
             }
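Editor's note: this chunk keeps three structures in lockstep: the arroy store itself, the `vector_id_docid` reverse map, and the per-document `docid_vector_ids` bitmaps, with `unavailable_vector_ids` recording every allocated item id. A sketch of the invariant it must preserve (editor's illustration, not in the commit):

```rust
// Every vector id owned by a document must be (1) marked unavailable and
// (2) mapped back to that document in vector_id_docid.
fn check_vector_invariants(index: &Index, rtxn: &heed::RoTxn) -> crate::Result<bool> {
    let unavailable = index.unavailable_vector_ids(rtxn)?;
    for entry in index.docid_vector_ids.iter(rtxn)? {
        let (docid, vector_ids) = entry?;
        for vector_id in vector_ids {
            if !unavailable.contains(vector_id)
                || index.vector_id_docid.get(rtxn, &vector_id)? != Some(docid)
            {
                return Ok(false);
            }
        }
    }
    Ok(true)
}
```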
@@ -425,68 +433,44 @@ pub(crate) fn write_typed_chunk_into_index(

                 let vector_deladd_obkv = KvReaderDelAdd::new(value);
                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
                     // convert the vector back to a Vec<f32>
-                    let vector: Vec<OrderedFloat<_>> =
-                        pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
-                    docid_vectors_map.entry(docid).and_modify(|v| {
-                        if !v.remove(&vector) {
-                            error!("Unable to delete the vector: {:?}", vector);
+                    let vector = pod_collect_to_vec(value);
+                    let Some(mut docid_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)?
+                    else {
+                        error!("Unable to delete the vector: {:?}", vector);
+                        continue;
+                    };
+                    for item in docid_vector_ids {
+                        /// FIXME: comparing the vectors by equality is inefficient, and dangerous by perfect equality
+                        let candidate = writer.item_vector(&wtxn, item)?.expect("Inconsistent dbs");
+                        if candidate == vector {
+                            writer.del_item(wtxn, item)?;
+                            unavailable_vector_ids.remove(item);
+                            index.vector_id_docid.delete(wtxn, &item)?;
+                            docid_vector_ids.remove(item);
+                            break;
                         }
-                    });
-                }
-                if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
-                    // convert the vector back to a Vec<f32>
-                    let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
-                    docid_vectors_map.entry(docid).and_modify(|v| {
-                        v.insert(vector);
-                    });
-                }
-            }
-
-            // Extract the most common vector dimension
-            let expected_dimension_size = {
-                let mut dims = HashMap::new();
-                docid_vectors_map
-                    .values()
-                    .flat_map(|v| v.iter())
-                    .for_each(|v| *dims.entry(v.len()).or_insert(0) += 1);
-                dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len)
-            };
-
-            // Ensure that the vector lengths are correct and
-            // prepare the vectors before inserting them in the HNSW.
-            let mut points = Vec::new();
-            let mut docids = Vec::new();
-            for (docid, vector) in docid_vectors_map
-                .into_iter()
-                .flat_map(|(docid, vectors)| std::iter::repeat(docid).zip(vectors))
-            {
-                if expected_dimension_size.map_or(false, |expected| expected != vector.len()) {
-                    return Err(UserError::InvalidVectorDimensions {
-                        expected: expected_dimension_size.unwrap_or(vector.len()),
-                        found: vector.len(),
-                    }
-                    .into());
-                } else {
-                    let vector = vector.into_iter().map(OrderedFloat::into_inner).collect();
-                    points.push(NDotProductPoint::new(vector));
-                    docids.push(docid);
+                    index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?;
                 }
+                let mut available_vector_ids =
+                    AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids);
+
+                if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
+                    let vector = pod_collect_to_vec(value);
+                    let next_vector_id = available_vector_ids.next().unwrap();
+
+                    writer.add_item(wtxn, next_vector_id, &vector)?;
+                    unavailable_vector_ids.insert(next_vector_id);
+                    index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?;
+                    let mut docid_vector_ids =
+                        index.docid_vector_ids.get(&wtxn, &docid)?.unwrap_or_default();
+                    docid_vector_ids.insert(next_vector_id);
+                    index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?;
+                }
             }

-            let hnsw_length = points.len();
-            let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
-
-            assert_eq!(docids.len(), pids.len());
-
-            // Store the vectors in the point-docid relation database
-            index.vector_id_docid.clear(wtxn)?;
-            for (docid, pid) in docids.into_iter().zip(pids) {
-                index.vector_id_docid.put(wtxn, &pid.into_inner(), &docid)?;
-            }
-
-            log::debug!("There are {} entries in the HNSW so far", hnsw_length);
-            index.put_vector_hnsw(wtxn, &new_hnsw)?;
+            log::debug!("There are {} entries in the arroy so far", unavailable_vector_ids.len());
+            index.put_unavailable_vector_ids(wtxn, unavailable_vector_ids)?;
         }
         TypedChunk::ScriptLanguageDocids(sl_map) => {
             for (key, (deletion, addition)) in sl_map {
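Editor's note: the FIXME about deleting by equality is the fragile point of this path: the embedding written at add time must round-trip bit-for-bit through arroy for `candidate == vector` to ever match, so any normalization or requantization silently turns deletions into no-ops. A tolerant comparison would look something like this (editor's sketch, not in the commit):

```rust
// Tolerant alternative to the exact `candidate == vector` check above.
// The epsilon is application-specific and itself risks false positives.
fn approx_eq(a: &[f32], b: &[f32], eps: f32) -> bool {
    a.len() == b.len() && a.iter().zip(b).all(|(x, y)| (x - y).abs() <= eps)
}
```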