From 8fff5fc28105c6ca34c28b8fc9e7af31c75cb416 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Mar 2024 12:11:46 +0100
Subject: [PATCH 01/16] update tests

---
 milli/src/update/settings.rs | 64 ++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index beca4fe51..569938ccf 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1643,6 +1643,70 @@ mod tests {
             .unwrap()
             .count();
         assert_eq!(count, 4);
+
+        // Set the filterable fields to be the age and the name.
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(hashset! { S("age"),  S("name") });
+            })
+            .unwrap();
+
+        // Check that the filterable fields are correctly set.
+        let rtxn = index.read_txn().unwrap();
+        let fields_ids = index.filterable_fields(&rtxn).unwrap();
+        assert_eq!(fields_ids, hashset! { S("age"),  S("name") });
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 0 and level 0 facet values.
+        let count = index
+            .facet_id_f64_docids
+            .remap_key_type::<Bytes>()
+            .prefix_iter(&rtxn, &[0, 1, 0])
+            .unwrap()
+            .count();
+        assert_eq!(count, 4);
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 0 and level 0 facet values.
+        let count = index
+            .facet_id_string_docids
+            .remap_key_type::<Bytes>()
+            .prefix_iter(&rtxn, &[0, 1, 0])
+            .unwrap()
+            .count();
+        assert_eq!(count, 5);
+
+        // Remove the age from the filterable fields.
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(hashset! { S("name") });
+            })
+            .unwrap();
+
+        // Check that the filterable fields are correctly set.
+        let rtxn = index.read_txn().unwrap();
+        let fields_ids = index.filterable_fields(&rtxn).unwrap();
+        assert_eq!(fields_ids, hashset! { S("name") });
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 0 and level 0 facet values.
+        let count = index
+            .facet_id_f64_docids
+            .remap_key_type::<Bytes>()
+            .prefix_iter(&rtxn, &[0, 1, 0])
+            .unwrap()
+            .count();
+        assert_eq!(count, 0);
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 0 and level 0 facet values.
+        let count = index
+            .facet_id_string_docids
+            .remap_key_type::<Bytes>()
+            .prefix_iter(&rtxn, &[0, 1, 0])
+            .unwrap()
+            .count();
+        assert_eq!(count, 5);
     }
 
     #[test]

From 64079fc8946c1a86c0defe2f098eb84a5fc7d202 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Mar 2024 12:17:49 +0100
Subject: [PATCH 02/16] Do more iterations on the settings benchmarks

---
 workloads/settings-add-remove-filters.json         | 2 +-
 workloads/settings-proximity-precision.json        | 2 +-
 workloads/settings-remove-add-swap-searchable.json | 2 +-
 workloads/settings-typo.json                       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/workloads/settings-add-remove-filters.json b/workloads/settings-add-remove-filters.json
index 04a57c707..12493a8fc 100644
--- a/workloads/settings-add-remove-filters.json
+++ b/workloads/settings-add-remove-filters.json
@@ -1,6 +1,6 @@
 {
   "name": "settings-add-remove-filters.json",
-  "run_count": 2,
+  "run_count": 5,
   "extra_cli_args": [
     "--max-indexing-threads=4"
   ],
diff --git a/workloads/settings-proximity-precision.json b/workloads/settings-proximity-precision.json
index 48cfad49d..384f99e37 100644
--- a/workloads/settings-proximity-precision.json
+++ b/workloads/settings-proximity-precision.json
@@ -1,6 +1,6 @@
 {
   "name": "settings-proximity-precision.json",
-  "run_count": 2,
+  "run_count": 5,
   "extra_cli_args": [
     "--max-indexing-threads=4"
   ],
diff --git a/workloads/settings-remove-add-swap-searchable.json b/workloads/settings-remove-add-swap-searchable.json
index ba315680f..61db8822e 100644
--- a/workloads/settings-remove-add-swap-searchable.json
+++ b/workloads/settings-remove-add-swap-searchable.json
@@ -1,6 +1,6 @@
 {
   "name": "settings-remove-add-swap-searchable.json",
-  "run_count": 2,
+  "run_count": 5,
   "extra_cli_args": [
     "--max-indexing-threads=4"
   ],
diff --git a/workloads/settings-typo.json b/workloads/settings-typo.json
index a272e6d1f..45163bc98 100644
--- a/workloads/settings-typo.json
+++ b/workloads/settings-typo.json
@@ -1,6 +1,6 @@
 {
   "name": "settings-typo.json",
-  "run_count": 2,
+  "run_count": 5,
   "extra_cli_args": [
     "--max-indexing-threads=4"
   ],

From aabce52b1b6d7ddcbe8fdefeda84857dfc8b2d2d Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Mar 2024 14:20:46 +0100
Subject: [PATCH 03/16] Fix test

---
 milli/src/update/settings.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 569938ccf..be5a449b9 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1671,7 +1671,7 @@ mod tests {
         let count = index
             .facet_id_string_docids
             .remap_key_type::<Bytes>()
-            .prefix_iter(&rtxn, &[0, 1, 0])
+            .prefix_iter(&rtxn, &[0, 0])
             .unwrap()
             .count();
         assert_eq!(count, 5);
@@ -1703,7 +1703,7 @@ mod tests {
         let count = index
             .facet_id_string_docids
             .remap_key_type::<Bytes>()
-            .prefix_iter(&rtxn, &[0, 1, 0])
+            .prefix_iter(&rtxn, &[0, 0])
             .unwrap()
             .count();
         assert_eq!(count, 5);

From 893200ab87ac77aaeeebf0d585bb1fbc31adf327 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Mar 2024 14:33:32 +0100
Subject: [PATCH 04/16] Avoid clearing documents in transform

---
 milli/src/update/index_documents/transform.rs | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index e5392092f..09bf94ace 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -23,7 +23,7 @@ use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
 use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
 use crate::update::index_documents::GrenadParameters;
-use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
+use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result};
 
 pub struct TransformOutput {
@@ -875,7 +875,7 @@ impl<'a, 'i> Transform<'a, 'i> {
             document_sorter_value_buffer.clear();
             into_del_add_obkv(
                 KvReaderU16::new(buffer),
-                DelAddOperation::Addition,
+                DelAddOperation::DeletionAndAddition,
                 &mut document_sorter_value_buffer,
             )?;
             original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
@@ -916,7 +916,7 @@ impl<'a, 'i> Transform<'a, 'i> {
             document_sorter_value_buffer.clear();
             into_del_add_obkv(
                 KvReaderU16::new(&buffer),
-                DelAddOperation::Addition,
+                DelAddOperation::DeletionAndAddition,
                 &mut document_sorter_value_buffer,
             )?;
             flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
@@ -946,9 +946,6 @@ impl<'a, 'i> Transform<'a, 'i> {
         let new_facets = output.compute_real_facets(wtxn, self.index)?;
         self.index.put_faceted_fields(wtxn, &new_facets)?;
 
-        // We clear the full database (words-fst, documents ids and documents content).
-        ClearDocuments::new(wtxn, self.index).execute()?;
-
         Ok(output)
     }
 }

From a7e368aaa6fd73747d07d2fc441819bf7f52bd37 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 20 Mar 2024 13:37:26 +0100
Subject: [PATCH 05/16] Create InnerIndexSettingsDiffs struct and populate it

---
 milli/src/update/settings.rs | 167 ++++++++++++++++++++++++++++-------
 1 file changed, 133 insertions(+), 34 deletions(-)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index be5a449b9..5b1788242 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -20,7 +20,7 @@ use crate::update::index_documents::IndexDocumentsMethod;
 use crate::update::{IndexDocuments, UpdateIndexingStep};
 use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
 use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
-use crate::{FieldsIdsMap, Index, Result};
+use crate::{FieldId, FieldsIdsMap, Index, Result};
 
 #[derive(Debug, Clone, PartialEq, Eq, Copy)]
 pub enum Setting<T> {
@@ -1066,20 +1066,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
     {
         self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
 
-        // Note: this MUST be before `update_sortable` so that we can get the old value to compare with the updated value afterwards
-
-        let existing_fields: HashSet<_> = self
-            .index
-            .field_distribution(self.wtxn)?
-            .into_iter()
-            .filter_map(|(field, count)| (count != 0).then_some(field))
-            .collect();
-        let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
+        let old_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?;
         let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
 
+        // never trigger re-indexing
         self.update_displayed()?;
-        self.update_filterable()?;
-        self.update_sortable()?;
         self.update_distinct_field()?;
         self.update_criteria()?;
         self.update_primary_key()?;
@@ -1089,16 +1080,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         self.update_max_values_per_facet()?;
         self.update_sort_facet_values_by()?;
         self.update_pagination_max_total_hits()?;
+        self.update_search_cutoff()?;
 
-        let faceted_updated = self.update_faceted(existing_fields, old_faceted_fields)?;
-        let stop_words_updated = self.update_stop_words()?;
-        let non_separator_tokens_updated = self.update_non_separator_tokens()?;
-        let separator_tokens_updated = self.update_separator_tokens()?;
-        let dictionary_updated = self.update_dictionary()?;
-        let synonyms_updated = self.update_synonyms()?;
-        let searchable_updated = self.update_searchable()?;
-        let exact_attributes_updated = self.update_exact_attributes()?;
-        let proximity_precision = self.update_proximity_precision()?;
+        // could trigger re-indexing
+        self.update_filterable()?;
+        self.update_sortable()?;
+        self.update_stop_words()?;
+        self.update_non_separator_tokens()?;
+        self.update_separator_tokens()?;
+        self.update_dictionary()?;
+        self.update_synonyms()?;
+        self.update_searchable()?;
+        self.update_exact_attributes()?;
+        self.update_proximity_precision()?;
         // TODO: very rough approximation of the needs for reindexing where any change will result in
         // a full reindexing.
         // What can be done instead:
@@ -1107,20 +1101,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
         let embedding_configs_updated = self.update_embedding_configs()?;
 
-        // never trigger re-indexing
-        self.update_search_cutoff()?;
+        let new_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?;
+        let inner_settings_diff = InnerIndexSettingsDiff {
+            old: old_inner_settings,
+            new: new_inner_settings,
+            embedding_configs_updated,
+        };
 
-        if stop_words_updated
-            || non_separator_tokens_updated
-            || separator_tokens_updated
-            || dictionary_updated
-            || faceted_updated
-            || synonyms_updated
-            || searchable_updated
-            || exact_attributes_updated
-            || proximity_precision
-            || embedding_configs_updated
-        {
+        if inner_settings_diff.any_reindexing_needed() {
             self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?;
         }
 
@@ -1156,6 +1144,117 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
     }
 }
 
+pub(crate) struct InnerIndexSettingsDiff {
+    old: InnerIndexSettings,
+    new: InnerIndexSettings,
+
+    // TODO: compare directly the embedders.
+    embedding_configs_updated: bool,
+}
+
+impl InnerIndexSettingsDiff {
+    fn any_reindexing_needed(&self) -> bool {
+        self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
+    }
+
+    fn reindex_searchable(&self) -> bool {
+        self.old
+            .fields_ids_map
+            .iter()
+            .zip(self.new.fields_ids_map.iter())
+            .any(|(old, new)| old != new)
+            || self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
+                != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
+            || self.old.allowed_separators != self.new.allowed_separators
+            || self.old.dictionary != self.new.dictionary
+            || self.old.searchable_fields != self.new.searchable_fields
+            || self.old.exact_attributes != self.new.exact_attributes
+            || self.old.proximity_precision != self.new.proximity_precision
+    }
+
+    fn reindex_facets(&self) -> bool {
+        let existing_fields = self.new.existing_fields;
+        if existing_fields.iter().any(|field| field.contains('.')) {
+            return true;
+        }
+
+        let old_faceted_fields = self.old.user_defined_faceted_fields;
+        if old_faceted_fields.iter().any(|field| field.contains('.')) {
+            return true;
+        }
+
+        // If there is new faceted fields we indicate that we must reindex as we must
+        // index new fields as facets. It means that the distinct attribute,
+        // an Asc/Desc criterion or a filtered attribute as be added or removed.
+        let new_faceted_fields = self.new.user_defined_faceted_fields;
+        if new_faceted_fields.iter().any(|field| field.contains('.')) {
+            return true;
+        }
+
+        let faceted_updated =
+            (&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
+
+        self.old
+            .fields_ids_map
+            .iter()
+            .zip(self.new.fields_ids_map.iter())
+            .any(|(old, new)| old != new)
+            || faceted_updated
+    }
+
+    fn reindex_vectors(&self) -> bool {
+        self.embedding_configs_updated
+    }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct InnerIndexSettings {
+    stop_words: Option<fst::Set<Vec<u8>>>,
+    allowed_separators: Option<BTreeSet<String>>,
+    dictionary: Option<BTreeSet<String>>,
+    fields_ids_map: FieldsIdsMap,
+    faceted_fields: HashSet<FieldId>,
+    searchable_fields: Option<BTreeSet<FieldId>>,
+    exact_attributes: HashSet<FieldId>,
+    proximity_precision: ProximityPrecision,
+    embedding_configs: Vec<(String, crate::vector::EmbeddingConfig)>,
+    existing_fields: HashSet<String>,
+}
+
+impl InnerIndexSettings {
+    fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result<Self> {
+        let stop_words = index.stop_words(rtxn)?;
+        let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
+        let allowed_separators = index.allowed_separators(rtxn)?;
+        let dictionary = index.dictionary(rtxn)?;
+        let fields_ids_map = index.fields_ids_map(rtxn)?;
+        let searchable_fields = index.searchable_fields_ids(rtxn)?;
+        let searchable_fields = searchable_fields.map(|sf| sf.into_iter().collect());
+        let faceted_fields = index.faceted_fields_ids(rtxn)?;
+        let exact_attributes = index.exact_attributes_ids(rtxn)?;
+        let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
+        let embedding_configs = index.embedding_configs(rtxn)?;
+        let existing_fields: HashSet<_> = index
+            .field_distribution(rtxn)?
+            .into_iter()
+            .filter_map(|(field, count)| (count != 0).then_some(field))
+            .collect();
+
+        Ok(Self {
+            stop_words,
+            allowed_separators,
+            dictionary,
+            fields_ids_map,
+            faceted_fields,
+            searchable_fields,
+            exact_attributes,
+            proximity_precision,
+            embedding_configs,
+            existing_fields,
+        })
+    }
+}
+
 fn validate_prompt(
     name: &str,
     new: Setting<EmbeddingSettings>,

From b5e4a55af694e19f9748d16c3f1b5a7da878361f Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 26 Mar 2024 13:27:43 +0100
Subject: [PATCH 06/16] refactor faceted and searchable pipeline

---
 milli/src/index.rs                            |  21 ++-
 milli/src/update/del_add.rs                   |   4 +-
 .../extract/extract_docid_word_positions.rs   |  88 +++++----
 .../extract/extract_facet_number_docids.rs    |   2 +
 .../extract/extract_facet_string_docids.rs    |   2 +
 .../extract/extract_fid_docid_facet_values.rs |  14 +-
 .../extract/extract_fid_word_count_docids.rs  |   2 +
 .../extract/extract_word_docids.rs            | 156 +++++++++++----
 .../extract_word_pair_proximity_docids.rs     |   2 +
 .../extract/extract_word_position_docids.rs   |   2 +
 .../src/update/index_documents/extract/mod.rs |  78 +++-----
 milli/src/update/index_documents/mod.rs       |  45 +----
 milli/src/update/index_documents/transform.rs | 178 ++++++++----------
 milli/src/update/settings.rs                  | 165 ++++++++--------
 14 files changed, 420 insertions(+), 339 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index db31c953a..27b273393 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -678,6 +678,23 @@ impl Index {
             .get(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY)
     }
 
+    /// Identical to `user_defined_searchable_fields`, but returns ids instead.
+    pub fn user_defined_searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Option<Vec<FieldId>>> {
+        match self.user_defined_searchable_fields(rtxn)? {
+            Some(fields) => {
+                let fields_ids_map = self.fields_ids_map(rtxn)?;
+                let mut fields_ids = Vec::new();
+                for name in fields {
+                    if let Some(field_id) = fields_ids_map.id(name) {
+                        fields_ids.push(field_id);
+                    }
+                }
+                Ok(Some(fields_ids))
+            }
+            None => Ok(None),
+        }
+    }
+
     /* filterable fields */
 
     /// Writes the filterable fields names in the database.
@@ -824,11 +841,11 @@ impl Index {
 
     /// Identical to `user_defined_faceted_fields`, but returns ids instead.
     pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> {
-        let fields = self.faceted_fields(rtxn)?;
+        let fields = self.user_defined_faceted_fields(rtxn)?;
         let fields_ids_map = self.fields_ids_map(rtxn)?;
 
         let mut fields_ids = HashSet::new();
-        for name in fields.into_iter() {
+        for name in fields {
             if let Some(field_id) = fields_ids_map.id(&name) {
                 fields_ids.insert(field_id);
             }
diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
index 794beb5df..0288858ed 100644
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@@ -71,8 +71,8 @@ pub enum DelAddOperation {
 /// putting each deletion obkv's keys under an DelAdd::Deletion
 /// and putting each addition obkv's keys under an DelAdd::Addition
 pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
-    deletion: obkv::KvReader<K>,
-    addition: obkv::KvReader<K>,
+    deletion: &obkv::KvReader<K>,
+    addition: &obkv::KvReader<K>,
     buffer: &mut Vec<u8>,
 ) -> Result<(), std::io::Error> {
     use itertools::merge_join_by;
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index dc4886f00..b1a6bb5a6 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::BufReader;
@@ -12,6 +12,7 @@ use serde_json::Value;
 use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
 use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
+use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
 
 pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
@@ -25,10 +26,7 @@ pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, R
 pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    searchable_fields: &Option<HashSet<FieldId>>,
-    stop_words: Option<&fst::Set<Vec<u8>>>,
-    allowed_separators: Option<&[&str]>,
-    dictionary: Option<&[&str]>,
+    settings_diff: &InnerIndexSettingsDiff,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
     puffin::profile_function!();
@@ -56,8 +54,33 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let mut value_buffer = Vec::new();
 
     // initialize tokenizer.
-    let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
-    let tokenizer = builder.build();
+    // TODO: Fix ugly allocation
+    let old_stop_words = settings_diff.old.stop_words.as_ref();
+    let old_separators: Option<Vec<_>> =
+        settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+    let old_dictionary: Option<Vec<_>> =
+        settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect());
+    let mut del_builder = tokenizer_builder(
+        old_stop_words,
+        old_separators.as_deref(),
+        old_dictionary.as_deref(),
+        None,
+    );
+    let del_tokenizer = del_builder.build();
+
+    // TODO: Fix ugly allocation
+    let new_stop_words = settings_diff.new.stop_words.as_ref();
+    let new_separators: Option<Vec<_>> =
+        settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+    let new_dictionary: Option<Vec<_>> =
+        settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect());
+    let mut add_builder = tokenizer_builder(
+        new_stop_words,
+        new_separators.as_deref(),
+        new_dictionary.as_deref(),
+        None,
+    );
+    let add_tokenizer = add_builder.build();
 
     // iterate over documents.
     let mut cursor = obkv_documents.into_cursor()?;
@@ -69,7 +92,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         let obkv = KvReader::<FieldId>::new(value);
 
         // if the searchable fields didn't change, skip the searchable indexing for this document.
-        if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
+        if !searchable_fields_changed(
+            &KvReader::<FieldId>::new(value),
+            &settings_diff.new.searchable_fields_ids,
+        ) {
             continue;
         }
 
@@ -85,11 +111,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                 // deletions
                 lang_safe_tokens_from_document(
                     &obkv,
-                    searchable_fields,
-                    &tokenizer,
-                    stop_words,
-                    allowed_separators,
-                    dictionary,
+                    &settings_diff.old,
+                    &del_tokenizer,
                     max_positions_per_attributes,
                     DelAdd::Deletion,
                     &mut del_buffers,
@@ -99,11 +122,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                 // additions
                 lang_safe_tokens_from_document(
                     &obkv,
-                    searchable_fields,
-                    &tokenizer,
-                    stop_words,
-                    allowed_separators,
-                    dictionary,
+                    &settings_diff.new,
+                    &add_tokenizer,
                     max_positions_per_attributes,
                     DelAdd::Addition,
                     &mut add_buffers,
@@ -118,8 +138,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
         value_buffer.clear();
         del_add_from_two_obkvs(
-            KvReader::<FieldId>::new(del_obkv),
-            KvReader::<FieldId>::new(add_obkv),
+            &KvReader::<FieldId>::new(del_obkv),
+            &KvReader::<FieldId>::new(add_obkv),
             &mut value_buffer,
         )?;
 
@@ -160,7 +180,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 /// Check if any searchable fields of a document changed.
 fn searchable_fields_changed(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<HashSet<FieldId>>,
+    searchable_fields: &Option<Vec<FieldId>>,
 ) -> bool {
     for (field_id, field_bytes) in obkv.iter() {
         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
@@ -206,14 +226,10 @@ fn tokenizer_builder<'a>(
 
 /// Extract words mapped with their positions of a document,
 /// ensuring no Language detection mistakes was made.
-#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
 fn lang_safe_tokens_from_document<'a>(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<HashSet<FieldId>>,
+    settings: &InnerIndexSettings,
     tokenizer: &Tokenizer,
-    stop_words: Option<&fst::Set<Vec<u8>>>,
-    allowed_separators: Option<&[&str]>,
-    dictionary: Option<&[&str]>,
     max_positions_per_attributes: u32,
     del_add: DelAdd,
     buffers: &'a mut Buffers,
@@ -222,7 +238,7 @@ fn lang_safe_tokens_from_document<'a>(
 
     tokens_from_document(
         obkv,
-        searchable_fields,
+        &settings.searchable_fields_ids,
         tokenizer,
         max_positions_per_attributes,
         del_add,
@@ -246,12 +262,14 @@ fn lang_safe_tokens_from_document<'a>(
         // then we don't rerun the extraction.
         if !script_language.is_empty() {
             // build a new temporary tokenizer including the allow list.
-            let mut builder = tokenizer_builder(
-                stop_words,
-                allowed_separators,
-                dictionary,
-                Some(&script_language),
-            );
+            // TODO: Fix ugly allocation
+            let stop_words = settings.stop_words.as_ref();
+            let separators: Option<Vec<_>> =
+                settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+            let dictionary: Option<Vec<_>> =
+                settings.dictionary.map(|s| s.iter().map(String::as_str).collect());
+            let mut builder =
+                tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
             let tokenizer = builder.build();
 
             script_language_word_count.clear();
@@ -259,7 +277,7 @@ fn lang_safe_tokens_from_document<'a>(
             // rerun the extraction.
             tokens_from_document(
                 obkv,
-                searchable_fields,
+                &settings.searchable_fields_ids,
                 &tokenizer,
                 max_positions_per_attributes,
                 del_add,
@@ -276,7 +294,7 @@ fn lang_safe_tokens_from_document<'a>(
 /// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<HashSet<FieldId>>,
+    searchable_fields: &Option<Vec<FieldId>>,
     tokenizer: &Tokenizer,
     max_positions_per_attributes: u32,
     del_add: DelAdd,
diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
index 33def5abd..1848a085f 100644
--- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
@@ -10,6 +10,7 @@ use crate::heed_codec::facet::{
     FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
 };
 use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::Result;
 
 /// Extracts the facet number and the documents ids where this facet number appear.
@@ -20,6 +21,7 @@ use crate::Result;
 pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
     fid_docid_facet_number: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
 
diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index 8fdd11ee7..abffe17ab 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -15,6 +15,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::{
     merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
 };
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 
 /// Extracts the facet string and the documents ids where this facet string appear.
@@ -25,6 +26,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
     docid_fid_facet_string: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
     puffin::profile_function!();
 
diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 1f8af372d..030303cd9 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{BTreeMap, HashSet};
+use std::collections::BTreeMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
@@ -20,6 +20,7 @@ use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
 use crate::update::del_add::{DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{create_writer, writer_into_reader};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
 
 /// The length of the elements that are always in the buffer when inserting new values.
@@ -43,7 +44,7 @@ pub struct ExtractedFacetValues {
 pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    faceted_fields: &HashSet<FieldId>,
+    settings_diff: &InnerIndexSettingsDiff,
     geo_fields_ids: Option<(FieldId, FieldId)>,
 ) -> Result<ExtractedFacetValues> {
     puffin::profile_function!();
@@ -82,7 +83,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         let obkv = obkv::KvReader::new(value);
 
         for (field_id, field_bytes) in obkv.iter() {
-            if faceted_fields.contains(&field_id) {
+            let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
+            let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
+            if delete_faceted || add_faceted {
                 numbers_key_buffer.clear();
                 strings_key_buffer.clear();
 
@@ -99,11 +102,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                 strings_key_buffer.extend_from_slice(docid_bytes);
 
                 let del_add_obkv = obkv::KvReader::new(field_bytes);
-                let del_value = match del_add_obkv.get(DelAdd::Deletion) {
+                let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
+                {
                     Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
                     None => None,
                 };
-                let add_value = match del_add_obkv.get(DelAdd::Addition) {
+                let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
                     Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
                     None => None,
                 };
diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
index 305af3630..51e0642da 100644
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -10,6 +10,7 @@ use super::helpers::{
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::Result;
 
 const MAX_COUNTED_WORDS: usize = 30;
@@ -23,6 +24,7 @@ const MAX_COUNTED_WORDS: usize = 30;
 pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
 
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index f38701dac..2b1f02326 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -1,20 +1,22 @@
-use std::collections::{BTreeSet, HashSet};
+use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::{self, BufReader};
 
-use heed::BytesDecode;
+use heed::{BytesDecode, BytesEncode};
 use obkv::KvReaderU16;
+use roaring::RoaringBitmap;
 
 use super::helpers::{
-    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
-    try_split_array_at, writer_into_reader, GrenadParameters,
+    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
+    writer_into_reader, GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::update::MergeFn;
-use crate::{DocumentId, FieldId, Result};
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
 
 /// Extracts the word and the documents ids where this word appear.
 ///
@@ -27,7 +29,7 @@ use crate::{DocumentId, FieldId, Result};
 pub fn extract_word_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
-    exact_attributes: &HashSet<FieldId>,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<(
     grenad::Reader<BufReader<File>>,
     grenad::Reader<BufReader<File>>,
@@ -43,7 +45,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|x| x / 3),
+        max_memory,
     );
     let mut key_buffer = Vec::new();
     let mut del_words = BTreeSet::new();
@@ -85,30 +87,29 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         add_words.clear();
     }
 
-    let mut word_docids_sorter = create_sorter(
-        grenad::SortAlgorithm::Unstable,
-        merge_deladd_cbo_roaring_bitmaps,
-        indexer.chunk_compression_type,
-        indexer.chunk_compression_level,
-        indexer.max_nb_chunks,
-        max_memory.map(|x| x / 3),
-    );
-
-    let mut exact_word_docids_sorter = create_sorter(
-        grenad::SortAlgorithm::Unstable,
-        merge_deladd_cbo_roaring_bitmaps,
-        indexer.chunk_compression_type,
-        indexer.chunk_compression_level,
-        indexer.max_nb_chunks,
-        max_memory.map(|x| x / 3),
-    );
-
     let mut word_fid_docids_writer = create_writer(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         tempfile::tempfile()?,
     );
 
+    let mut word_docids_writer = create_writer(
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        tempfile::tempfile()?,
+    );
+
+    let mut exact_word_docids_writer = create_writer(
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        tempfile::tempfile()?,
+    );
+
+    let mut word: Option<String> = None;
+    let mut deletions = RoaringBitmap::new();
+    let mut additions = RoaringBitmap::new();
+    let mut exact_deletions = RoaringBitmap::new();
+    let mut exact_additions = RoaringBitmap::new();
     let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
     // TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
     while let Some((key, value)) = iter.next()? {
@@ -117,20 +118,69 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
             word_fid_docids_writer.insert(key, value)?;
         }
 
-        let (word, fid) = StrBEU16Codec::bytes_decode(key)
+        let (w, fid) = StrBEU16Codec::bytes_decode(key)
             .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
 
-        // every words contained in an attribute set to exact must be pushed in the exact_words list.
-        if exact_attributes.contains(&fid) {
-            exact_word_docids_sorter.insert(word.as_bytes(), value)?;
+        if let Some(current) = word.as_ref() {
+            if current.as_str() != w {
+                docids_into_writers(current, &deletions, &additions, &mut word_docids_writer)?;
+                docids_into_writers(
+                    current,
+                    &exact_deletions,
+                    &exact_additions,
+                    &mut exact_word_docids_writer,
+                )?;
+                word = Some(w.to_string());
+                // the previous word has been flushed: reset the buffers for the new one.
+                deletions.clear();
+                additions.clear();
+                exact_deletions.clear();
+                exact_additions.clear();
+            }
         } else {
-            word_docids_sorter.insert(word.as_bytes(), value)?;
+            word = Some(w.to_string());
+        }
+
+        // merge all deletions, routed according to the OLD exact attributes
+        let obkv = KvReaderDelAdd::new(value);
+        if let Some(value) = obkv.get(DelAdd::Deletion) {
+            let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
+            let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| {
+                SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) }
+            })?;
+            if delete_from_exact {
+                exact_deletions |= docids;
+            } else {
+                deletions |= docids
+            }
+        }
+        // merge all additions, routed according to the NEW exact attributes
+        if let Some(value) = obkv.get(DelAdd::Addition) {
+            let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
+            let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| {
+                SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) }
+            })?;
+            if add_in_exact {
+                exact_additions |= docids;
+            } else {
+                additions |= docids
+            }
         }
     }
 
+    if let Some(word) = word {
+        docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?;
+        docids_into_writers(
+            &word,
+            &exact_deletions,
+            &exact_additions,
+            &mut exact_word_docids_writer,
+        )?;
+    }
+
     Ok((
-        sorter_into_reader(word_docids_sorter, indexer)?,
-        sorter_into_reader(exact_word_docids_sorter, indexer)?,
+        writer_into_reader(word_docids_writer)?,
+        writer_into_reader(exact_word_docids_writer)?,
         writer_into_reader(word_fid_docids_writer)?,
     ))
 }
@@ -178,3 +228,50 @@ fn words_into_sorter(
 
     Ok(())
 }
+
+/// Writes the `deletions` and `additions` accumulated for `word` as one DelAdd obkv entry.
+///
+/// Nothing is written when both bitmaps are equal. The deletion side is left out when
+/// `deletions` is empty or a subset of `additions`, the addition side when `additions`
+/// is empty.
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
+fn docids_into_writers<W>(
+    word: &str,
+    deletions: &RoaringBitmap,
+    additions: &RoaringBitmap,
+    writer: &mut grenad::Writer<W>,
+) -> Result<()>
+where
+    W: std::io::Write,
+{
+    if deletions == additions {
+        // if the same value is deleted and added, do nothing.
+        return Ok(());
+    }
+
+    // Write each value in the same KvDelAdd before inserting it in the final writer.
+    let mut obkv = KvWriterDelAdd::memory();
+    // deletions:
+    if !deletions.is_empty() && !deletions.is_subset(additions) {
+        obkv.insert(
+            DelAdd::Deletion,
+            CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
+                SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
+            })?,
+        )?;
+    }
+    // additions:
+    if !additions.is_empty() {
+        obkv.insert(
+            DelAdd::Addition,
+            CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
+                SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
+            })?,
+        )?;
+    }
+
+    // insert everything in the same writer.
+    writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
+
+    Ok(())
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 82a94ce00..d86d09bc8 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -13,6 +13,7 @@ use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::proximity::{index_proximity, MAX_DISTANCE};
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{DocumentId, Result};
 
 /// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
@@ -23,6 +24,7 @@ use crate::{DocumentId, Result};
 pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
 
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index 4bc553d9a..45a05b0d0 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -11,6 +11,7 @@ use super::helpers::{
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::update::MergeFn;
 use crate::{bucketed_position, DocumentId, Result};
 
@@ -22,6 +23,7 @@ use crate::{bucketed_position, DocumentId, Result};
 pub fn extract_word_position_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
 
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 82486f3a8..a6b73efde 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -31,8 +31,8 @@ use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
 use crate::proximity::ProximityPrecision;
-use crate::vector::EmbeddingConfigs;
-use crate::{FieldId, FieldsIdsMap, Result};
+use crate::update::settings::InnerIndexSettingsDiff;
+use crate::{FieldId, Result};
 
 /// Extract data for each databases from obkv documents in parallel.
 /// Send data in grenad file over provided Sender.
@@ -43,18 +43,10 @@ pub(crate) fn data_from_obkv_documents(
     flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    searchable_fields: Option<HashSet<FieldId>>,
-    faceted_fields: HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    field_id_map: FieldsIdsMap,
-    stop_words: Option<fst::Set<Vec<u8>>>,
-    allowed_separators: Option<&[&str]>,
-    dictionary: Option<&[&str]>,
+    settings_diff: &InnerIndexSettingsDiff,
     max_positions_per_attributes: Option<u32>,
-    exact_attributes: HashSet<FieldId>,
-    proximity_precision: ProximityPrecision,
-    embedders: EmbeddingConfigs,
 ) -> Result<()> {
     puffin::profile_function!();
 
@@ -67,8 +59,7 @@ pub(crate) fn data_from_obkv_documents(
                         original_documents_chunk,
                         indexer,
                         lmdb_writer_sx.clone(),
-                        field_id_map.clone(),
-                        embedders.clone(),
+                        settings_diff,
                     )
                 })
                 .collect::<Result<()>>()
@@ -81,13 +72,9 @@ pub(crate) fn data_from_obkv_documents(
                         flattened_obkv_chunks,
                         indexer,
                         lmdb_writer_sx.clone(),
-                        &searchable_fields,
-                        &faceted_fields,
                         primary_key_id,
                         geo_fields_ids,
-                        &stop_words,
-                        &allowed_separators,
-                        &dictionary,
+                        settings_diff,
                         max_positions_per_attributes,
                     )
                 })
@@ -100,13 +87,12 @@ pub(crate) fn data_from_obkv_documents(
                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
                             docid_word_positions_chunk.clone(),
                             indexer,
+                            settings_diff,
                             lmdb_writer_sx.clone(),
                             extract_fid_word_count_docids,
                             TypedChunk::FieldIdWordCountDocids,
                             "field-id-wordcount-docids",
                         );
-
-                        let exact_attributes = exact_attributes.clone();
                         run_extraction_task::<
                             _,
                             _,
@@ -118,10 +104,9 @@ pub(crate) fn data_from_obkv_documents(
                         >(
                             docid_word_positions_chunk.clone(),
                             indexer,
+                            settings_diff,
                             lmdb_writer_sx.clone(),
-                            move |doc_word_pos, indexer| {
-                                extract_word_docids(doc_word_pos, indexer, &exact_attributes)
-                            },
+                            extract_word_docids,
                             |(
                                 word_docids_reader,
                                 exact_word_docids_reader,
@@ -139,6 +124,7 @@ pub(crate) fn data_from_obkv_documents(
                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
                             docid_word_positions_chunk.clone(),
                             indexer,
+                            settings_diff,
                             lmdb_writer_sx.clone(),
                             extract_word_position_docids,
                             TypedChunk::WordPositionDocids,
@@ -152,6 +138,7 @@ pub(crate) fn data_from_obkv_documents(
                         >(
                             fid_docid_facet_strings_chunk.clone(),
                             indexer,
+                            settings_diff,
                             lmdb_writer_sx.clone(),
                             extract_facet_string_docids,
                             TypedChunk::FieldIdFacetStringDocids,
@@ -161,22 +148,22 @@ pub(crate) fn data_from_obkv_documents(
                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
                             fid_docid_facet_numbers_chunk.clone(),
                             indexer,
+                            settings_diff,
                             lmdb_writer_sx.clone(),
                             extract_facet_number_docids,
                             TypedChunk::FieldIdFacetNumberDocids,
                             "field-id-facet-number-docids",
                         );
 
-                        if proximity_precision == ProximityPrecision::ByWord {
-                            run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
-                                docid_word_positions_chunk.clone(),
-                                indexer,
-                                lmdb_writer_sx.clone(),
-                                extract_word_pair_proximity_docids,
-                                TypedChunk::WordPairProximityDocids,
-                                "word-pair-proximity-docids",
-                            );
-                        }
+                        run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
+                            docid_word_positions_chunk.clone(),
+                            indexer,
+                            settings_diff,
+                            lmdb_writer_sx.clone(),
+                            extract_word_pair_proximity_docids,
+                            TypedChunk::WordPairProximityDocids,
+                            "word-pair-proximity-docids",
+                        );
                     }
 
                     Ok(())
@@ -195,12 +182,17 @@ pub(crate) fn data_from_obkv_documents(
 fn run_extraction_task<FE, FS, M>(
     chunk: grenad::Reader<CursorClonableMmap>,
     indexer: GrenadParameters,
+    settings_diff: &InnerIndexSettingsDiff,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     extract_fn: FE,
     serialize_fn: FS,
     name: &'static str,
 ) where
-    FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
+    FE: Fn(
+            grenad::Reader<CursorClonableMmap>,
+            GrenadParameters,
+            &InnerIndexSettingsDiff,
+        ) -> Result<M>
         + Sync
         + Send
         + 'static,
@@ -213,7 +205,7 @@ fn run_extraction_task<FE, FS, M>(
         let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
         let _entered = child_span.enter();
         puffin::profile_scope!("extract_multiple_chunks", name);
-        match extract_fn(chunk, indexer) {
+        match extract_fn(chunk, indexer, settings_diff) {
             Ok(chunk) => {
                 let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
             }
@@ -230,8 +222,7 @@ fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    field_id_map: FieldsIdsMap,
-    embedders: EmbeddingConfigs,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@@ -306,13 +297,9 @@ fn send_and_extract_flattened_documents_data(
     flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    searchable_fields: &Option<HashSet<FieldId>>,
-    faceted_fields: &HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    stop_words: &Option<fst::Set<Vec<u8>>>,
-    allowed_separators: &Option<&[&str]>,
-    dictionary: &Option<&[&str]>,
+    settings_diff: &InnerIndexSettingsDiff,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<(
     grenad::Reader<CursorClonableMmap>,
@@ -341,10 +328,7 @@ fn send_and_extract_flattened_documents_data(
                     extract_docid_word_positions(
                         flattened_documents_chunk.clone(),
                         indexer,
-                        searchable_fields,
-                        stop_words.as_ref(),
-                        *allowed_separators,
-                        *dictionary,
+                        settings_diff,
                         max_positions_per_attributes,
                     )?;
 
@@ -367,7 +351,7 @@ fn send_and_extract_flattened_documents_data(
                 } = extract_fid_docid_facet_values(
                     flattened_documents_chunk.clone(),
                     indexer,
-                    faceted_fields,
+                    settings_diff,
                     geo_fields_ids,
                 )?;
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index d534661da..6bc5b6ff9 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -253,27 +253,12 @@ where
             let number_of_documents = self.index.number_of_documents(self.wtxn)?;
             return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
         }
-        let output = self
+        let mut output = self
             .transform
             .take()
             .expect("Invalid document addition state")
             .output_from_sorter(self.wtxn, &self.progress)?;
 
-        let new_facets = output.compute_real_facets(self.wtxn, self.index)?;
-        self.index.put_faceted_fields(self.wtxn, &new_facets)?;
-
-        // in case new fields were introduced we're going to recreate the searchable fields.
-        if let Some(faceted_fields) = self.index.user_defined_searchable_fields(self.wtxn)? {
-            // we can't keep references on the faceted fields while we update the index thus we need to own it.
-            let faceted_fields: Vec<String> =
-                faceted_fields.into_iter().map(str::to_string).collect();
-            self.index.put_all_searchable_fields_from_fields_ids_map(
-                self.wtxn,
-                &faceted_fields.iter().map(String::as_ref).collect::<Vec<_>>(),
-                &output.fields_ids_map,
-            )?;
-        }
-
         let indexed_documents = output.documents_count as u64;
         let number_of_documents = self.execute_raw(output)?;
 
@@ -296,16 +281,17 @@ where
 
         let TransformOutput {
             primary_key,
-            fields_ids_map,
+            settings_diff,
             field_distribution,
             documents_count,
             original_documents,
             flattened_documents,
         } = output;
 
-        // The fields_ids_map is put back to the store now so the rest of the transaction sees an
-        // up to date field map.
-        self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
+        // update the internal facet and searchable list,
+        // because they might have changed due to the nested documents flattening.
+        settings_diff.new.recompute_facets(self.wtxn, self.index)?;
+        settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
 
         let backup_pool;
         let pool = match self.indexer_config.thread_pool {
@@ -333,7 +319,7 @@ where
         ) = crossbeam_channel::unbounded();
 
         // get the primary key field id
-        let primary_key_id = fields_ids_map.id(&primary_key).unwrap();
+        let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
 
         // get searchable fields for word databases
         let searchable_fields =
@@ -400,8 +386,6 @@ where
 
         let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
 
-        let cloned_embedder = self.embedders.clone();
-
         let mut final_documents_ids = RoaringBitmap::new();
         let mut databases_seen = 0;
         let mut word_position_docids = None;
@@ -410,7 +394,6 @@ where
         let mut exact_word_docids = None;
         let mut chunk_accumulator = ChunkAccumulator::default();
         let mut dimension = HashMap::new();
-        let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
 
         let current_span = tracing::Span::current();
 
@@ -428,10 +411,6 @@ where
                 let flattened_chunk_iter =
                     grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
 
-                let separators: Option<Vec<_>> =
-                    separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
-                let dictionary: Option<Vec<_>> =
-                    dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
                 let result = original_chunk_iter.and_then(|original_chunk| {
                     let flattened_chunk = flattened_chunk_iter?;
                     // extract all databases from the chunked obkv douments
@@ -440,18 +419,10 @@ where
                         flattened_chunk,
                         pool_params,
                         lmdb_writer_sx.clone(),
-                        searchable_fields,
-                        faceted_fields,
                         primary_key_id,
                         geo_fields_ids,
-                        field_id_map,
-                        stop_words,
-                        separators.as_deref(),
-                        dictionary.as_deref(),
+                        &settings_diff,
                         max_positions_per_attributes,
-                        exact_attributes,
-                        proximity_precision,
-                        cloned_embedder,
                     )
                 });
 
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 09bf94ace..003353793 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -21,14 +21,17 @@ use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
-use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
+use crate::update::del_add::{
+    del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd,
+};
 use crate::update::index_documents::GrenadParameters;
+use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result};
 
 pub struct TransformOutput {
     pub primary_key: String,
-    pub fields_ids_map: FieldsIdsMap,
+    pub settings_diff: InnerIndexSettingsDiff,
     pub field_distribution: FieldDistribution,
     pub documents_count: usize,
     pub original_documents: File,
@@ -282,7 +285,9 @@ impl<'a, 'i> Transform<'a, 'i> {
                     self.original_sorter
                         .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
                     let base_obkv = KvReader::new(base_obkv);
-                    if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? {
+                    if let Some(flattened_obkv) =
+                        Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)?
+                    {
                         // we recreate our buffer with the flattened documents
                         document_sorter_value_buffer.clear();
                         document_sorter_value_buffer.push(Operation::Addition as u8);
@@ -315,7 +320,9 @@ impl<'a, 'i> Transform<'a, 'i> {
                     .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
 
                 let flattened_obkv = KvReader::new(&obkv_buffer);
-                if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
+                if let Some(obkv) =
+                    Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
+                {
                     document_sorter_value_buffer.clear();
                     document_sorter_value_buffer.push(Operation::Addition as u8);
                     into_del_add_obkv(
@@ -524,7 +531,9 @@ impl<'a, 'i> Transform<'a, 'i> {
 
         // flatten it and push it as to delete in the flattened_sorter
         let flattened_obkv = KvReader::new(base_obkv);
-        if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
+        if let Some(obkv) =
+            Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
+        {
             // we recreate our buffer with the flattened documents
             document_sorter_value_buffer.clear();
             document_sorter_value_buffer.push(Operation::Deletion as u8);
@@ -541,8 +550,15 @@ impl<'a, 'i> Transform<'a, 'i> {
 
     // Flatten a document from the fields ids map contained in self and insert the new
     // created fields. Returns `None` if the document doesn't need to be flattened.
-    #[tracing::instrument(level = "trace", skip(self, obkv), target = "indexing::transform")]
-    fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
+    #[tracing::instrument(
+        level = "trace",
+        skip(obkv, fields_ids_map),
+        target = "indexing::transform"
+    )]
+    fn flatten_from_fields_ids_map(
+        obkv: &KvReader<FieldId>,
+        fields_ids_map: &mut FieldsIdsMap,
+    ) -> Result<Option<Vec<u8>>> {
         if obkv
             .iter()
             .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value))
@@ -563,7 +579,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         // all the raw values get inserted directly in the `key_value` vec.
         for (key, value) in obkv.iter() {
             if json_depth_checker::should_flatten_from_unchecked_slice(value) {
-                let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
+                let key = fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
                     field_id: key,
                     process: "Flatten from fields ids map.",
                 })?;
@@ -581,7 +597,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         // Once we have the flattened version we insert all the new generated fields_ids
         // (if any) in the fields ids map and serialize the value.
         for (key, value) in flattened.into_iter() {
-            let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
+            let fid = fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
             key_value.push((fid, value.into()));
         }
@@ -792,9 +808,18 @@ impl<'a, 'i> Transform<'a, 'i> {
             fst_new_external_documents_ids_builder.insert(key, value)
         })?;
 
+        let old_inner_settings = InnerIndexSettings::from_index(&self.index, wtxn)?;
+        let mut new_inner_settings = old_inner_settings.clone();
+        new_inner_settings.fields_ids_map = self.fields_ids_map;
+        let settings_diff = InnerIndexSettingsDiff {
+            old: old_inner_settings,
+            new: new_inner_settings,
+            embedding_configs_updated: true,
+        };
+
         Ok(TransformOutput {
             primary_key,
-            fields_ids_map: self.fields_ids_map,
+            settings_diff,
             field_distribution,
             documents_count: self.documents_count,
             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
@@ -804,6 +829,38 @@ impl<'a, 'i> Transform<'a, 'i> {
         })
     }
 
+    fn rebind_existing_document(
+        old_obkv: KvReader<FieldId>,
+        settings_diff: &InnerIndexSettingsDiff,
+        original_obkv_buffer: &mut Vec<u8>,
+        flattened_obkv_buffer: &mut Vec<u8>,
+    ) -> Result<()> {
+        let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone();
+        let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone();
+        let mut obkv_writer = KvWriter::<_, FieldId>::memory();
+        // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
+        for (id, name) in new_fields_ids_map.iter() {
+            if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) {
+                obkv_writer.insert(id, val)?;
+            }
+        }
+        let new_obkv = KvReader::<FieldId>::new(&obkv_writer.into_inner()?);
+
+        // take the non-flattened version if flatten_from_fields_ids_map returns None.
+        let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?
+            .map_or_else(|| old_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
+        let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?
+            .map_or_else(|| new_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
+
+        original_obkv_buffer.clear();
+        flattened_obkv_buffer.clear();
+
+        del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?;
+        del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?;
+
+        Ok(())
+    }
+
     /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents
     /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument.
     ///
@@ -811,8 +868,7 @@ impl<'a, 'i> Transform<'a, 'i> {
     pub fn prepare_for_documents_reindexing(
         self,
         wtxn: &mut heed::RwTxn<'i>,
-        old_fields_ids_map: FieldsIdsMap,
-        mut new_fields_ids_map: FieldsIdsMap,
+        settings_diff: InnerIndexSettingsDiff,
     ) -> Result<TransformOutput> {
         // There already has been a document addition, the primary key should be set by now.
         let primary_key = self
@@ -848,78 +904,27 @@ impl<'a, 'i> Transform<'a, 'i> {
             self.indexer_settings.max_memory.map(|mem| mem / 2),
         );
 
-        let mut obkv_buffer = Vec::new();
+        let mut original_obkv_buffer = Vec::new();
+        let mut flattened_obkv_buffer = Vec::new();
         let mut document_sorter_key_buffer = Vec::new();
-        let mut document_sorter_value_buffer = Vec::new();
         for result in self.index.external_documents_ids().iter(wtxn)? {
             let (external_id, docid) = result?;
-            let obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
+            let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
             )?;
 
-            obkv_buffer.clear();
-            let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer);
-
-            // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
-            for (id, name) in new_fields_ids_map.iter() {
-                if let Some(val) = old_fields_ids_map.id(name).and_then(|id| obkv.get(id)) {
-                    obkv_writer.insert(id, val)?;
-                }
-            }
-
-            let buffer = obkv_writer.into_inner()?;
+            Self::rebind_existing_document(
+                old_obkv,
+                &settings_diff,
+                &mut original_obkv_buffer,
+                &mut flattened_obkv_buffer,
+            )?;
 
             document_sorter_key_buffer.clear();
             document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
             document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
-            document_sorter_value_buffer.clear();
-            into_del_add_obkv(
-                KvReaderU16::new(buffer),
-                DelAddOperation::DeletionAndAddition,
-                &mut document_sorter_value_buffer,
-            )?;
-            original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
-
-            // Once we have the document. We're going to flatten it
-            // and insert it in the flattened sorter.
-            let mut doc = serde_json::Map::new();
-
-            let reader = obkv::KvReader::new(buffer);
-            for (k, v) in reader.iter() {
-                let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId {
-                    field_id: k,
-                    process: "Accessing field distribution in transform.",
-                })?;
-                let value = serde_json::from_slice::<serde_json::Value>(v)
-                    .map_err(InternalError::SerdeJson)?;
-                doc.insert(key.to_string(), value);
-            }
-
-            let flattened = flatten_serde_json::flatten(&doc);
-
-            // Once we have the flattened version we can convert it back to obkv and
-            // insert all the new generated fields_ids (if any) in the fields ids map.
-            let mut buffer: Vec<u8> = Vec::new();
-            let mut writer = KvWriter::new(&mut buffer);
-            let mut flattened: Vec<_> = flattened.into_iter().collect();
-            // we reorder the field to get all the known field first
-            flattened.sort_unstable_by_key(|(key, _)| {
-                new_fields_ids_map.id(key).unwrap_or(FieldId::MAX)
-            });
-
-            for (key, value) in flattened {
-                let fid =
-                    new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
-                let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
-                writer.insert(fid, &value)?;
-            }
-            document_sorter_value_buffer.clear();
-            into_del_add_obkv(
-                KvReaderU16::new(&buffer),
-                DelAddOperation::DeletionAndAddition,
-                &mut document_sorter_value_buffer,
-            )?;
-            flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
+            original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?;
+            flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?;
         }
 
         let grenad_params = GrenadParameters {
@@ -934,19 +939,14 @@ impl<'a, 'i> Transform<'a, 'i> {
 
         let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?;
 
-        let output = TransformOutput {
+        Ok(TransformOutput {
             primary_key,
-            fields_ids_map: new_fields_ids_map,
             field_distribution,
+            settings_diff,
             documents_count,
             original_documents: original_documents.into_inner().into_inner(),
             flattened_documents: flattened_documents.into_inner().into_inner(),
-        };
-
-        let new_facets = output.compute_real_facets(wtxn, self.index)?;
-        self.index.put_faceted_fields(wtxn, &new_facets)?;
-
-        Ok(output)
+        })
     }
 }
 
@@ -961,20 +961,6 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
     vec.into_iter().map(|_| unreachable!()).collect()
 }
 
-impl TransformOutput {
-    // find and insert the new field ids
-    pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> {
-        let user_defined_facets = index.user_defined_faceted_fields(rtxn)?;
-
-        Ok(self
-            .fields_ids_map
-            .names()
-            .filter(|&field| crate::is_faceted(field, &user_defined_facets))
-            .map(|field| field.to_string())
-            .collect())
-    }
-}
-
 #[cfg(test)]
 mod test {
     use super::*;
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 5b1788242..ae4304fce 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -385,14 +385,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
 
     #[tracing::instrument(
         level = "trace"
-        skip(self, progress_callback, should_abort, old_fields_ids_map),
+        skip(self, progress_callback, should_abort, settings_diff),
         target = "indexing::documents"
     )]
     fn reindex<FP, FA>(
         &mut self,
         progress_callback: &FP,
         should_abort: &FA,
-        old_fields_ids_map: FieldsIdsMap,
+        settings_diff: InnerIndexSettingsDiff,
     ) -> Result<()>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
@@ -416,14 +416,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         )?;
 
         // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`.
-        let output = transform.prepare_for_documents_reindexing(
-            self.wtxn,
-            old_fields_ids_map,
-            fields_ids_map,
-        )?;
-
-        let embedder_configs = self.index.embedding_configs(self.wtxn)?;
-        let embedders = self.embedders(embedder_configs)?;
+        let output = transform.prepare_for_documents_reindexing(self.wtxn, settings_diff)?;
 
         // We index the generated `TransformOutput` which must contain
         // all the documents with fields in the newly defined searchable order.
@@ -436,32 +429,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             &should_abort,
         )?;
 
-        let indexing_builder = indexing_builder.with_embedders(embedders);
         indexing_builder.execute_raw(output)?;
 
         Ok(())
     }
 
-    fn embedders(
-        &self,
-        embedding_configs: Vec<(String, EmbeddingConfig)>,
-    ) -> Result<EmbeddingConfigs> {
-        let res: Result<_> = embedding_configs
-            .into_iter()
-            .map(|(name, EmbeddingConfig { embedder_options, prompt })| {
-                let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
-
-                let embedder = Arc::new(
-                    Embedder::new(embedder_options.clone())
-                        .map_err(crate::vector::Error::from)
-                        .map_err(crate::Error::from)?,
-                );
-                Ok((name, (embedder, prompt)))
-            })
-            .collect();
-        res.map(EmbeddingConfigs::new)
-    }
-
     fn update_displayed(&mut self) -> Result<bool> {
         match self.displayed_fields {
             Setting::Set(ref fields) => {
@@ -1067,7 +1039,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
 
         let old_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?;
-        let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
 
         // never trigger re-indexing
         self.update_displayed()?;
@@ -1109,47 +1080,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         };
 
         if inner_settings_diff.any_reindexing_needed() {
-            self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?;
+            self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
         }
 
         Ok(())
     }
-
-    fn update_faceted(
-        &self,
-        existing_fields: HashSet<String>,
-        old_faceted_fields: HashSet<String>,
-    ) -> Result<bool> {
-        if existing_fields.iter().any(|field| field.contains('.')) {
-            return Ok(true);
-        }
-
-        if old_faceted_fields.iter().any(|field| field.contains('.')) {
-            return Ok(true);
-        }
-
-        // If there is new faceted fields we indicate that we must reindex as we must
-        // index new fields as facets. It means that the distinct attribute,
-        // an Asc/Desc criterion or a filtered attribute as be added or removed.
-        let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
-
-        if new_faceted_fields.iter().any(|field| field.contains('.')) {
-            return Ok(true);
-        }
-
-        let faceted_updated =
-            (&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
-
-        Ok(faceted_updated)
-    }
 }
 
 pub(crate) struct InnerIndexSettingsDiff {
-    old: InnerIndexSettings,
-    new: InnerIndexSettings,
+    pub old: InnerIndexSettings,
+    pub new: InnerIndexSettings,
 
     // TODO: compare directly the embedders.
-    embedding_configs_updated: bool,
+    pub embedding_configs_updated: bool,
 }
 
 impl InnerIndexSettingsDiff {
@@ -1167,7 +1110,7 @@ impl InnerIndexSettingsDiff {
                 != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
             || self.old.allowed_separators != self.new.allowed_separators
             || self.old.dictionary != self.new.dictionary
-            || self.old.searchable_fields != self.new.searchable_fields
+            || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields
             || self.old.exact_attributes != self.new.exact_attributes
             || self.old.proximity_precision != self.new.proximity_precision
     }
@@ -1207,33 +1150,38 @@ impl InnerIndexSettingsDiff {
     }
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 pub(crate) struct InnerIndexSettings {
-    stop_words: Option<fst::Set<Vec<u8>>>,
-    allowed_separators: Option<BTreeSet<String>>,
-    dictionary: Option<BTreeSet<String>>,
-    fields_ids_map: FieldsIdsMap,
-    faceted_fields: HashSet<FieldId>,
-    searchable_fields: Option<BTreeSet<FieldId>>,
-    exact_attributes: HashSet<FieldId>,
-    proximity_precision: ProximityPrecision,
-    embedding_configs: Vec<(String, crate::vector::EmbeddingConfig)>,
-    existing_fields: HashSet<String>,
+    pub stop_words: Option<fst::Set<Vec<u8>>>,
+    pub allowed_separators: Option<BTreeSet<String>>,
+    pub dictionary: Option<BTreeSet<String>>,
+    pub fields_ids_map: FieldsIdsMap,
+    pub user_defined_faceted_fields: HashSet<String>,
+    pub user_defined_searchable_fields: Option<Vec<String>>,
+    pub faceted_fields_ids: HashSet<FieldId>,
+    pub searchable_fields_ids: Option<Vec<FieldId>>,
+    pub exact_attributes: HashSet<FieldId>,
+    pub proximity_precision: ProximityPrecision,
+    pub embedding_configs: EmbeddingConfigs,
+    pub existing_fields: HashSet<String>,
 }
 
 impl InnerIndexSettings {
-    fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result<Self> {
+    pub fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result<Self> {
         let stop_words = index.stop_words(rtxn)?;
         let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
         let allowed_separators = index.allowed_separators(rtxn)?;
         let dictionary = index.dictionary(rtxn)?;
         let fields_ids_map = index.fields_ids_map(rtxn)?;
-        let searchable_fields = index.searchable_fields_ids(rtxn)?;
-        let searchable_fields = searchable_fields.map(|sf| sf.into_iter().collect());
-        let faceted_fields = index.faceted_fields_ids(rtxn)?;
+        let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
+        let user_defined_searchable_fields =
+            user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
+        let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
+        let searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
+        let faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
         let exact_attributes = index.exact_attributes_ids(rtxn)?;
         let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
-        let embedding_configs = index.embedding_configs(rtxn)?;
+        let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
         let existing_fields: HashSet<_> = index
             .field_distribution(rtxn)?
             .into_iter()
@@ -1245,14 +1193,65 @@ impl InnerIndexSettings {
             allowed_separators,
             dictionary,
             fields_ids_map,
-            faceted_fields,
-            searchable_fields,
+            user_defined_faceted_fields,
+            user_defined_searchable_fields,
+            faceted_fields_ids,
+            searchable_fields_ids,
             exact_attributes,
             proximity_precision,
             embedding_configs,
             existing_fields,
         })
     }
+
+    // find and insert the new field ids
+    pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
+        let new_facets = self
+            .fields_ids_map
+            .names()
+            .filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields))
+            .map(|field| field.to_string())
+            .collect();
+        index.put_faceted_fields(wtxn, &new_facets)?;
+
+        self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?;
+        Ok(())
+    }
+
+    // find and insert the new field ids
+    pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
+        // in case new fields were introduced we're going to recreate the searchable fields.
+        if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() {
+            let searchable_fields =
+                searchable_fields.iter().map(String::as_ref).collect::<Vec<_>>();
+            index.put_all_searchable_fields_from_fields_ids_map(
+                wtxn,
+                &searchable_fields,
+                &self.fields_ids_map,
+            )?;
+            let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
+            self.searchable_fields_ids = searchable_fields_ids;
+        }
+
+        Ok(())
+    }
+}
+
+fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
+    let res: Result<_> = embedding_configs
+        .into_iter()
+        .map(|(name, EmbeddingConfig { embedder_options, prompt })| {
+            let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
+
+            let embedder = Arc::new(
+                Embedder::new(embedder_options.clone())
+                    .map_err(crate::vector::Error::from)
+                    .map_err(crate::Error::from)?,
+            );
+            Ok((name, (embedder, prompt)))
+        })
+        .collect();
+    res.map(EmbeddingConfigs::new)
 }
 
 fn validate_prompt(

From 02c3d6b26546a9f6f6b4406b3d7d077316d800d9 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 3 Apr 2024 11:19:45 +0200
Subject: [PATCH 07/16] finish work

---
 .../extract/extract_docid_word_positions.rs   | 41 ++++++----
 .../extract/extract_vector_points.rs          | 58 ++++++++------
 .../extract/extract_word_docids.rs            | 22 ++---
 .../extract_word_pair_proximity_docids.rs     | 23 +++++-
 .../src/update/index_documents/extract/mod.rs | 80 ++++++++++---------
 milli/src/update/index_documents/mod.rs       | 16 +---
 milli/src/update/index_documents/transform.rs | 19 +++--
 milli/src/update/settings.rs                  | 39 ++++++---
 8 files changed, 171 insertions(+), 127 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index b1a6bb5a6..6cf7b3167 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -34,6 +34,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
+    let force_reindexing = settings_diff.reindex_searchable();
 
     // initialize destination values.
     let mut documents_ids = RoaringBitmap::new();
@@ -54,12 +55,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let mut value_buffer = Vec::new();
 
     // initialize tokenizer.
-    // TODO: Fix ugly allocation
+    /// TODO: Fix ugly allocation
     let old_stop_words = settings_diff.old.stop_words.as_ref();
-    let old_separators: Option<Vec<_>> =
-        settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+    let old_separators: Option<Vec<_>> = settings_diff
+        .old
+        .allowed_separators
+        .as_ref()
+        .map(|s| s.iter().map(String::as_str).collect());
     let old_dictionary: Option<Vec<_>> =
-        settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect());
+        settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
     let mut del_builder = tokenizer_builder(
         old_stop_words,
         old_separators.as_deref(),
@@ -68,12 +72,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     );
     let del_tokenizer = del_builder.build();
 
-    // TODO: Fix ugly allocation
+    /// TODO: Fix ugly allocation
     let new_stop_words = settings_diff.new.stop_words.as_ref();
-    let new_separators: Option<Vec<_>> =
-        settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+    let new_separators: Option<Vec<_>> = settings_diff
+        .new
+        .allowed_separators
+        .as_ref()
+        .map(|s| s.iter().map(String::as_str).collect());
     let new_dictionary: Option<Vec<_>> =
-        settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect());
+        settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
     let mut add_builder = tokenizer_builder(
         new_stop_words,
         new_separators.as_deref(),
@@ -92,10 +99,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         let obkv = KvReader::<FieldId>::new(value);
 
         // if the searchable fields didn't change, skip the searchable indexing for this document.
-        if !searchable_fields_changed(
-            &KvReader::<FieldId>::new(value),
-            &settings_diff.new.searchable_fields_ids,
-        ) {
+        if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
             continue;
         }
 
@@ -180,8 +184,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 /// Check if any searchable fields of a document changed.
 fn searchable_fields_changed(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<Vec<FieldId>>,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> bool {
+    let searchable_fields = &settings_diff.new.searchable_fields_ids;
     for (field_id, field_bytes) in obkv.iter() {
         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
             let del_add = KvReaderDelAdd::new(field_bytes);
@@ -262,12 +267,14 @@ fn lang_safe_tokens_from_document<'a>(
         // then we don't rerun the extraction.
         if !script_language.is_empty() {
             // build a new temporary tokenizer including the allow list.
-            // TODO: Fix ugly allocation
+            /// TODO: Fix ugly allocation
             let stop_words = settings.stop_words.as_ref();
-            let separators: Option<Vec<_>> =
-                settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+            let separators: Option<Vec<_>> = settings
+                .allowed_separators
+                .as_ref()
+                .map(|s| s.iter().map(String::as_str).collect());
             let dictionary: Option<Vec<_>> =
-                settings.dictionary.map(|s| s.iter().map(String::as_str).collect());
+                settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
             let mut builder =
                 tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
             let tokenizer = builder.build();
diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs
index 40b32bf9c..fc79a861f 100644
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -17,8 +17,9 @@ use crate::error::UserError;
 use crate::prompt::Prompt;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::try_split_at;
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::vector::Embedder;
-use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors};
+use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors};
 
 /// The length of the elements that are always in the buffer when inserting new values.
 const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@@ -71,12 +72,15 @@ impl VectorStateDelta {
 pub fn extract_vector_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    field_id_map: &FieldsIdsMap,
+    settings_diff: &InnerIndexSettingsDiff,
     prompt: &Prompt,
     embedder_name: &str,
 ) -> Result<ExtractedVectorPoints> {
     puffin::profile_function!();
 
+    let old_fields_ids_map = &settings_diff.old.fields_ids_map;
+    let new_fields_ids_map = &settings_diff.new.fields_ids_map;
+
     // (docid, _index) -> KvWriterDelAdd -> Vector
     let mut manual_vectors_writer = create_writer(
         indexer.chunk_compression_type,
@@ -98,8 +102,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
         tempfile::tempfile()?,
     );
 
-    let vectors_fid = field_id_map.id("_vectors");
-
     let mut key_buffer = Vec::new();
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
@@ -116,15 +118,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
         // lazily get it when needed
         let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
 
-        let vectors_field = vectors_fid
-            .and_then(|vectors_fid| obkv.get(vectors_fid))
-            .map(KvReaderDelAdd::new)
-            .map(|obkv| to_vector_maps(obkv, document_id))
-            .transpose()?;
+        // the vector field id may have changed
+        let old_vectors_fid = old_fields_ids_map.id("_vectors");
+        // filter the old vector fid if the settings has been changed forcing reindexing.
+        let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
 
-        let (del_map, add_map) = vectors_field.unzip();
-        let del_map = del_map.flatten();
-        let add_map = add_map.flatten();
+        let new_vectors_fid = new_fields_ids_map.id("_vectors");
+        let vectors_field = {
+            let del = old_vectors_fid
+                .and_then(|vectors_fid| obkv.get(vectors_fid))
+                .map(KvReaderDelAdd::new)
+                .map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
+                .transpose()?
+                .flatten();
+            let add = new_vectors_fid
+                .and_then(|vectors_fid| obkv.get(vectors_fid))
+                .map(KvReaderDelAdd::new)
+                .map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
+                .transpose()?
+                .flatten();
+            (del, add)
+        };
+
+        let (del_map, add_map) = vectors_field;
 
         let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
         let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
@@ -155,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
                     VectorStateDelta::NowGenerated(prompt.render(
                         obkv,
                         DelAdd::Addition,
-                        field_id_map,
+                        &new_fields_ids_map,
                     )?)
                 } else {
                     VectorStateDelta::NowRemoved
@@ -182,9 +198,10 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
 
                 if document_is_kept {
                     // Don't give up if the old prompt was failing
-                    let old_prompt =
-                        prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
-                    let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
+                    let old_prompt = prompt
+                        .render(obkv, DelAdd::Deletion, &old_fields_ids_map)
+                        .unwrap_or_default();
+                    let new_prompt = prompt.render(obkv, DelAdd::Addition, &new_fields_ids_map)?;
                     if old_prompt != new_prompt {
                         tracing::trace!(
                             "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
@@ -220,15 +237,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
     })
 }
 
-fn to_vector_maps(
-    obkv: KvReaderDelAdd,
-    document_id: impl Fn() -> Value,
-) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
-    let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?;
-    let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?;
-    Ok((del, add))
-}
-
 fn to_vector_map(
     obkv: KvReaderDelAdd,
     side: DelAdd,
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 2b1f02326..2be41bb86 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -121,16 +121,16 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         let (w, fid) = StrBEU16Codec::bytes_decode(key)
             .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
 
-        if let Some(word) = word {
-            if word.as_str() != w {
-                docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
+        if let Some(current) = word.as_ref() {
+            if current != w {
+                docids_into_writers(&current, &deletions, &additions, &mut word_docids_writer)?;
                 docids_into_writers(
-                    &word,
+                    &current,
                     &exact_deletions,
                     &exact_additions,
                     &mut exact_word_docids_writer,
-                );
-                let word = Some(w.to_string());
+                )?;
+                word = Some(w.to_string());
                 // clear buffers
                 deletions.clear();
                 additions.clear();
@@ -138,7 +138,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
                 exact_additions.clear();
             }
         } else {
-            let word = Some(w.to_string());
+            word = Some(w.to_string());
         }
 
         // merge all deletions
@@ -169,13 +169,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
     }
 
     if let Some(word) = word {
-        docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
+        docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?;
         docids_into_writers(
             &word,
             &exact_deletions,
             &exact_additions,
             &mut exact_word_docids_writer,
-        );
+        )?;
     }
 
     Ok((
@@ -253,7 +253,7 @@ where
             CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
                 SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
             })?,
-        );
+        )?;
     }
     // additions:
     if !additions.is_empty() {
@@ -262,7 +262,7 @@ where
             CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
                 SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
             })?,
-        );
+        )?;
     }
 
     // insert everything in the same writer.
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index d86d09bc8..e185566ca 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -11,7 +11,7 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::proximity::{index_proximity, MAX_DISTANCE};
+use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{DocumentId, Result};
@@ -24,9 +24,20 @@ use crate::{DocumentId, Result};
 pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
-    _settings_diff: &InnerIndexSettingsDiff,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
+    let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
+    let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
+
+    // early return if the data shouldn't be deleted nor created.
+    if !any_deletion && !any_addition {
+        return tempfile::tempfile()
+            .map_err(Into::into)
+            .map(BufReader::new)
+            .and_then(grenad::Reader::new)
+            .map_err(Into::into);
+    }
 
     let max_memory = indexer.max_memory_by_thread();
 
@@ -79,6 +90,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 
         let (del, add): (Result<_>, Result<_>) = rayon::join(
             || {
+                if !any_deletion {
+                    return Ok(());
+                }
+
                 // deletions
                 if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
                     for (position, word) in KvReaderU16::new(deletion).iter() {
@@ -108,6 +123,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
                 Ok(())
             },
             || {
+                if !any_addition {
+                    return Ok(());
+                }
+
                 // additions
                 if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
                     for (position, word) in KvReaderU16::new(addition).iter() {
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index a6b73efde..924561dea 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -9,7 +9,6 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
 
-use std::collections::HashSet;
 use std::fs::File;
 use std::io::BufReader;
 
@@ -30,7 +29,6 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
-use crate::proximity::ProximityPrecision;
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{FieldId, Result};
 
@@ -200,12 +198,14 @@ fn run_extraction_task<FE, FS, M>(
     M: Send,
 {
     let current_span = tracing::Span::current();
+    /// TODO: remove clone
+    let settings_diff = settings_diff.clone();
 
     rayon::spawn(move || {
         let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
         let _entered = child_span.enter();
         puffin::profile_scope!("extract_multiple_chunks", name);
-        match extract_fn(chunk, indexer, settings_diff) {
+        match extract_fn(chunk, indexer, &settings_diff) {
             Ok(chunk) => {
                 let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
             }
@@ -235,50 +235,54 @@ fn send_original_documents_data(
         .thread_name(|index| format!("embedding-request-{index}"))
         .build()?;
 
-    rayon::spawn(move || {
-        for (name, (embedder, prompt)) in embedders {
-            let result = extract_vector_points(
-                documents_chunk_cloned.clone(),
-                indexer,
-                &field_id_map,
-                &prompt,
-                &name,
-            );
-            match result {
-                Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
-                    let embeddings = match extract_embeddings(
+    if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
+        /// TODO: remove clone
+        let settings_diff = settings_diff.clone();
+        rayon::spawn(move || {
+            for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
+                let result = extract_vector_points(
+                    documents_chunk_cloned.clone(),
+                    indexer,
+                    &settings_diff,
+                    &prompt,
+                    &name,
+                );
+                match result {
+                    Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
+                        let embeddings = match extract_embeddings(
                         prompts,
                         indexer,
                         embedder.clone(),
                         &request_threads,
                     ) {
-                        Ok(results) => Some(results),
-                        Err(error) => {
-                            let _ = lmdb_writer_sx_cloned.send(Err(error));
-                            None
-                        }
-                    };
+                                Ok(results) => Some(results),
+                                Err(error) => {
+                                    let _ = lmdb_writer_sx_cloned.send(Err(error));
+                                    None
+                                }
+                            };
 
-                    if !(remove_vectors.is_empty()
-                        && manual_vectors.is_empty()
-                        && embeddings.as_ref().map_or(true, |e| e.is_empty()))
-                    {
-                        let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
-                            remove_vectors,
-                            embeddings,
-                            expected_dimension: embedder.dimensions(),
-                            manual_vectors,
-                            embedder_name: name,
-                        }));
+                        if !(remove_vectors.is_empty()
+                            && manual_vectors.is_empty()
+                            && embeddings.as_ref().map_or(true, |e| e.is_empty()))
+                        {
+                            let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
+                                remove_vectors,
+                                embeddings,
+                                expected_dimension: embedder.dimensions(),
+                                manual_vectors,
+                                embedder_name: name,
+                            }));
+                        }
+                    }
+
+                    Err(error) => {
+                        let _ = lmdb_writer_sx_cloned.send(Err(error));
                     }
                 }
-
-                Err(error) => {
-                    let _ = lmdb_writer_sx_cloned.send(Err(error));
-                }
             }
-        }
-    });
+        });
+    }
 
     // TODO: create a custom internal error
     let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 6bc5b6ff9..c3b081c37 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -6,7 +6,6 @@ mod typed_chunk;
 
 use std::collections::{HashMap, HashSet};
 use std::io::{Read, Seek};
-use std::iter::FromIterator;
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;
 
@@ -281,7 +280,7 @@ where
 
         let TransformOutput {
             primary_key,
-            settings_diff,
+            mut settings_diff,
             field_distribution,
             documents_count,
             original_documents,
@@ -319,13 +318,8 @@ where
         ) = crossbeam_channel::unbounded();
 
         // get the primary key field id
-        let primary_key_id = output.settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
+        let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
 
-        // get searchable fields for word databases
-        let searchable_fields =
-            self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
-        // get filterable fields for facet databases
-        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
         // get the fid of the `_geo.lat` and `_geo.lng` fields.
         let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
 
@@ -348,12 +342,6 @@ where
             None => None,
         };
 
-        let stop_words = self.index.stop_words(self.wtxn)?;
-        let separators = self.index.allowed_separators(self.wtxn)?;
-        let dictionary = self.index.dictionary(self.wtxn)?;
-        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
-        let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
-
         let pool_params = GrenadParameters {
             chunk_compression_type: self.indexer_config.chunk_compression_type,
             chunk_compression_level: self.indexer_config.chunk_compression_level,
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 003353793..e82600683 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -1,12 +1,11 @@
 use std::borrow::Cow;
 use std::collections::btree_map::Entry as BEntry;
 use std::collections::hash_map::Entry as HEntry;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::fs::File;
 use std::io::{Read, Seek};
 
 use fxhash::FxHashMap;
-use heed::RoTxn;
 use itertools::Itertools;
 use obkv::{KvReader, KvReaderU16, KvWriter};
 use roaring::RoaringBitmap;
@@ -814,7 +813,8 @@ impl<'a, 'i> Transform<'a, 'i> {
         let settings_diff = InnerIndexSettingsDiff {
             old: old_inner_settings,
             new: new_inner_settings,
-            embedding_configs_updated: true,
+            embedding_configs_updated: false,
+            settings_update_only: false,
         };
 
         Ok(TransformOutput {
@@ -844,13 +844,16 @@ impl<'a, 'i> Transform<'a, 'i> {
                 obkv_writer.insert(id, val)?;
             }
         }
-        let new_obkv = KvReader::<FieldId>::new(&obkv_writer.into_inner()?);
+        let data = obkv_writer.into_inner()?;
+        let new_obkv = KvReader::<FieldId>::new(&data);
 
         // take the non-flattened version if flatten_from_fields_ids_map returns None.
-        let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?
-            .map_or_else(|| old_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
-        let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?
-            .map_or_else(|| new_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
+        let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
+        let old_flattened =
+            old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
+        let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
+        let new_flattened =
+            new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);
 
         original_obkv_buffer.clear();
         flattened_obkv_buffer.clear();
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index ae4304fce..6c770c0a1 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1010,6 +1010,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             }
             Setting::NotSet => false,
         };
+
+        // if any changes force a reindexing
+        // clear the vector database.
+        if update {
+            self.index.vector_arroy.clear(self.wtxn)?;
+        }
+
         Ok(update)
     }
 
@@ -1077,6 +1084,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             old: old_inner_settings,
             new: new_inner_settings,
             embedding_configs_updated,
+            settings_update_only: true,
         };
 
         if inner_settings_diff.any_reindexing_needed() {
@@ -1087,20 +1095,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
     }
 }
 
-pub(crate) struct InnerIndexSettingsDiff {
-    pub old: InnerIndexSettings,
-    pub new: InnerIndexSettings,
+#[derive(Clone)]
+pub struct InnerIndexSettingsDiff {
+    pub(crate) old: InnerIndexSettings,
+    pub(crate) new: InnerIndexSettings,
 
     // TODO: compare directly the embedders.
-    pub embedding_configs_updated: bool,
+    pub(crate) embedding_configs_updated: bool,
+
+    pub(crate) settings_update_only: bool,
 }
 
 impl InnerIndexSettingsDiff {
-    fn any_reindexing_needed(&self) -> bool {
+    pub fn any_reindexing_needed(&self) -> bool {
         self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
     }
 
-    fn reindex_searchable(&self) -> bool {
+    pub fn reindex_searchable(&self) -> bool {
         self.old
             .fields_ids_map
             .iter()
@@ -1115,13 +1126,13 @@ impl InnerIndexSettingsDiff {
             || self.old.proximity_precision != self.new.proximity_precision
     }
 
-    fn reindex_facets(&self) -> bool {
-        let existing_fields = self.new.existing_fields;
+    pub fn reindex_facets(&self) -> bool {
+        let existing_fields = &self.new.existing_fields;
         if existing_fields.iter().any(|field| field.contains('.')) {
             return true;
         }
 
-        let old_faceted_fields = self.old.user_defined_faceted_fields;
+        let old_faceted_fields = &self.old.user_defined_faceted_fields;
         if old_faceted_fields.iter().any(|field| field.contains('.')) {
             return true;
         }
@@ -1129,13 +1140,13 @@ impl InnerIndexSettingsDiff {
         // If there is new faceted fields we indicate that we must reindex as we must
         // index new fields as facets. It means that the distinct attribute,
         // an Asc/Desc criterion or a filtered attribute as be added or removed.
-        let new_faceted_fields = self.new.user_defined_faceted_fields;
+        let new_faceted_fields = &self.new.user_defined_faceted_fields;
         if new_faceted_fields.iter().any(|field| field.contains('.')) {
             return true;
         }
 
         let faceted_updated =
-            (&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
+            (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);
 
         self.old
             .fields_ids_map
@@ -1145,9 +1156,13 @@ impl InnerIndexSettingsDiff {
             || faceted_updated
     }
 
-    fn reindex_vectors(&self) -> bool {
+    pub fn reindex_vectors(&self) -> bool {
         self.embedding_configs_updated
     }
+
+    pub fn settings_update_only(&self) -> bool {
+        self.settings_update_only
+    }
 }
 
 #[derive(Clone)]

From a489b406b4b3ff60d3bd97edef3702e91e2767a6 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 3 Apr 2024 19:10:19 +0200
Subject: [PATCH 08/16] fix test

---
 meilisearch/tests/settings/get_settings.rs    |  4 +++-
 .../extract/extract_docid_word_positions.rs   |  3 ---
 .../src/update/index_documents/extract/mod.rs | 24 +++++++++----------
 milli/src/update/index_documents/mod.rs       |  2 +-
 milli/src/update/settings.rs                  |  1 -
 5 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs
index 980ef3064..042dcca41 100644
--- a/meilisearch/tests/settings/get_settings.rs
+++ b/meilisearch/tests/settings/get_settings.rs
@@ -113,7 +113,8 @@ async fn secrets_are_hidden_in_settings() {
                 "default": {
                     "source": "rest",
                     "url": "https://localhost:7777",
-                    "apiKey": "My super secret value you will never guess"
+                    "apiKey": "My super secret value you will never guess",
+                    "dimensions": 4,
                 }
             }
         }))
@@ -184,6 +185,7 @@ async fn secrets_are_hidden_in_settings() {
         "default": {
           "source": "rest",
           "apiKey": "My suXXXXXX...",
+          "dimensions": 4,
           "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
           "url": "https://localhost:7777",
           "query": null,
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 6cf7b3167..6af5bba6d 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -55,7 +55,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let mut value_buffer = Vec::new();
 
     // initialize tokenizer.
-    /// TODO: Fix ugly allocation
     let old_stop_words = settings_diff.old.stop_words.as_ref();
     let old_separators: Option<Vec<_>> = settings_diff
         .old
@@ -72,7 +71,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     );
     let del_tokenizer = del_builder.build();
 
-    /// TODO: Fix ugly allocation
     let new_stop_words = settings_diff.new.stop_words.as_ref();
     let new_separators: Option<Vec<_>> = settings_diff
         .new
@@ -267,7 +265,6 @@ fn lang_safe_tokens_from_document<'a>(
         // then we don't rerun the extraction.
         if !script_language.is_empty() {
             // build a new temporary tokenizer including the allow list.
-            /// TODO: Fix ugly allocation
             let stop_words = settings.stop_words.as_ref();
             let separators: Option<Vec<_>> = settings
                 .allowed_separators
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 924561dea..341cdc9f9 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -198,7 +198,6 @@ fn run_extraction_task<FE, FS, M>(
     M: Send,
 {
     let current_span = tracing::Span::current();
-    /// TODO: remove clone
     let settings_diff = settings_diff.clone();
 
     rayon::spawn(move || {
@@ -236,7 +235,6 @@ fn send_original_documents_data(
         .build()?;
 
     if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
-        /// TODO: remove clone
         let settings_diff = settings_diff.clone();
         rayon::spawn(move || {
             for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
@@ -250,17 +248,17 @@ fn send_original_documents_data(
                 match result {
                     Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
                         let embeddings = match extract_embeddings(
-                        prompts,
-                        indexer,
-                        embedder.clone(),
-                        &request_threads,
-                    ) {
-                                Ok(results) => Some(results),
-                                Err(error) => {
-                                    let _ = lmdb_writer_sx_cloned.send(Err(error));
-                                    None
-                                }
-                            };
+                            prompts,
+                            indexer,
+                            embedder.clone(),
+                            &request_threads,
+                        ) {
+                            Ok(results) => Some(results),
+                            Err(error) => {
+                                let _ = lmdb_writer_sx_cloned.send(Err(error));
+                                None
+                            }
+                        };
 
                         if !(remove_vectors.is_empty()
                             && manual_vectors.is_empty()
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index c3b081c37..47f1e9f19 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -252,7 +252,7 @@ where
             let number_of_documents = self.index.number_of_documents(self.wtxn)?;
             return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
         }
-        let mut output = self
+        let output = self
             .transform
             .take()
             .expect("Invalid document addition state")
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 6c770c0a1..ae9ae2801 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -400,7 +400,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
     {
         puffin::profile_function!();
 
-        let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
         // if the settings are set before any document update, we don't need to do anything, and
         // will set the primary key during the first document addition.
         if self.index.number_of_documents(self.wtxn)? == 0 {

From bad46f88d606dc83de287291fa593e7b1ad6a035 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 4 Apr 2024 11:01:50 +0200
Subject: [PATCH 09/16] Fix embedder test

---
 index-scheduler/src/lib.rs                                      | 1 +
 .../test_settings_update/after_registering_settings_task.snap   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs
index 5901e45f8..5704f5354 100644
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -3041,6 +3041,7 @@ mod tests {
             source: Setting::Set(milli::vector::settings::EmbedderSource::Rest),
             api_key: Setting::Set(S("My super secret")),
             url: Setting::Set(S("http://localhost:7777")),
+            dimensions: Setting::Set(4),
             ..Default::default()
         };
         embedders.insert(S("default"), Setting::Set(embedding_settings));
diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap
index 8c081b84b..205200965 100644
--- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap
+++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap
@@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
+0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
 ----------------------------------------------------------------------
 ### Status:
 enqueued [0,]

From e5ae337aae71354d338aa8475bc6a819a2f1af9c Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 4 Apr 2024 17:03:18 +0200
Subject: [PATCH 10/16] Come back to sorters in extract_word_docids

	Using buffers and merging the keys manually is less efficient.
---
 .../extract/extract_word_docids.rs            | 79 ++++++-------------
 1 file changed, 26 insertions(+), 53 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 2be41bb86..5699f2fb6 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -14,6 +14,7 @@ use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::index_documents::helpers::sorter_into_reader;
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::update::MergeFn;
 use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
@@ -45,7 +46,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory,
+        max_memory.map(|m| m / 3),
     );
     let mut key_buffer = Vec::new();
     let mut del_words = BTreeSet::new();
@@ -93,25 +94,27 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         tempfile::tempfile()?,
     );
 
-    let mut word_docids_writer = create_writer(
+    let mut word_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Unstable,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
-        tempfile::tempfile()?,
+        indexer.max_nb_chunks,
+        max_memory.map(|m| m / 3),
     );
 
-    let mut exact_word_docids_writer = create_writer(
+    let mut exact_word_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Unstable,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
-        tempfile::tempfile()?,
+        indexer.max_nb_chunks,
+        max_memory.map(|m| m / 3),
     );
 
-    let mut word: Option<String> = None;
-    let mut deletions = RoaringBitmap::new();
-    let mut additions = RoaringBitmap::new();
-    let mut exact_deletions = RoaringBitmap::new();
-    let mut exact_additions = RoaringBitmap::new();
     let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
-    // TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
+    let mut buffer = Vec::new();
+    // NOTE: replacing sorters with bitmap merging is less efficient, so use sorters.
     while let Some((key, value)) = iter.next()? {
         // only keep the value if their is a change to apply in the DB.
         if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
@@ -121,66 +124,36 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         let (w, fid) = StrBEU16Codec::bytes_decode(key)
             .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
 
-        if let Some(current) = word.as_ref() {
-            if current != w {
-                docids_into_writers(&current, &deletions, &additions, &mut word_docids_writer)?;
-                docids_into_writers(
-                    &current,
-                    &exact_deletions,
-                    &exact_additions,
-                    &mut exact_word_docids_writer,
-                )?;
-                word = Some(w.to_string());
-                // clear buffers
-                deletions.clear();
-                additions.clear();
-                exact_deletions.clear();
-                exact_additions.clear();
-            }
-        } else {
-            word = Some(w.to_string());
-        }
-
         // merge all deletions
         let obkv = KvReaderDelAdd::new(value);
         if let Some(value) = obkv.get(DelAdd::Deletion) {
             let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
-            let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| {
-                SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) }
-            })?;
+            buffer.clear();
+            let mut obkv = KvWriterDelAdd::new(&mut buffer);
+            obkv.insert(DelAdd::Deletion, value)?;
             if delete_from_exact {
-                exact_deletions |= docids;
+                exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
             } else {
-                deletions |= docids
+                word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
             }
         }
         // merge all additions
         if let Some(value) = obkv.get(DelAdd::Addition) {
             let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
-            let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| {
-                SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) }
-            })?;
+            buffer.clear();
+            let mut obkv = KvWriterDelAdd::new(&mut buffer);
+            obkv.insert(DelAdd::Addition, value)?;
             if add_in_exact {
-                exact_additions |= docids;
+                exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
             } else {
-                additions |= docids
+                word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
             }
         }
     }
 
-    if let Some(word) = word {
-        docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?;
-        docids_into_writers(
-            &word,
-            &exact_deletions,
-            &exact_additions,
-            &mut exact_word_docids_writer,
-        )?;
-    }
-
     Ok((
-        writer_into_reader(word_docids_writer)?,
-        writer_into_reader(exact_word_docids_writer)?,
+        sorter_into_reader(word_docids_sorter, indexer)?,
+        sorter_into_reader(exact_word_docids_sorter, indexer)?,
         writer_into_reader(word_fid_docids_writer)?,
     ))
 }

From 5ab901dd30da4cfd4d474dfd0e6793eea5ec4c9c Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 4 Apr 2024 17:23:36 +0200
Subject: [PATCH 11/16] Fix tests

---
 .../src/snapshots/index_scheduler__tests__settings_update.snap   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap
index 85f0926b9..72a25f915 100644
--- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap
+++ b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap
@@ -7,6 +7,7 @@ expression: task.details
     "default": {
       "source": "rest",
       "apiKey": "MyXXXX...",
+      "dimensions": 4,
       "url": "http://localhost:7777"
     }
   }

From eaf113ef34c84e213dfcc96cdc2bc08837a45390 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 15 Apr 2024 11:28:30 +0200
Subject: [PATCH 12/16] Fix word pair proximity error when nothing has to be
 extracted

---
 .../extract/extract_word_pair_proximity_docids.rs     | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index e185566ca..23f70ccd2 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -32,11 +32,12 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 
     // early return if the data shouldn't be deleted nor created.
     if !any_deletion && !any_addition {
-        return tempfile::tempfile()
-            .map_err(Into::into)
-            .map(BufReader::new)
-            .and_then(grenad::Reader::new)
-            .map_err(Into::into);
+        let writer = create_writer(
+            indexer.chunk_compression_type,
+            indexer.chunk_compression_level,
+            tempfile::tempfile()?,
+        );
+        return writer_into_reader(writer);
     }
 
     let max_memory = indexer.max_memory_by_thread();

From 87a93ba47db3f1dc3b916bd686488306a4b489e5 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 15 Apr 2024 13:30:41 +0200
Subject: [PATCH 13/16] fix clippy

---
 .../update/index_documents/extract/extract_vector_points.rs | 6 +++---
 milli/src/update/index_documents/transform.rs               | 2 +-
 milli/src/update/settings.rs                                | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs
index fc79a861f..23f945c7a 100644
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -171,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
                     VectorStateDelta::NowGenerated(prompt.render(
                         obkv,
                         DelAdd::Addition,
-                        &new_fields_ids_map,
+                        new_fields_ids_map,
                     )?)
                 } else {
                     VectorStateDelta::NowRemoved
@@ -199,9 +199,9 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
                 if document_is_kept {
                     // Don't give up if the old prompt was failing
                     let old_prompt = prompt
-                        .render(obkv, DelAdd::Deletion, &old_fields_ids_map)
+                        .render(obkv, DelAdd::Deletion, old_fields_ids_map)
                         .unwrap_or_default();
-                    let new_prompt = prompt.render(obkv, DelAdd::Addition, &new_fields_ids_map)?;
+                    let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
                     if old_prompt != new_prompt {
                         tracing::trace!(
                             "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index e82600683..90c3dbcc0 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -807,7 +807,7 @@ impl<'a, 'i> Transform<'a, 'i> {
             fst_new_external_documents_ids_builder.insert(key, value)
         })?;
 
-        let old_inner_settings = InnerIndexSettings::from_index(&self.index, wtxn)?;
+        let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?;
         let mut new_inner_settings = old_inner_settings.clone();
         new_inner_settings.fields_ids_map = self.fields_ids_map;
         let settings_diff = InnerIndexSettingsDiff {
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index ae9ae2801..8ded6f03c 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1044,7 +1044,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
     {
         self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
 
-        let old_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?;
+        let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
 
         // never trigger re-indexing
         self.update_displayed()?;
@@ -1078,7 +1078,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
         let embedding_configs_updated = self.update_embedding_configs()?;
 
-        let new_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?;
+        let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
         let inner_settings_diff = InnerIndexSettingsDiff {
             old: old_inner_settings,
             new: new_inner_settings,

From a1ea224da97753e32edfe10eb4cffa203a3180f3 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 16 Apr 2024 14:51:21 +0200
Subject: [PATCH 14/16] Fix tests

---
 .../snapshots/index_scheduler__tests__settings_update-2.snap    | 1 +
 .../snapshots/index_scheduler__tests__settings_update-3.snap    | 2 +-
 .../test_settings_update/after_registering_settings_task.snap   | 2 +-
 .../lib.rs/test_settings_update/settings_update_processed.snap  | 2 +-
 meilisearch/tests/settings/get_settings.rs                      | 1 +
 5 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap
index 85f0926b9..72a25f915 100644
--- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap
+++ b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap
@@ -7,6 +7,7 @@ expression: task.details
     "default": {
       "source": "rest",
       "apiKey": "MyXXXX...",
+      "dimensions": 4,
       "url": "http://localhost:7777"
     }
   }
diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap
index 50a42d678..f7ae1c00a 100644
--- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap
+++ b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap
@@ -6,7 +6,7 @@ expression: embedding_config.embedder_options
   "Rest": {
     "api_key": "My super secret",
     "distribution": null,
-    "dimensions": null,
+    "dimensions": 4,
     "url": "http://localhost:7777",
     "query": null,
     "input_field": [
diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap
index 205200965..f3b94fb3c 100644
--- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap
+++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap
@@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
+0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
 ----------------------------------------------------------------------
 ### Status:
 enqueued [0,]
diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap
index f6fb6a186..830331f61 100644
--- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap
+++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap
@@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
+0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
 ----------------------------------------------------------------------
 ### Status:
 enqueued []
diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs
index 042dcca41..cd31d4959 100644
--- a/meilisearch/tests/settings/get_settings.rs
+++ b/meilisearch/tests/settings/get_settings.rs
@@ -213,6 +213,7 @@ async fn secrets_are_hidden_in_settings() {
         "default": {
           "source": "rest",
           "apiKey": "My suXXXXXX...",
+          "dimensions": 4,
           "url": "https://localhost:7777"
         }
       }

From 3acfab2eb784526c43beae1f8192bc9b2b4816be Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 17 Apr 2024 10:54:48 +0200
Subject: [PATCH 15/16] Fix PR comments

---
 milli/src/update/index_documents/extract/mod.rs | 7 ++++---
 milli/src/update/index_documents/mod.rs         | 3 +++
 milli/src/update/index_documents/transform.rs   | 3 +++
 milli/src/update/settings.rs                    | 1 -
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 341cdc9f9..bf533cfc9 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -11,6 +11,7 @@ mod extract_word_position_docids;
 
 use std::fs::File;
 use std::io::BufReader;
+use std::sync::Arc;
 
 use crossbeam_channel::Sender;
 use rayon::prelude::*;
@@ -43,7 +44,7 @@ pub(crate) fn data_from_obkv_documents(
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    settings_diff: &InnerIndexSettingsDiff,
+    settings_diff: &Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<()> {
     puffin::profile_function!();
@@ -180,7 +181,7 @@ pub(crate) fn data_from_obkv_documents(
 fn run_extraction_task<FE, FS, M>(
     chunk: grenad::Reader<CursorClonableMmap>,
     indexer: GrenadParameters,
-    settings_diff: &InnerIndexSettingsDiff,
+    settings_diff: &Arc<InnerIndexSettingsDiff>,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     extract_fn: FE,
     serialize_fn: FS,
@@ -221,7 +222,7 @@ fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    settings_diff: &InnerIndexSettingsDiff,
+    settings_diff: &Arc<InnerIndexSettingsDiff>,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 47f1e9f19..070f31c73 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -8,6 +8,7 @@ use std::collections::{HashMap, HashSet};
 use std::io::{Read, Seek};
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;
+use std::sync::Arc;
 
 use crossbeam_channel::{Receiver, Sender};
 use grenad::{Merger, MergerBuilder};
@@ -292,6 +293,8 @@ where
         settings_diff.new.recompute_facets(self.wtxn, self.index)?;
         settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
 
+        let settings_diff = Arc::new(settings_diff);
+
         let backup_pool;
         let pool = match self.indexer_config.thread_pool {
             Some(ref pool) => pool,
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 90c3dbcc0..8a3463e6f 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -829,6 +829,9 @@ impl<'a, 'i> Transform<'a, 'i> {
         })
     }
 
+    /// Rebind the field_ids of the provided document to their values
+    /// based on the field_ids_maps difference between the old and the new settings,
+    /// then fill the provided buffers with delta documents using KvWriterDelAdd.
     fn rebind_existing_document(
         old_obkv: KvReader<FieldId>,
         settings_diff: &InnerIndexSettingsDiff,
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 8ded6f03c..1997e966e 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1094,7 +1094,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
     }
 }
 
-#[derive(Clone)]
 pub struct InnerIndexSettingsDiff {
     pub(crate) old: InnerIndexSettings,
     pub(crate) new: InnerIndexSettings,

From df29ba709a1d016fb000182fce116457d5f5b403 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 17 Apr 2024 12:33:25 +0200
Subject: [PATCH 16/16] Make some cleaning in Arcs

---
 .../src/update/index_documents/extract/mod.rs | 29 +++++++++----------
 milli/src/update/index_documents/mod.rs       |  2 +-
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index bf533cfc9..bc6fe2aff 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -44,7 +44,7 @@ pub(crate) fn data_from_obkv_documents(
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    settings_diff: &Arc<InnerIndexSettingsDiff>,
+    settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<()> {
     puffin::profile_function!();
@@ -58,7 +58,7 @@ pub(crate) fn data_from_obkv_documents(
                         original_documents_chunk,
                         indexer,
                         lmdb_writer_sx.clone(),
-                        settings_diff,
+                        settings_diff.clone(),
                     )
                 })
                 .collect::<Result<()>>()
@@ -73,7 +73,7 @@ pub(crate) fn data_from_obkv_documents(
                         lmdb_writer_sx.clone(),
                         primary_key_id,
                         geo_fields_ids,
-                        settings_diff,
+                        settings_diff.clone(),
                         max_positions_per_attributes,
                     )
                 })
@@ -86,7 +86,7 @@ pub(crate) fn data_from_obkv_documents(
                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
                             docid_word_positions_chunk.clone(),
                             indexer,
-                            settings_diff,
+                            settings_diff.clone(),
                             lmdb_writer_sx.clone(),
                             extract_fid_word_count_docids,
                             TypedChunk::FieldIdWordCountDocids,
@@ -103,7 +103,7 @@ pub(crate) fn data_from_obkv_documents(
                         >(
                             docid_word_positions_chunk.clone(),
                             indexer,
-                            settings_diff,
+                            settings_diff.clone(),
                             lmdb_writer_sx.clone(),
                             extract_word_docids,
                             |(
@@ -123,7 +123,7 @@ pub(crate) fn data_from_obkv_documents(
                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
                             docid_word_positions_chunk.clone(),
                             indexer,
-                            settings_diff,
+                            settings_diff.clone(),
                             lmdb_writer_sx.clone(),
                             extract_word_position_docids,
                             TypedChunk::WordPositionDocids,
@@ -137,7 +137,7 @@ pub(crate) fn data_from_obkv_documents(
                         >(
                             fid_docid_facet_strings_chunk.clone(),
                             indexer,
-                            settings_diff,
+                            settings_diff.clone(),
                             lmdb_writer_sx.clone(),
                             extract_facet_string_docids,
                             TypedChunk::FieldIdFacetStringDocids,
@@ -147,7 +147,7 @@ pub(crate) fn data_from_obkv_documents(
                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
                             fid_docid_facet_numbers_chunk.clone(),
                             indexer,
-                            settings_diff,
+                            settings_diff.clone(),
                             lmdb_writer_sx.clone(),
                             extract_facet_number_docids,
                             TypedChunk::FieldIdFacetNumberDocids,
@@ -157,7 +157,7 @@ pub(crate) fn data_from_obkv_documents(
                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
                             docid_word_positions_chunk.clone(),
                             indexer,
-                            settings_diff,
+                            settings_diff.clone(),
                             lmdb_writer_sx.clone(),
                             extract_word_pair_proximity_docids,
                             TypedChunk::WordPairProximityDocids,
@@ -181,7 +181,7 @@ pub(crate) fn data_from_obkv_documents(
 fn run_extraction_task<FE, FS, M>(
     chunk: grenad::Reader<CursorClonableMmap>,
     indexer: GrenadParameters,
-    settings_diff: &Arc<InnerIndexSettingsDiff>,
+    settings_diff: Arc<InnerIndexSettingsDiff>,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     extract_fn: FE,
     serialize_fn: FS,
@@ -199,7 +199,6 @@ fn run_extraction_task<FE, FS, M>(
     M: Send,
 {
     let current_span = tracing::Span::current();
-    let settings_diff = settings_diff.clone();
 
     rayon::spawn(move || {
         let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
@@ -222,7 +221,7 @@ fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    settings_diff: &Arc<InnerIndexSettingsDiff>,
+    settings_diff: Arc<InnerIndexSettingsDiff>,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@@ -302,7 +301,7 @@ fn send_and_extract_flattened_documents_data(
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    settings_diff: &InnerIndexSettingsDiff,
+    settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<(
     grenad::Reader<CursorClonableMmap>,
@@ -331,7 +330,7 @@ fn send_and_extract_flattened_documents_data(
                     extract_docid_word_positions(
                         flattened_documents_chunk.clone(),
                         indexer,
-                        settings_diff,
+                        &settings_diff,
                         max_positions_per_attributes,
                     )?;
 
@@ -354,7 +353,7 @@ fn send_and_extract_flattened_documents_data(
                 } = extract_fid_docid_facet_values(
                     flattened_documents_chunk.clone(),
                     indexer,
-                    settings_diff,
+                    &settings_diff,
                     geo_fields_ids,
                 )?;
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 070f31c73..aa9789a1a 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -412,7 +412,7 @@ where
                         lmdb_writer_sx.clone(),
                         primary_key_id,
                         geo_fields_ids,
-                        &settings_diff,
+                        settings_diff.clone(),
                         max_positions_per_attributes,
                     )
                 });