Merge #708

708: Reduce memory usage of the MatchingWords structure r=ManyTheFish a=loiclec # Pull Request ## Related issue Fixes (partially) https://github.com/meilisearch/meilisearch/issues/3115 ## What does this PR do? 1. Reduces the memory usage caused by the creation of a 10-word query tree by 20x. This is done by deduplicating the `MatchingWord` values, which are heavy because of their inner DFA. The deduplication works by wrapping each `MatchingWord` in a reference-counted box and using a hash map to determine whether a `MatchingWord` DFA already exists for a certain signature, or whether a new one needs to be built. 2. Avoid the worst-case scenario of creating a `MatchingWord` for extremely long words that cannot be indexed by milli. Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
2025-07-01 02:48:31 +02:00 · 2022-11-30 17:47:34 +00:00 · 2022-11-30 17:47:34 +00:00 · 5e754b3ee0
commit 5e754b3ee0
parent e1612fcb01 61b58b115a
9 changed files with 261 additions and 86 deletions
--- a/benchmarks/benches/formatting.rs
+++ b/benchmarks/benches/formatting.rs
@ -1,3 +1,5 @@
 use std::rc::Rc;
 use criterion::{criterion_group, criterion_main};
 use milli::tokenizer::TokenizerBuilder;
 use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
@ -18,14 +20,14 @@ fn bench_formatting(c: &mut criterion::Criterion) {
    		name: "'the door d'",
 			text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#,
 			matching_words: MatcherBuilder::new(MatchingWords::new(vec![
-	            (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]),
+	            (vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]),
-	            (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
+	            (vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]),
-	            (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]),
+	            (vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]),
-	            (vec![MatchingWord::new("do".to_string(), 0, false), MatchingWord::new("or".to_string(), 0, false)], vec![0]),
+	            (vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]),
-	            (vec![MatchingWord::new("thedoor".to_string(), 1, false)], vec![0, 1]),
+	            (vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]),
-	            (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]),
+	            (vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]),
-	            (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]),
+	            (vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]),
-	            (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]),
+	            (vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]),
        	]
            ), TokenizerBuilder::default().build()),
 		},
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -70,6 +70,21 @@ pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;
 /// expressed in term of latitude and longitude.
 pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;
 /// The maximum length a LMDB key can be.
 ///
 /// Note that the actual allowed length is a little bit higher, but
 /// we keep a margin of safety.
 const MAX_LMDB_KEY_LENGTH: usize = 500;
 /// The maximum length a field value can be when inserted in an LMDB key.
 ///
 /// This number is determined by the keys of the different facet databases
 /// and adding a margin of safety.
 pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
 /// The maximum length a word can be
 pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
 pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
 // Convert an absolute word position into a relative position.
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@ -2,11 +2,13 @@ use std::cmp::{min, Reverse};
 use std::collections::BTreeMap;
 use std::fmt;
 use std::ops::{Index, IndexMut};
 use std::rc::Rc;
 use charabia::Token;
 use levenshtein_automata::{Distance, DFA};
 use crate::search::build_dfa;
 use crate::MAX_WORD_LENGTH;
 type IsPrefix = bool;
@ -14,11 +16,22 @@ type IsPrefix = bool;
 /// referencing words that match the given query tree.
 #[derive(Default)]
 pub struct MatchingWords {
-    inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+    inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
 }
 // Debug output is a bracketed list with one `(matching_words, primitive_word_ids)`
 // entry per line, which keeps snapshot tests of this structure readable.
 impl fmt::Debug for MatchingWords {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "[")?;
        // Entries are printed in the order in which they are stored in `inner`.
        for (matching_words, primitive_word_id) in self.inner.iter() {
            writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?;
        }
        writeln!(f, "]")?;
        Ok(())
    }
 }
 impl MatchingWords {
-    pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
+    pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
        // Sort words by length in DESC order, prioritizing the longest matches,
        // in order to highlight the longest part of the matched word.
        matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
@ -35,7 +48,8 @@ impl MatchingWords {
 /// Iterator over terms that match the given token,
 /// This allow to lazily evaluate matches.
 pub struct MatchesIter<'a, 'b> {
-    inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
+    #[allow(clippy::type_complexity)]
    inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
    token: &'b Token<'b>,
 }
@ -91,10 +105,13 @@ impl PartialEq for MatchingWord {
 }
 impl MatchingWord {
-    pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
+    pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
        if word.len() > MAX_WORD_LENGTH {
            return None;
        }
        let dfa = build_dfa(&word, typo, prefix);
-        Self { dfa, word, typo, prefix }
+        Some(Self { dfa, word, typo, prefix })
    }
    /// Returns the length in chars of the match in case the token matches the term.
@ -126,7 +143,7 @@ pub enum MatchType<'a> {
 /// Structure helper to match several tokens in a row in order to complete a partial match.
 #[derive(Debug, PartialEq)]
 pub struct PartialMatch<'a> {
-    matching_words: &'a [MatchingWord],
+    matching_words: &'a [Rc<MatchingWord>],
    ids: &'a [PrimitiveWordId],
    char_len: usize,
 }
@ -332,10 +349,15 @@ mod tests {
    #[test]
    fn matching_words() {
        let all = vec![
            Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()),
            Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()),
            Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
        ];
        let matching_words = vec![
-            (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
+            (vec![all[0].clone()], vec![0]),
-            (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
+            (vec![all[1].clone()], vec![1]),
-            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
+            (vec![all[2].clone()], vec![2]),
        ];
        let matching_words = MatchingWords::new(matching_words);
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@ -494,16 +494,23 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
 #[cfg(test)]
 mod tests {
    use std::rc::Rc;
    use charabia::TokenizerBuilder;
    use super::*;
    use crate::search::matches::matching_words::MatchingWord;
    fn matching_words() -> MatchingWords {
        let all = vec![
            Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
            Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
            Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
        ];
        let matching_words = vec![
-            (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
+            (vec![all[0].clone()], vec![0]),
-            (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
+            (vec![all[1].clone()], vec![1]),
-            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
+            (vec![all[2].clone()], vec![2]),
        ];
        MatchingWords::new(matching_words)
@ -587,10 +594,11 @@ mod tests {
    #[test]
    fn highlight_unicode() {
-        let matching_words = vec![
+        let all = vec![
-            (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
+            Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
-            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
+            Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
        ];
        let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
        let matching_words = MatchingWords::new(matching_words);
@ -823,24 +831,20 @@ mod tests {
    #[test]
    fn partial_matches() {
        let all = vec![
            Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
            Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
            Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
            Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
            Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
            Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
        ];
        let matching_words = vec![
-            (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
+            (vec![all[0].clone()], vec![0]),
-            (
+            (vec![all[1].clone(), all[2].clone()], vec![0]),
-                vec![
+            (vec![all[3].clone()], vec![1]),
-                    MatchingWord::new("t".to_string(), 0, false),
+            (vec![all[4].clone(), all[5].clone()], vec![1]),
-                    MatchingWord::new("he".to_string(), 0, false),
+            (vec![all[4].clone()], vec![2]),
                ],
                vec![0],
            ),
            (vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]),
            (
                vec![
                    MatchingWord::new("do".to_string(), 0, false),
                    MatchingWord::new("or".to_string(), 0, false),
                ],
                vec![1],
            ),
            (vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]),
        ];
        let matching_words = MatchingWords::new(matching_words);
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@ -1,5 +1,9 @@
 use std::borrow::Cow;
 use std::cmp::max;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::hash::Hash;
 use std::rc::Rc;
 use std::{fmt, mem};
 use charabia::classifier::ClassifiedTokenIter;
@ -540,6 +544,29 @@ fn create_query_tree(
    Ok(Operation::or(true, operation_children))
 }
 /// Cache that deduplicates `MatchingWord` values by their `(word, typo, prefix)`
 /// signature, so that a given signature is only built once and then shared
 /// through reference-counted pointers.
 #[derive(Default, Debug)]
 struct MatchingWordCache {
    // Every `MatchingWord` built through this cache, in creation order.
    all: Vec<Rc<MatchingWord>>,
    // Lookup from a `(word, typo, prefix)` signature to the shared `MatchingWord`.
    map: HashMap<(String, u8, bool), Rc<MatchingWord>>,
 }
 impl MatchingWordCache {
    /// Returns the shared `MatchingWord` for the given signature, building and
    /// caching it first if it is not already present.
    ///
    /// Returns `None` when `MatchingWord::new` returns `None` for this word;
    /// in that case nothing is added to the cache.
    fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option<Rc<MatchingWord>> {
        // The key needs its own copy of `word`, because `word` itself is moved
        // into `MatchingWord::new` when the entry is vacant.
        match self.map.entry((word.clone(), typo, prefix)) {
            Entry::Occupied(idx) => Some(idx.get().clone()),
            Entry::Vacant(vacant) => {
                // `?` returns early, before anything is pushed or inserted.
                let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
                self.all.push(matching_word.clone());
                vacant.insert(matching_word.clone());
                Some(matching_word)
            }
        }
        // To deactivate the cache, for testing purposes, use the following instead:
        // let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
        // self.all.push(matching_word.clone());
        // Some(matching_word)
    }
 }
 /// Main function that creates the matching words used for crop and highlight.
 fn create_matching_words(
    ctx: &impl Context,
@ -551,7 +578,8 @@ fn create_matching_words(
        ctx: &impl Context,
        authorize_typos: bool,
        part: PrimitiveQueryPart,
-        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        matching_words: &mut Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
        matching_word_cache: &mut MatchingWordCache,
        id: PrimitiveWordId,
    ) -> Result<()> {
        match part {
@ -560,19 +588,28 @@ fn create_matching_words(
            PrimitiveQueryPart::Word(word, prefix) => {
                if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
                    for synonym in synonyms {
-                        let synonym = synonym
+                        // Require that all words of the synonym have a corresponding MatchingWord
                        // before adding any of its words to the matching_words result.
                        if let Some(synonym_matching_words) = synonym
                            .into_iter()
-                            .map(|syn| MatchingWord::new(syn, 0, false))
+                            .map(|word| matching_word_cache.insert(word, 0, false))
-                            .collect();
+                            .collect()
-                        matching_words.push((synonym, vec![id]));
+                        {
                            matching_words.push((synonym_matching_words, vec![id]));
                        }
                    }
                }
                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
-                    let left = MatchingWord::new(left.to_string(), 0, false);
+                    // Require that both left and right words have a corresponding MatchingWord
-                    let right = MatchingWord::new(right.to_string(), 0, false);
+                    // before adding them to the matching_words result
                    if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) {
                        if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false)
                        {
                            matching_words.push((vec![left, right], vec![id]));
                        }
                    }
                }
                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
                let exact_words = ctx.exact_words();
@ -580,18 +617,29 @@ fn create_matching_words(
                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
                let matching_word = match typos(word, authorize_typos, config) {
-                    QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
+                    QueryKind::Exact { word, .. } => matching_word_cache.insert(word, 0, prefix),
-                    QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
+                    QueryKind::Tolerant { typo, word } => {
                        matching_word_cache.insert(word, typo, prefix)
                    }
                };
                if let Some(matching_word) = matching_word {
                    matching_words.push((vec![matching_word], vec![id]));
                }
            }
            // create CONSECUTIVE matching words wrapping all the words in the phrase
            PrimitiveQueryPart::Phrase(words) => {
                let ids: Vec<_> =
                    (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
-                let words =
+                // Require that all words of the phrase have a corresponding MatchingWord
-                    words.into_iter().flatten().map(|w| MatchingWord::new(w, 0, false)).collect();
+                // before adding any of them to the matching_words result
-                matching_words.push((words, ids));
+                if let Some(phrase_matching_words) = words
                    .into_iter()
                    .flatten()
                    .map(|w| matching_word_cache.insert(w, 0, false))
                    .collect()
                {
                    matching_words.push((phrase_matching_words, ids));
                }
            }
        }
@ -603,7 +651,8 @@ fn create_matching_words(
        ctx: &impl Context,
        authorize_typos: bool,
        query: &[PrimitiveQueryPart],
-        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        matching_words: &mut Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
        matching_word_cache: &mut MatchingWordCache,
        mut id: PrimitiveWordId,
    ) -> Result<()> {
        const MAX_NGRAM: usize = 3;
@ -621,6 +670,7 @@ fn create_matching_words(
                                authorize_typos,
                                part.clone(),
                                matching_words,
                                matching_word_cache,
                                id,
                            )?;
                        }
@ -643,13 +693,15 @@ fn create_matching_words(
                            if let Some(synonyms) = ctx.synonyms(&words)? {
                                for synonym in synonyms {
-                                    let synonym = synonym
+                                    if let Some(synonym) = synonym
                                        .into_iter()
-                                        .map(|syn| MatchingWord::new(syn, 0, false))
+                                        .map(|syn| matching_word_cache.insert(syn, 0, false))
-                                        .collect();
+                                        .collect()
                                    {
                                        matching_words.push((synonym, ids.clone()));
                                    }
                                }
                            }
                            let word = words.concat();
                            let (word_len_one_typo, word_len_two_typo) =
                                ctx.min_word_len_for_typo()?;
@ -662,18 +714,27 @@ fn create_matching_words(
                            };
                            let matching_word = match typos(word, authorize_typos, config) {
                                QueryKind::Exact { word, .. } => {
-                                    MatchingWord::new(word, 0, is_prefix)
+                                    matching_word_cache.insert(word, 0, is_prefix)
                                }
                                QueryKind::Tolerant { typo, word } => {
-                                    MatchingWord::new(word, typo, is_prefix)
+                                    matching_word_cache.insert(word, typo, is_prefix)
                                }
                            };
                            if let Some(matching_word) = matching_word {
                                matching_words.push((vec![matching_word], ids));
                            }
                        }
                    }
                    if !is_last {
-                        ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
+                        ngrams(
                            ctx,
                            authorize_typos,
                            tail,
                            matching_words,
                            matching_word_cache,
                            id + 1,
                        )?;
                    }
                }
            }
@ -683,8 +744,9 @@ fn create_matching_words(
        Ok(())
    }
    let mut matching_word_cache = MatchingWordCache::default();
    let mut matching_words = Vec::new();
-    ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
+    ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?;
    Ok(MatchingWords::new(matching_words))
 }
@ -814,6 +876,7 @@ mod test {
    use rand::{Rng, SeedableRng};
    use super::*;
    use crate::index::tests::TempIndex;
    use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
    #[derive(Debug)]
@ -1294,6 +1357,27 @@ mod test {
        );
    }
    #[test]
    fn test_dont_create_matching_word_for_long_words() {
        let index = TempIndex::new();
        let rtxn = index.read_txn().unwrap();
        let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house";
        let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
        builder.words_limit(10);
        let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap();
        insta::assert_snapshot!(format!("{matching_words:?}"), @r###"
        [
        ([MatchingWord { word: "house", typo: 1, prefix: true }], [3])
        ([MatchingWord { word: "house", typo: 1, prefix: true }], [2])
        ([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1])
        ([MatchingWord { word: "house", typo: 1, prefix: true }], [2])
        ([MatchingWord { word: "house", typo: 1, prefix: true }], [1])
        ([MatchingWord { word: "what", typo: 0, prefix: false }], [0])
        ([MatchingWord { word: "a", typo: 0, prefix: false }], [1])
        ]
        "###);
    }
    #[test]
    fn disable_typo_on_word() {
        let query = "goodbye";
@ -1310,4 +1394,67 @@ mod test {
            Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
        ));
    }
    // The memory usage test below is disabled because `cargo test` runs multiple tests in parallel,
    // which invalidates the measurements of memory usage. Nevertheless, it is a useful test to run
    // manually from time to time, so I kept it here, commented-out.
    // use std::alloc::{GlobalAlloc, System};
    // use std::sync::atomic::{self, AtomicI64};
    //
    // #[global_allocator]
    // static ALLOC: CountingAlloc =
    //     CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) };
    //
    // pub struct CountingAlloc {
    //     pub resident: AtomicI64,
    //     pub allocated: AtomicI64,
    // }
    // unsafe impl GlobalAlloc for CountingAlloc {
    //     unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
    //         self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed);
    //         self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed);
    //
    //         System.alloc(layout)
    //     }
    //
    //     unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
    //         self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
    //         System.dealloc(ptr, layout)
    //     }
    // }
    //
    // #[test]
    // fn memory_usage_of_ten_word_query() {
    //     let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst);
    //     let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst);
    //
    //     let index = TempIndex::new();
    //     let rtxn = index.read_txn().unwrap();
    //     let query = "a beautiful summer house by the beach overlooking what seems";
    //     let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
    //     builder.words_limit(10);
    //     let x = builder.build(query.tokenize()).unwrap().unwrap();
    //     let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst);
    //     let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst);
    //
    //     // Weak check on the memory usage
    //     // Don't keep more than 5MB. (Arguably 5MB is already too high)
    //     assert!(resident_after - resident_before < 5_000_000);
    //     // Don't allocate more than 10MB.
    //     assert!(allocated_after - allocated_before < 10_000_000);
    //
    //     // Use these snapshots to measure the exact memory usage.
    //     // The values below were correct at the time I wrote them.
    //     // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950");
    //     // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502");
    //
    //     // Note, with the matching word cache deactivated, the memory usage was:
    //     // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697");
    //     // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588");
    //     // or about 20x more resident memory (90MB vs 4.5MB)
    //
    //     // Use x
    //     let _x = x;
    // }
 }
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@ -7,11 +7,11 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
-use super::helpers::{
+use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
    concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH,
 };
 use crate::error::{InternalError, SerializationError};
-use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::{
    absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
 };
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@ -6,9 +6,8 @@ use heed::BytesEncode;
 use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
 use crate::heed_codec::StrRefCodec;
 use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH;
 use crate::update::index_documents::merge_cbo_roaring_bitmaps;
-use crate::{FieldId, Result};
+use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 /// Extracts the facet string and the documents ids where this facet string appear.
 ///
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@ -12,9 +12,8 @@ use serde_json::Value;
 use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
 use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH;
 use crate::update::index_documents::{create_writer, writer_into_reader};
-use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32};
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
 /// Extracts the facet values of each faceted field of each document.
 ///
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@ -18,20 +18,7 @@ pub use merge_functions::{
    serialize_roaring_bitmap, MergeFn,
 };
-/// The maximum length a LMDB key can be.
+use crate::MAX_WORD_LENGTH;
 ///
 /// Note that the actual allowed length is a little bit higher, but
 /// we keep a margin of safety.
 const MAX_LMDB_KEY_LENGTH: usize = 500;
 /// The maximum length a field value can be when inserted in an LMDB key.
 ///
 /// This number is determined by the keys of the different facet databases
 /// and adding a margin of safety.
 pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
 /// The maximum length a word can be
 pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
 pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
    key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()