Remove the useless threshold when computing the word prefix pair proximity

2025-06-29 01:48:30 +02:00 · 2022-01-12 15:23:46 +01:00 · 2022-01-12 15:23:46 +01:00 · 23ea3ad738
commit 23ea3ad738
parent e3c34684c6
1 changed files with 2 additions and 22 deletions
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -18,7 +18,6 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
    pub(crate) chunk_compression_level: Option<u32>,
    pub(crate) max_nb_chunks: Option<usize>,
    pub(crate) max_memory: Option<usize>,
-    threshold: u32,
 }

 impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
@@ -33,21 +32,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
            chunk_compression_level: None,
            max_nb_chunks: None,
            max_memory: None,
-            threshold: 100,
        }
    }

-    /// Set the number of words required to make a prefix be part of the words prefixes
-    /// database. If a word prefix is supposed to match more than this number of words in the
-    /// dictionnary, therefore this prefix is added to the words prefixes datastructures.
-    ///
-    /// Default value is 100. This value must be higher than 50 and will be clamped
-    /// to these bound otherwise.
-    pub fn threshold(&mut self, value: u32) -> &mut Self {
-        self.threshold = value.max(50);
-        self
-    }
-
    #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
    pub fn execute(self) -> Result<()> {
        debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
@@ -81,7 +68,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
                    write_prefixes_in_sorter(
                        &mut prefixes_cache,
                        &mut word_prefix_pair_proximity_docids_sorter,
-                        self.threshold,
                    )?;
                    prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0]))
                }
@@ -109,7 +95,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
        write_prefixes_in_sorter(
            &mut prefixes_cache,
            &mut word_prefix_pair_proximity_docids_sorter,
-            self.threshold,
        )?;

        drop(prefix_fst);
@@ -131,15 +116,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
 fn write_prefixes_in_sorter(
    prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>,
    sorter: &mut grenad::Sorter<MergeFn>,
-    min_word_per_prefix: u32,
 ) -> Result<()> {
    for (key, data_slices) in prefixes.drain() {
-        // if the number of words prefixed by the prefix is higher than the threshold,
-        // we insert it in the sorter.
-        if data_slices.len() > min_word_per_prefix as usize {
-            for data in data_slices {
-                sorter.insert(&key, data)?;
-            }
+        for data in data_slices {
+            sorter.insert(&key, data)?;
        }
    }