From 379496233013830c5f22da373cdbd720c171d51a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 13 Sep 2022 10:40:37 +0200 Subject: [PATCH] Use an unstable algorithm for grenad::Sorter when possible --- milli/Cargo.toml | 2 +- .../index_documents/extract/extract_docid_word_positions.rs | 1 + .../index_documents/extract/extract_facet_number_docids.rs | 1 + .../index_documents/extract/extract_facet_string_docids.rs | 1 + .../index_documents/extract/extract_fid_docid_facet_values.rs | 2 ++ .../index_documents/extract/extract_fid_word_count_docids.rs | 1 + milli/src/update/index_documents/extract/extract_word_docids.rs | 2 ++ .../extract/extract_word_pair_proximity_docids.rs | 1 + .../index_documents/extract/extract_word_position_docids.rs | 1 + milli/src/update/index_documents/helpers/grenad_helpers.rs | 2 ++ milli/src/update/index_documents/mod.rs | 1 + milli/src/update/index_documents/transform.rs | 2 ++ milli/src/update/word_prefix_docids.rs | 1 + milli/src/update/words_prefix_position_docids.rs | 1 + 14 files changed, 18 insertions(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 016711198..c9853548c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" -grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } +grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 3cc842b00..e067623e2 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -32,6 +32,7 @@ pub fn extract_docid_word_positions( let mut documents_ids = RoaringBitmap::new(); let mut docid_word_positions_sorter = create_sorter( + grenad::SortAlgorithm::Stable, concat_u32s_array, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index fa63d9549..61157fa35 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -21,6 +21,7 @@ pub fn extract_facet_number_docids( let max_memory = indexer.max_memory_by_thread(); let mut facet_number_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 8209d817b..f7aa3730c 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -23,6 +23,7 @@ pub fn extract_facet_string_docids( let max_memory = indexer.max_memory_by_thread(); let mut facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, keep_first_prefix_value_merge_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index cf116e6f5..f9d1443d5 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -28,6 +28,7 @@ pub fn extract_fid_docid_facet_values( let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( + grenad::SortAlgorithm::Stable, keep_first, indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -36,6 +37,7 @@ pub fn extract_fid_docid_facet_values( ); let mut fid_docid_facet_strings_sorter = create_sorter( + grenad::SortAlgorithm::Stable, keep_first, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 85a65ee14..d425e8d14 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -25,6 +25,7 @@ pub fn extract_fid_word_count_docids( let max_memory = indexer.max_memory_by_thread(); let mut fid_word_count_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index f3a44162b..4b965e9a8 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -30,6 +30,7 @@ pub fn extract_word_docids( let max_memory = indexer.max_memory_by_thread(); let mut word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -38,6 +39,7 @@ pub fn extract_word_docids( ); let mut exact_word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 5117bfaba..6add9d980 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -24,6 +24,7 @@ pub fn extract_word_pair_proximity_docids( let max_memory = indexer.max_memory_by_thread(); let mut word_pair_proximity_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index a4720ba2b..c1661072a 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -21,6 +21,7 @@ pub fn extract_word_position_docids( let max_memory = indexer.max_memory_by_thread(); let mut word_position_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 9d5a67d78..202e689f8 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -27,6 +27,7 @@ pub fn create_writer( } pub fn create_sorter( + sort_algorithm: grenad::SortAlgorithm, merge: MergeFn, chunk_compression_type: grenad::CompressionType, chunk_compression_level: Option, @@ -45,6 +46,7 @@ pub fn create_sorter( builder.dump_threshold(memory); builder.allow_realloc(false); } + builder.sort_algorithm(sort_algorithm); builder.build() } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 365b0d024..f69a4e893 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1489,6 +1489,7 @@ mod tests { assert_eq!(count, 4); } + #[cfg(feature = "default")] #[test] fn test_meilisearch_1714() { let index = TempIndex::new(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8818909a3..f52d5c7af 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -99,6 +99,7 @@ impl<'a, 'i> Transform<'a, 'i> { // We initialize the sorter with the user indexing settings. let original_sorter = create_sorter( + grenad::SortAlgorithm::Stable, merge_function, indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_level, @@ -108,6 +109,7 @@ impl<'a, 'i> Transform<'a, 'i> { // We initialize the sorter with the user indexing settings. let flattened_sorter = create_sorter( + grenad::SortAlgorithm::Stable, merge_function, indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_level, diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 1002c13cf..976ff3dd0 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -48,6 +48,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index b2b24084d..5dbc9f89b 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -65,6 +65,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); let mut prefix_position_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level,