635: Use an unstable algorithm for `grenad::Sorter` when possible r=Kerollmops a=loiclec

# Pull Request
## What does this PR do?

Use an unstable algorithm to sort the internal vector used by `grenad::Sorter` whenever possible to speed up indexing.

In practice, every time the merge function creates a `RoaringBitmap`, we use an unstable sort. For every other merge function, such as `keep_first`, `keep_last`, etc., a stable sort is used.


Co-authored-by: Loïc Lecrenier <loic@meilisearch.com>
This commit is contained in:
bors[bot] 2022-09-14 12:00:52 +00:00 committed by GitHub
commit 15d478cf4d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 18 additions and 1 deletions

View File

@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7" fst = "0.4.7"
fxhash = "0.2.1" fxhash = "0.2.1"
geoutils = "0.4.1" geoutils = "0.4.1"
grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] }
json-depth-checker = { path = "../json-depth-checker" } json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }

View File

@ -32,6 +32,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let mut documents_ids = RoaringBitmap::new(); let mut documents_ids = RoaringBitmap::new();
let mut docid_word_positions_sorter = create_sorter( let mut docid_word_positions_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
concat_u32s_array, concat_u32s_array,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,

View File

@ -21,6 +21,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut facet_number_docids_sorter = create_sorter( let mut facet_number_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,

View File

@ -23,6 +23,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut facet_string_docids_sorter = create_sorter( let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
keep_first_prefix_value_merge_roaring_bitmaps, keep_first_prefix_value_merge_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,

View File

@ -28,6 +28,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut fid_docid_facet_numbers_sorter = create_sorter( let mut fid_docid_facet_numbers_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
keep_first, keep_first,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
@ -36,6 +37,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
); );
let mut fid_docid_facet_strings_sorter = create_sorter( let mut fid_docid_facet_strings_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
keep_first, keep_first,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,

View File

@ -25,6 +25,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut fid_word_count_docids_sorter = create_sorter( let mut fid_word_count_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,

View File

@ -30,6 +30,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut word_docids_sorter = create_sorter( let mut word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps, merge_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
@ -38,6 +39,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
); );
let mut exact_word_docids_sorter = create_sorter( let mut exact_word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps, merge_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,

View File

@ -24,6 +24,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut word_pair_proximity_docids_sorter = create_sorter( let mut word_pair_proximity_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,

View File

@ -21,6 +21,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut word_position_docids_sorter = create_sorter( let mut word_position_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,

View File

@ -27,6 +27,7 @@ pub fn create_writer<R: io::Write>(
} }
pub fn create_sorter( pub fn create_sorter(
sort_algorithm: grenad::SortAlgorithm,
merge: MergeFn, merge: MergeFn,
chunk_compression_type: grenad::CompressionType, chunk_compression_type: grenad::CompressionType,
chunk_compression_level: Option<u32>, chunk_compression_level: Option<u32>,
@ -45,6 +46,7 @@ pub fn create_sorter(
builder.dump_threshold(memory); builder.dump_threshold(memory);
builder.allow_realloc(false); builder.allow_realloc(false);
} }
builder.sort_algorithm(sort_algorithm);
builder.build() builder.build()
} }

View File

@ -1488,6 +1488,7 @@ mod tests {
assert_eq!(count, 4); assert_eq!(count, 4);
} }
#[cfg(feature = "default")]
#[test] #[test]
fn test_meilisearch_1714() { fn test_meilisearch_1714() {
let index = TempIndex::new(); let index = TempIndex::new();

View File

@ -99,6 +99,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// We initialize the sorter with the user indexing settings. // We initialize the sorter with the user indexing settings.
let original_sorter = create_sorter( let original_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_function, merge_function,
indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_type,
indexer_settings.chunk_compression_level, indexer_settings.chunk_compression_level,
@ -108,6 +109,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// We initialize the sorter with the user indexing settings. // We initialize the sorter with the user indexing settings.
let flattened_sorter = create_sorter( let flattened_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_function, merge_function,
indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_type,
indexer_settings.chunk_compression_level, indexer_settings.chunk_compression_level,

View File

@ -48,6 +48,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
// It is forbidden to keep a mutable reference into the database // It is forbidden to keep a mutable reference into the database
// and write into it at the same time, therefore we write into another file. // and write into it at the same time, therefore we write into another file.
let mut prefix_docids_sorter = create_sorter( let mut prefix_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps, merge_roaring_bitmaps,
self.chunk_compression_type, self.chunk_compression_type,
self.chunk_compression_level, self.chunk_compression_level,

View File

@ -65,6 +65,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
debug!("Computing and writing the word levels positions docids into LMDB on disk..."); debug!("Computing and writing the word levels positions docids into LMDB on disk...");
let mut prefix_position_docids_sorter = create_sorter( let mut prefix_position_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
self.chunk_compression_type, self.chunk_compression_type,
self.chunk_compression_level, self.chunk_compression_level,