Prefer using an explicit merge function name

This commit is contained in:
Kerollmops 2021-06-09 12:17:11 +02:00
parent 93978ec38a
commit cfc7314bd1
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
6 changed files with 51 additions and 98 deletions

View File

@ -1,71 +1,29 @@
use std::borrow::Cow;
use anyhow::{bail, ensure, Context};
use anyhow::bail;
use bstr::ByteSlice as _;
use fst::IntoStreamer;
use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec;
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
// Union of multiple FSTs
pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
let fsts = values.iter().map(fst::Set::new).collect::<Result<Vec<_>, _>>()?;
let op_builder: fst::set::OpBuilder = fsts.iter().map(|fst| fst.into_stream()).collect();
let op = op_builder.r#union();
pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
match key {
WORDS_FST_KEY => {
let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
// Union of the FSTs
let mut op = fst::set::OpBuilder::new();
fsts.iter().for_each(|fst| op.push(fst.into_stream()));
let op = op.r#union();
let mut build = fst::SetBuilder::memory();
build.extend_stream(op.into_stream()).unwrap();
Ok(build.into_inner().unwrap())
},
FIELDS_IDS_MAP_KEY => {
ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match");
Ok(values[0].to_vec())
},
DOCUMENTS_IDS_KEY => roaring_bitmap_merge(values),
otherwise => bail!("wut {:?}", otherwise),
}
}
pub fn word_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
roaring_bitmap_merge(values)
let mut build = fst::SetBuilder::memory();
build.extend_stream(op.into_stream()).unwrap();
Ok(build.into_inner().unwrap())
}
pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
bail!("merging docid word positions is an error ({:?})", key.as_bstr())
panic!("merging docid word positions is an error ({:?})", key.as_bstr())
}
pub fn field_id_docid_facet_values_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
let first = values.first().context("no value to merge")?;
ensure!(values.iter().all(|v| v == first), "invalid field id docid facet value merging");
Ok(first.to_vec())
}
pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
pub fn field_id_word_count_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
Ok(values.first().unwrap().to_vec())
}
pub fn documents_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
@ -88,7 +46,7 @@ pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mu
writer.finish().unwrap();
}
fn roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
let (head, tail) = values.split_first().unwrap();
let mut head = RoaringBitmap::deserialize_from(&head[..])?;
@ -102,7 +60,7 @@ fn roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
Ok(vec)
}
fn cbo_roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
let (head, tail) = values.split_first().unwrap();
let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;

View File

@ -25,11 +25,8 @@ use crate::update::{
};
use self::store::{Store, Readers};
pub use self::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
docid_word_positions_merge, documents_merge,
word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
field_id_word_count_docids_merge,
fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge,
docid_word_positions_merge, documents_merge, keep_first
};
pub use self::transform::{Transform, TransformOutput};
@ -539,22 +536,22 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
debug!("Merging the main, word docids and words pairs proximity docids in parallel...");
rayon::spawn(move || {
vec![
(DatabaseType::Main, main_readers, main_merge as MergeFn),
(DatabaseType::WordDocids, word_docids_readers, word_docids_merge),
(DatabaseType::Main, main_readers, fst_merge as MergeFn),
(DatabaseType::WordDocids, word_docids_readers, roaring_bitmap_merge),
(
DatabaseType::FacetLevel0NumbersDocids,
facet_field_numbers_docids_readers,
facet_field_value_docids_merge,
cbo_roaring_bitmap_merge,
),
(
DatabaseType::WordLevel0PositionDocids,
word_level_position_docids_readers,
word_level_position_docids_merge,
cbo_roaring_bitmap_merge,
),
(
DatabaseType::FieldIdWordCountDocids,
field_id_word_count_docids_readers,
field_id_word_count_docids_merge,
cbo_roaring_bitmap_merge,
),
]
.into_par_iter()
@ -657,7 +654,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
*self.index.facet_id_string_docids.as_polymorph(),
facet_field_strings_docids_readers,
facet_field_value_docids_merge,
cbo_roaring_bitmap_merge,
write_method,
)?;
@ -672,7 +669,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
*self.index.field_id_docid_facet_f64s.as_polymorph(),
field_id_docid_facet_numbers_readers,
field_id_docid_facet_values_merge,
keep_first,
write_method,
)?;
@ -687,7 +684,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
*self.index.field_id_docid_facet_strings.as_polymorph(),
field_id_docid_facet_strings_readers,
field_id_docid_facet_values_merge,
keep_first,
write_method,
)?;
@ -702,7 +699,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
*self.index.word_pair_proximity_docids.as_polymorph(),
words_pairs_proximities_docids_readers,
words_pairs_proximities_docids_merge,
cbo_roaring_bitmap_merge,
write_method,
)?;
@ -721,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
self.index.main,
content,
main_merge,
fst_merge,
WriteMethod::GetMergePut,
)?;
},
@ -732,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
db,
content,
word_docids_merge,
roaring_bitmap_merge,
write_method,
)?;
},
@ -743,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
db,
content,
facet_field_value_docids_merge,
cbo_roaring_bitmap_merge,
write_method,
)?;
},
@ -754,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
db,
content,
field_id_word_count_docids_merge,
cbo_roaring_bitmap_merge,
write_method,
)?;
},
@ -765,7 +762,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
db,
content,
word_level_position_docids_merge,
cbo_roaring_bitmap_merge,
write_method,
)?;
}

View File

@ -26,11 +26,7 @@ use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
word_level_position_docids_merge, facet_field_value_docids_merge,
field_id_docid_facet_values_merge, field_id_word_count_docids_merge,
};
use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge};
const LMDB_MAX_KEY_LENGTH: usize = 511;
const ONE_KILOBYTE: usize = 1024 * 1024;
@ -104,7 +100,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
let main_sorter = create_sorter(
main_merge,
fst_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -112,7 +108,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let word_docids_sorter = create_sorter(
word_docids_merge,
roaring_bitmap_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -120,7 +116,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let words_pairs_proximities_docids_sorter = create_sorter(
words_pairs_proximities_docids_merge,
cbo_roaring_bitmap_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -128,7 +124,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let word_level_position_docids_sorter = create_sorter(
word_level_position_docids_merge,
cbo_roaring_bitmap_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -136,7 +132,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let field_id_word_count_docids_sorter = create_sorter(
field_id_word_count_docids_merge,
cbo_roaring_bitmap_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -144,7 +140,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let facet_field_numbers_docids_sorter = create_sorter(
facet_field_value_docids_merge,
cbo_roaring_bitmap_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -152,7 +148,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let facet_field_strings_docids_sorter = create_sorter(
facet_field_value_docids_merge,
cbo_roaring_bitmap_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -160,7 +156,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let field_id_docid_facet_numbers_sorter = create_sorter(
field_id_docid_facet_values_merge,
keep_first,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -168,7 +164,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Some(1024 * 1024 * 1024), // 1MB
);
let field_id_docid_facet_strings_sorter = create_sorter(
field_id_docid_facet_values_merge,
keep_first,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,

View File

@ -6,7 +6,9 @@ use grenad::CompressionType;
use heed::types::ByteSlice;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database};
use crate::update::index_documents::{
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database,
};
pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -40,7 +42,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
// It is forbidden to keep a mutable reference into the database
// and write into it at the same time, therefore we write into another file.
let mut prefix_docids_sorter = create_sorter(
word_docids_merge,
roaring_bitmap_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
@ -66,7 +68,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
self.wtxn,
*self.index.word_prefix_docids.as_polymorph(),
prefix_docids_sorter,
word_docids_merge,
roaring_bitmap_merge,
WriteMethod::Append,
)?;

View File

@ -11,7 +11,7 @@ use crate::Index;
use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::{
WriteMethod, create_sorter, sorter_into_lmdb_database,
words_pairs_proximities_docids_merge,
cbo_roaring_bitmap_merge,
};
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
@ -50,7 +50,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
// Here we create a sorter akin to the previous one.
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
words_pairs_proximities_docids_merge,
cbo_roaring_bitmap_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
@ -80,7 +80,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
self.wtxn,
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
word_prefix_pair_proximity_docids_sorter,
words_pairs_proximities_docids_merge,
cbo_roaring_bitmap_merge,
WriteMethod::Append,
)?;

View File

@ -15,7 +15,7 @@ use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
word_prefix_level_positions_docids_merge, sorter_into_lmdb_database
cbo_roaring_bitmap_merge, sorter_into_lmdb_database
};
use crate::{Index, TreeLevel};
@ -86,7 +86,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
let mut word_prefix_level_positions_docids_sorter = create_sorter(
word_prefix_level_positions_docids_merge,
cbo_roaring_bitmap_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
@ -119,7 +119,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
self.wtxn,
*self.index.word_prefix_level_position_docids.as_polymorph(),
word_prefix_level_positions_docids_sorter,
word_prefix_level_positions_docids_merge,
cbo_roaring_bitmap_merge,
WriteMethod::Append,
)?;