From 6cc6addc2fb157ffbb6ad2478957dd584fdf16fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 2 Oct 2020 17:06:17 +0200 Subject: [PATCH] Increase the CboRoaringBitmapCodec threshold --- src/heed_codec/cbo_roaring_bitmap_codec.rs | 40 +++++++++++++++++----- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/src/heed_codec/cbo_roaring_bitmap_codec.rs b/src/heed_codec/cbo_roaring_bitmap_codec.rs index b9b802a38..31eb949b3 100644 --- a/src/heed_codec/cbo_roaring_bitmap_codec.rs +++ b/src/heed_codec/cbo_roaring_bitmap_codec.rs @@ -5,13 +5,18 @@ use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use roaring::RoaringBitmap; +/// This is the limit where using a byteorder became less size efficient +/// than using a direct roaring encoding, it is also the point where we are able +/// to determine the encoding used only by using the array of bytes length. +const THRESHOLD: usize = 7; + /// A conditionnal codec that either use the RoaringBitmap /// or a lighter ByteOrder en/decoding method. pub struct CboRoaringBitmapCodec; impl CboRoaringBitmapCodec { pub fn serialized_size(roaring: &RoaringBitmap) -> usize { - if roaring.len() <= 4 { + if roaring.len() <= THRESHOLD as u64 { roaring.len() as usize * size_of::() } else { roaring.serialized_size() @@ -19,8 +24,8 @@ impl CboRoaringBitmapCodec { } pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec) -> io::Result<()> { - if roaring.len() <= 4 { - // If the number of items (u32s) to encode is less than or equal to 4 + if roaring.len() <= THRESHOLD as u64 { + // If the number of items (u32s) to encode is less than or equal to the threshold // it means that it would weigh the same or less than the RoaringBitmap // header, so we directly encode them using ByteOrder instead. 
for integer in roaring { @@ -34,8 +39,8 @@ impl CboRoaringBitmapCodec { } pub fn deserialize_from(mut bytes: &[u8]) -> io::Result { - if bytes.len() <= 4 * size_of::() { - // If there is 4 or less than 4 integers that can fit into this array + if bytes.len() <= THRESHOLD * size_of::() { + // If THRESHOLD or fewer integers can fit into this array // of bytes it means that we used the ByteOrder codec serializer. let mut bitmap = RoaringBitmap::new(); while let Ok(integer) = bytes.read_u32::() { @@ -44,7 +49,7 @@ impl CboRoaringBitmapCodec { Ok(bitmap) } else { // Otherwise, it means we used the classic RoaringBitmapCodec and - // that the header takes 4 integers. + // that the payload is larger than THRESHOLD integers. RoaringBitmap::deserialize_from(bytes) } } @@ -75,10 +80,29 @@ mod tests { use super::*; #[test] - fn limit_four() { - let input = RoaringBitmap::from_iter(vec![0, 1, 2, 3]); + fn verify_encoding_decoding() { + let input = RoaringBitmap::from_iter(0..THRESHOLD as u32); let bytes = CboRoaringBitmapCodec::bytes_encode(&input).unwrap(); let output = CboRoaringBitmapCodec::bytes_decode(&bytes).unwrap(); assert_eq!(input, output); } + + #[test] + fn verify_threshold() { + let input = RoaringBitmap::from_iter(0..THRESHOLD as u32); + + // use roaring bitmap + let mut bytes = Vec::new(); + input.serialize_into(&mut bytes).unwrap(); + let roaring_size = bytes.len(); + + // use byteorder directly + let mut bytes = Vec::new(); + for integer in input { + bytes.write_u32::(integer).unwrap(); + } + let bo_size = bytes.len(); + + assert!(roaring_size > bo_size); + } }