Merge pull request #87 from meilisearch/roaring-bitmap-length

Introduce fast methods to get roaring bitmap lengths
This commit is contained in:
Clément Renault 2021-02-18 14:52:40 +01:00 committed by GitHub
commit 09ca5d14c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 149 additions and 9 deletions

View File

@ -1,14 +1,12 @@
mod beu32_str_codec;
mod bo_roaring_bitmap_codec;
mod cbo_roaring_bitmap_codec;
mod obkv_codec;
mod roaring_bitmap_codec;
mod roaring_bitmap;
mod roaring_bitmap_length;
mod str_str_u8_codec;
pub mod facet;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
pub use self::str_str_u8_codec::StrStrU8Codec;

View File

@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
/// This is the limit above which the byteorder encoding becomes less size
/// efficient than the direct roaring encoding. It is also the point at which
/// the encoding used can be determined from the length of the byte array alone.
const THRESHOLD: usize = 7;
pub const THRESHOLD: usize = 7;
/// A conditional codec that uses either the RoaringBitmap
/// or a lighter ByteOrder en/decoding method.

View File

@ -0,0 +1,7 @@
mod bo_roaring_bitmap_codec;
pub mod cbo_roaring_bitmap_codec;
mod roaring_bitmap_codec;
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;

View File

@ -0,0 +1,11 @@
use std::mem;
/// Length-only decoder: reports how many `u32`-sized slots fit in a byte
/// slice, without materializing any integers.
pub struct BoRoaringBitmapLenCodec;

impl heed::BytesDecode<'_> for BoRoaringBitmapLenCodec {
    type DItem = u64;

    /// Returns `bytes.len()` divided by the size of a `u32` (integer division,
    /// so trailing bytes that do not form a full `u32` are ignored).
    /// This decoder never returns `None`.
    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
        let integer_width = mem::size_of::<u32>();
        let integer_count = bytes.len() / integer_width;
        Some(integer_count as u64)
    }
}

View File

@ -0,0 +1,22 @@
use std::mem;
use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
/// Length-only decoder that picks between the ByteOrder and the classic
/// roaring length decoders based solely on the size of the byte slice.
pub struct CboRoaringBitmapLenCodec;

impl heed::BytesDecode<'_> for CboRoaringBitmapLenCodec {
    type DItem = u64;

    /// Returns the number of integers encoded in `bytes`, or `None` when the
    /// selected decoder fails.
    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
        // Largest byte length that can still hold `THRESHOLD` raw `u32`s.
        let byte_order_limit = THRESHOLD * mem::size_of::<u32>();

        if bytes.len() > byte_order_limit {
            // More bytes than `THRESHOLD` integers would need: the value was
            // written with the classic RoaringBitmapCodec.
            RoaringBitmapLenCodec::bytes_decode(bytes)
        } else {
            // `THRESHOLD` integers or fewer fit in this slice, which means the
            // ByteOrder codec serializer was used.
            BoRoaringBitmapLenCodec::bytes_decode(bytes)
        }
    }
}

View File

@ -0,0 +1,7 @@
mod bo_roaring_bitmap_len_codec;
mod cbo_roaring_bitmap_len_codec;
mod roaring_bitmap_len_codec;
pub use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec;
pub use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec;
pub use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec;

View File

@ -0,0 +1,83 @@
use std::io::{self, Read, BufRead};
use std::mem;
use byteorder::{ReadBytesExt, LittleEndian};
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u16 = 12347;
pub struct RoaringBitmapLenCodec;
impl RoaringBitmapLenCodec {
// FIXME should be exported in the RoaringBitmap crate
fn deserialize_from_slice(mut bytes: &[u8]) -> io::Result<u64> {
let (size, has_offsets) = {
let cookie = bytes.read_u32::<LittleEndian>()?;
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
(bytes.read_u32::<LittleEndian>()? as usize, true)
} else if (cookie as u16) == SERIAL_COOKIE {
return Err(io::Error::new(
io::ErrorKind::Other,
"run containers are unsupported",
));
} else {
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
}
};
if size > u16::max_value() as usize + 1 {
return Err(io::Error::new(
io::ErrorKind::Other,
"size is greater than supported",
));
}
let mut description_bytes = vec![0u8; size * 4];
bytes.read_exact(&mut description_bytes)?;
let description_bytes = &mut &description_bytes[..];
if has_offsets {
bytes.consume(size * 4);
}
let mut length = 0;
for _ in 0..size {
let _key = description_bytes.read_u16::<LittleEndian>()?;
let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
length += len;
if len <= 4096 {
bytes.consume(len as usize * mem::size_of::<u16>());
} else {
bytes.consume(1024 * mem::size_of::<u64>())
}
}
Ok(length)
}
}
impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
    type DItem = u64;

    /// Decodes the number of integers stored in a serialized roaring bitmap.
    ///
    /// Any error reported by `deserialize_from_slice` is discarded and
    /// surfaced as `None`.
    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
        match Self::deserialize_from_slice(bytes) {
            Ok(length) => Some(length),
            Err(_) => None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::heed_codec::RoaringBitmapCodec;
    use heed::BytesEncode;
    use roaring::RoaringBitmap;

    /// The length read from the serialized form must match the length
    /// reported by the in-memory bitmap it was serialized from.
    #[test]
    fn deserialize_roaring_bitmap_length() {
        // Three disjoint ranges of ids, collected into a single bitmap.
        let ids = (0..500).chain(800..800_000).chain(920_056..930_032);
        let bitmap: RoaringBitmap = ids.collect();

        let serialized = RoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
        let decoded_len = RoaringBitmapLenCodec::deserialize_from_slice(&serialized).unwrap();

        assert_eq!(bitmap.len(), decoded_len);
    }
}

View File

@ -12,8 +12,8 @@ use crate::fields_ids_map::FieldsIdsMap;
use crate::{default_criteria, Criterion, Search, FacetDistribution};
use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
use crate::{
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
BoRoaringBitmapCodec, CboRoaringBitmapCodec,
RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
};
pub const CRITERIA_KEY: &str = "criteria";
@ -352,6 +352,17 @@ impl Index {
}
}
/* word documents count */
/// Returns the number of document ids associated with the given word, or
/// `None` when the lookup finds no entry for it.
///
/// The stored value is read through [`RoaringBitmapLenCodec`], which is much
/// faster than deserializing the bitmap and asking for its length.
pub fn word_documents_count(&self, rtxn: &RoTxn, word: &str) -> anyhow::Result<Option<u64>> {
    let lengths = self.word_docids.remap_data_type::<RoaringBitmapLenCodec>();
    let count = lengths.get(rtxn, word)?;
    Ok(count)
}
/* documents */
/// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.

View File

@ -26,6 +26,7 @@ pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
pub use self::index::Index;
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
pub use self::update_store::UpdateStore;