Reviewer note: this patch was rebuilt from a copy whose newlines, indentation and
`<...>` type arguments had been lost. Blank context lines, the indentation of context
lines and the restored type arguments are reconstructed: confirm them against the tree
before applying. The content of the three new `roaring_bitmap_length/*_len_codec.rs`
files changed during review, so their `index` lines are only informative.

diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index e7b8cf256..a070c66eb 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -1,14 +1,12 @@
 mod beu32_str_codec;
-mod bo_roaring_bitmap_codec;
-mod cbo_roaring_bitmap_codec;
 mod obkv_codec;
-mod roaring_bitmap_codec;
+mod roaring_bitmap;
+mod roaring_bitmap_length;
 mod str_str_u8_codec;
 pub mod facet;
 
 pub use self::beu32_str_codec::BEU32StrCodec;
-pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
-pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
 pub use self::obkv_codec::ObkvCodec;
-pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
+pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
+pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
 pub use self::str_str_u8_codec::StrStrU8Codec;
diff --git a/milli/src/heed_codec/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs
similarity index 100%
rename from milli/src/heed_codec/bo_roaring_bitmap_codec.rs
rename to milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs
diff --git a/milli/src/heed_codec/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
similarity index 99%
rename from milli/src/heed_codec/cbo_roaring_bitmap_codec.rs
rename to milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
index 31eb949b3..8ccf831e3 100644
--- a/milli/src/heed_codec/cbo_roaring_bitmap_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
@@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
 /// This is the limit where using a byteorder became less size efficient
 /// than using a direct roaring encoding, it is also the point where we are able
 /// to determine the encoding used only by using the array of bytes length.
-const THRESHOLD: usize = 7;
+pub const THRESHOLD: usize = 7;
 
 /// A conditionnal codec that either use the RoaringBitmap
 /// or a lighter ByteOrder en/decoding method.
diff --git a/milli/src/heed_codec/roaring_bitmap/mod.rs b/milli/src/heed_codec/roaring_bitmap/mod.rs
new file mode 100644
index 000000000..6f8045c92
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap/mod.rs
@@ -0,0 +1,7 @@
+mod bo_roaring_bitmap_codec;
+pub mod cbo_roaring_bitmap_codec;
+mod roaring_bitmap_codec;
+
+pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
+pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
+pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
diff --git a/milli/src/heed_codec/roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs
similarity index 100%
rename from milli/src/heed_codec/roaring_bitmap_codec.rs
rename to milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs
diff --git a/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
new file mode 100644
index 000000000..e749680a0
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
@@ -0,0 +1,13 @@
+use std::mem;
+
+/// A codec that only decodes the number of integers stored in a bitmap
+/// serialized with the `BoRoaringBitmapCodec`, without building the bitmap.
+pub struct BoRoaringBitmapLenCodec;
+
+impl heed::BytesDecode<'_> for BoRoaringBitmapLenCodec {
+    type DItem = u64;
+
+    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
+        Some((bytes.len() / mem::size_of::<u32>()) as u64)
+    }
+}
diff --git a/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs
new file mode 100644
index 000000000..4f728f1cd
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs
@@ -0,0 +1,24 @@
+use std::mem;
+
+use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
+use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
+
+/// A codec that only decodes the number of integers stored in a bitmap
+/// serialized with the `CboRoaringBitmapCodec`, without building the bitmap.
+pub struct CboRoaringBitmapLenCodec;
+
+impl heed::BytesDecode<'_> for CboRoaringBitmapLenCodec {
+    type DItem = u64;
+
+    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
+        if bytes.len() <= THRESHOLD * mem::size_of::<u32>() {
+            // If at most threshold integers can fit into this array of bytes
+            // it means that we used the ByteOrder codec serializer.
+            BoRoaringBitmapLenCodec::bytes_decode(bytes)
+        } else {
+            // Otherwise, it means we used the classic RoaringBitmapCodec and
+            // that the header takes threshold integers.
+            RoaringBitmapLenCodec::bytes_decode(bytes)
+        }
+    }
+}
diff --git a/milli/src/heed_codec/roaring_bitmap_length/mod.rs b/milli/src/heed_codec/roaring_bitmap_length/mod.rs
new file mode 100644
index 000000000..e503c5c7a
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap_length/mod.rs
@@ -0,0 +1,7 @@
+mod bo_roaring_bitmap_len_codec;
+mod cbo_roaring_bitmap_len_codec;
+mod roaring_bitmap_len_codec;
+
+pub use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec;
+pub use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec;
+pub use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec;
diff --git a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs
new file mode 100644
index 000000000..042b5cf6b
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs
@@ -0,0 +1,94 @@
+use std::io;
+use std::mem;
+
+use byteorder::{LittleEndian, ReadBytesExt};
+
+const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
+const SERIAL_COOKIE: u16 = 12347;
+
+/// A codec that only decodes the number of integers stored in a bitmap
+/// serialized with the `RoaringBitmapCodec`, without building the bitmap.
+pub struct RoaringBitmapLenCodec;
+
+impl RoaringBitmapLenCodec {
+    /// Sums the cardinalities stored in the descriptive header of a serialized bitmap.
+    ///
+    /// Returns an error when the cookie is unknown, when run containers are used,
+    /// when there are too many containers or when the header is truncated.
+    // FIXME should be exported in the RoaringBitmap crate
+    fn deserialize_from_slice(mut bytes: &[u8]) -> io::Result<u64> {
+        let cookie = bytes.read_u32::<LittleEndian>()?;
+        let size = if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
+            bytes.read_u32::<LittleEndian>()? as usize
+        } else if (cookie as u16) == SERIAL_COOKIE {
+            return Err(io::Error::new(
+                io::ErrorKind::Other,
+                "run containers are unsupported",
+            ));
+        } else {
+            return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
+        };
+
+        if size > u16::max_value() as usize + 1 {
+            return Err(io::Error::new(
+                io::ErrorKind::Other,
+                "size is greater than supported",
+            ));
+        }
+
+        // Every container is described by two u16: its key and its cardinality minus one.
+        // The length only depends on this header, so it is borrowed straight from the
+        // slice and the offsets and containers that follow are left untouched: skipping
+        // them with `BufRead::consume` panics as soon as the slice is truncated.
+        let description_len = size * 2 * mem::size_of::<u16>();
+        let mut description_bytes = bytes
+            .get(..description_len)
+            .ok_or_else(|| io::Error::from(io::ErrorKind::UnexpectedEof))?;
+
+        let mut length = 0;
+        for _ in 0..size {
+            let _key = description_bytes.read_u16::<LittleEndian>()?;
+            length += u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
+        }
+
+        Ok(length)
+    }
+}
+
+impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
+    type DItem = u64;
+
+    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
+        RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crate::heed_codec::RoaringBitmapCodec;
+    use heed::BytesEncode;
+    use roaring::RoaringBitmap;
+
+    #[test]
+    fn deserialize_roaring_bitmap_length() {
+        let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect();
+        let bytes = RoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
+        let len = RoaringBitmapLenCodec::deserialize_from_slice(&bytes).unwrap();
+        assert_eq!(bitmap.len(), len);
+    }
+
+    #[test]
+    fn truncated_roaring_bitmap_does_not_panic() {
+        let bitmap: RoaringBitmap = (0..500).collect();
+        let bytes = RoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
+
+        // Cookie, size and one descriptor: the offsets and the container are cut off.
+        let len = RoaringBitmapLenCodec::deserialize_from_slice(&bytes[..12]).unwrap();
+        assert_eq!(bitmap.len(), len);
+
+        // A descriptive header cut in the middle is reported as an error.
+        assert!(RoaringBitmapLenCodec::deserialize_from_slice(&bytes[..10]).is_err());
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 12ad86b22..5c5fc9895 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -12,8 +12,8 @@ use crate::fields_ids_map::FieldsIdsMap;
 use crate::{default_criteria, Criterion, Search, FacetDistribution};
 use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
 use crate::{
-    RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
-    BoRoaringBitmapCodec, CboRoaringBitmapCodec,
+    RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
+    StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
 };
 
 pub const CRITERIA_KEY: &str = "criteria";
@@ -352,6 +352,17 @@ impl Index {
         }
     }
 
+    /* word documents count */
+
+    /// Returns the number of documents ids associated with the given word,
+    /// it is much faster than deserializing the bitmap and getting the length of it.
+    pub fn word_documents_count(&self, rtxn: &RoTxn, word: &str) -> anyhow::Result<Option<u64>> {
+        self.word_docids
+            .remap_data_type::<RoaringBitmapLenCodec>()
+            .get(rtxn, word)
+            .map_err(Into::into)
+    }
+
     /* documents */
 
     /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 66d134f4e..0fa966ee8 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -26,6 +26,7 @@ pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
 pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
+pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
 pub use self::index::Index;
 pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
 pub use self::update_store::UpdateStore;