mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 19:57:30 +01:00
Use the CboRoaringBitmapCodec for the word pair proximity docids
This commit is contained in:
parent
5a6a698e1d
commit
d0c73564b1
@ -21,7 +21,7 @@ use rayon::prelude::*;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
use milli::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec};
|
use milli::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||||
use milli::tokenizer::{simple_tokenizer, only_token};
|
use milli::tokenizer::{simple_tokenizer, only_token};
|
||||||
use milli::{SmallVec32, Index, Position, DocumentId, BEU32};
|
use milli::{SmallVec32, Index, Position, DocumentId, BEU32};
|
||||||
|
|
||||||
@ -335,8 +335,8 @@ impl Store {
|
|||||||
key.push(min_prox);
|
key.push(min_prox);
|
||||||
// We serialize the document ids into a buffer
|
// We serialize the document ids into a buffer
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.reserve(docids.serialized_size());
|
buffer.reserve(CboRoaringBitmapCodec::serialized_size(&docids));
|
||||||
docids.serialize_into(&mut buffer)?;
|
CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer)?;
|
||||||
// that we write under the generated key into MTBL
|
// that we write under the generated key into MTBL
|
||||||
if lmdb_key_valid_size(&key) {
|
if lmdb_key_valid_size(&key) {
|
||||||
sorter.insert(&key, &buffer)?;
|
sorter.insert(&key, &buffer)?;
|
||||||
@ -365,7 +365,7 @@ impl Store {
|
|||||||
// We serialize the positions into a buffer.
|
// We serialize the positions into a buffer.
|
||||||
let positions = RoaringBitmap::from_iter(positions.iter().cloned());
|
let positions = RoaringBitmap::from_iter(positions.iter().cloned());
|
||||||
let bytes = BoRoaringBitmapCodec::bytes_encode(&positions)
|
let bytes = BoRoaringBitmapCodec::bytes_encode(&positions)
|
||||||
.with_context(|| format!("could not serialize positions"))?;
|
.with_context(|| "could not serialize positions")?;
|
||||||
// that we write under the generated key into MTBL
|
// that we write under the generated key into MTBL
|
||||||
if lmdb_key_valid_size(&key) {
|
if lmdb_key_valid_size(&key) {
|
||||||
sorter.insert(&key, &bytes)?;
|
sorter.insert(&key, &bytes)?;
|
||||||
@ -515,10 +515,10 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
|||||||
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
|
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
|
||||||
Ok(values[0].to_vec())
|
Ok(values[0].to_vec())
|
||||||
},
|
},
|
||||||
DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORDS_PROXIMITIES_BYTE => {
|
DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => {
|
||||||
let (head, tail) = values.split_first().unwrap();
|
let (head, tail) = values.split_first().unwrap();
|
||||||
|
|
||||||
let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
|
let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
|
||||||
|
|
||||||
for value in tail {
|
for value in tail {
|
||||||
let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
|
let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
|
||||||
head.union_with(&bitmap);
|
head.union_with(&bitmap);
|
||||||
@ -528,6 +528,19 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
|||||||
head.serialize_into(&mut vec).unwrap();
|
head.serialize_into(&mut vec).unwrap();
|
||||||
Ok(vec)
|
Ok(vec)
|
||||||
},
|
},
|
||||||
|
WORDS_PROXIMITIES_BYTE => {
|
||||||
|
let (head, tail) = values.split_first().unwrap();
|
||||||
|
let mut head = CboRoaringBitmapCodec::deserialize_from(head.as_slice()).unwrap();
|
||||||
|
|
||||||
|
for value in tail {
|
||||||
|
let bitmap = CboRoaringBitmapCodec::deserialize_from(value.as_slice()).unwrap();
|
||||||
|
head.union_with(&bitmap);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut vec = Vec::new();
|
||||||
|
CboRoaringBitmapCodec::serialize_into(&head, &mut vec).unwrap();
|
||||||
|
Ok(vec)
|
||||||
|
},
|
||||||
otherwise => panic!("wut {:?}", otherwise),
|
otherwise => panic!("wut {:?}", otherwise),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -385,12 +385,12 @@ fn size_of_database(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Re
|
|||||||
|
|
||||||
fn word_pair_proximity_stats(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
|
fn word_pair_proximity_stats(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
|
||||||
use heed::types::DecodeIgnore;
|
use heed::types::DecodeIgnore;
|
||||||
use milli::RoaringBitmapCodec;
|
use milli::CboRoaringBitmapCodec;
|
||||||
|
|
||||||
let mut values_length = Vec::new();
|
let mut values_length = Vec::new();
|
||||||
|
|
||||||
let db = index.word_pair_proximity_docids.as_polymorph();
|
let db = index.word_pair_proximity_docids.as_polymorph();
|
||||||
for result in db.iter::<_, DecodeIgnore, RoaringBitmapCodec>(rtxn)? {
|
for result in db.iter::<_, DecodeIgnore, CboRoaringBitmapCodec>(rtxn)? {
|
||||||
let ((), val) = result?;
|
let ((), val) = result?;
|
||||||
values_length.push(val.len() as u32);
|
values_length.push(val.len() as u32);
|
||||||
}
|
}
|
||||||
|
@ -1,25 +1,60 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
use std::io;
|
||||||
use std::mem::size_of;
|
use std::mem::size_of;
|
||||||
|
|
||||||
|
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use super::{BoRoaringBitmapCodec, RoaringBitmapCodec};
|
|
||||||
|
|
||||||
/// A conditional codec that either uses the RoaringBitmap
|
/// A conditional codec that either uses the RoaringBitmap
|
||||||
/// or a lighter ByteOrder en/decoding method.
|
/// or a lighter ByteOrder en/decoding method.
|
||||||
pub struct CboRoaringBitmapCodec;
|
pub struct CboRoaringBitmapCodec;
|
||||||
|
|
||||||
|
impl CboRoaringBitmapCodec {
|
||||||
|
pub fn serialized_size(roaring: &RoaringBitmap) -> usize {
|
||||||
|
if roaring.len() <= 4 {
|
||||||
|
roaring.len() as usize * size_of::<u32>()
|
||||||
|
} else {
|
||||||
|
roaring.serialized_size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) -> io::Result<()> {
|
||||||
|
if roaring.len() <= 4 {
|
||||||
|
// If the number of items (u32s) to encode is less than or equal to 4
|
||||||
|
// it means that it would weigh the same or less than the RoaringBitmap
|
||||||
|
// header, so we directly encode them using ByteOrder instead.
|
||||||
|
for integer in roaring {
|
||||||
|
vec.write_u32::<NativeEndian>(integer)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
// Otherwise, we use the classic RoaringBitmapCodec that writes a header.
|
||||||
|
roaring.serialize_into(vec)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> {
|
||||||
|
if bytes.len() <= 4 * size_of::<u32>() {
|
||||||
|
// If there are 4 or fewer integers that can fit into this array
|
||||||
|
// of bytes it means that we used the ByteOrder codec serializer.
|
||||||
|
let mut bitmap = RoaringBitmap::new();
|
||||||
|
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
|
||||||
|
bitmap.insert(integer);
|
||||||
|
}
|
||||||
|
Ok(bitmap)
|
||||||
|
} else {
|
||||||
|
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
||||||
|
// that the header takes 4 integers.
|
||||||
|
RoaringBitmap::deserialize_from(bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
||||||
type DItem = RoaringBitmap;
|
type DItem = RoaringBitmap;
|
||||||
|
|
||||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||||
if bytes.len() <= 4 * size_of::<u32>() {
|
Self::deserialize_from(bytes).ok()
|
||||||
// If there are 4 or fewer integers that can fit into this array
|
|
||||||
// of bytes it means that we used the ByteOrder codec serializer.
|
|
||||||
BoRoaringBitmapCodec::bytes_decode(bytes)
|
|
||||||
} else {
|
|
||||||
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
|
||||||
// that the header takes 4 integers.
|
|
||||||
RoaringBitmapCodec::bytes_decode(bytes)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -27,14 +62,8 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
|
|||||||
type EItem = RoaringBitmap;
|
type EItem = RoaringBitmap;
|
||||||
|
|
||||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
if item.len() <= 4 {
|
let mut vec = Vec::with_capacity(Self::serialized_size(item));
|
||||||
// If the number of items (u32s) to encode is less than or equal to 4
|
Self::serialize_into(item, &mut vec).ok()?;
|
||||||
// it means that it would weigh the same or less than the RoaringBitmap
|
Some(Cow::Owned(vec))
|
||||||
// header, so we directly encode them using ByteOrder instead.
|
|
||||||
BoRoaringBitmapCodec::bytes_encode(item)
|
|
||||||
} else {
|
|
||||||
// Otherwise, we use the classic RoaringBitmapCodec that writes a header.
|
|
||||||
RoaringBitmapCodec::bytes_encode(item)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,7 @@ pub use self::search::{Search, SearchResult};
|
|||||||
pub use self::criterion::{Criterion, default_criteria};
|
pub use self::criterion::{Criterion, default_criteria};
|
||||||
pub use self::heed_codec::{
|
pub use self::heed_codec::{
|
||||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
|
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
|
||||||
CsvStringRecordCodec, BoRoaringBitmapCodec,
|
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||||
@ -44,7 +44,7 @@ pub struct Index {
|
|||||||
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
||||||
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
||||||
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
||||||
pub word_pair_proximity_docids: Database<StrStrU8Codec, RoaringBitmapCodec>,
|
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the document id to the document as a CSV line.
|
/// Maps the document id to the document as a CSV line.
|
||||||
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user