Use a u16 field id instead of one byte

This commit is contained in:
Kerollmops 2021-07-06 11:31:24 +02:00
parent cc54c41e30
commit 838ed1cd32
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
17 changed files with 115 additions and 64 deletions

16
Cargo.lock generated
View File

@ -341,6 +341,17 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "concat-arrays"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747"
dependencies = [
"proc-macro2 1.0.27",
"quote 1.0.9",
"syn 1.0.73",
]
[[package]]
name = "convert_case"
version = "0.4.0"
@ -1378,6 +1389,7 @@ dependencies = [
"bstr",
"byteorder",
"chrono",
"concat-arrays",
"csv",
"either",
"flate2",
@ -1609,9 +1621,9 @@ dependencies = [
[[package]]
name = "obkv"
version = "0.1.1"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8"
checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
[[package]]
name = "once_cell"

View File

@ -8,6 +8,7 @@ edition = "2018"
bstr = "0.2.15"
byteorder = "1.4.2"
chrono = { version = "0.4.19", features = ["serde"] }
concat-arrays = "0.1.2"
csv = "1.1.5"
either = "1.6.1"
flate2 = "1.0.20"
@ -20,7 +21,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
linked-hash-map = "0.5.4"
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" }
memmap = "0.7.0"
obkv = "0.1.1"
obkv = "0.2.0"
once_cell = "1.5.2"
ordered-float = "2.1.1"
rayon = "1.5.0"

View File

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::f64_into_bytes;
use crate::FieldId;
use crate::{try_split_array_at, FieldId};
// TODO do not de/serialize right bound when level = 0
pub struct FacetLevelValueF64Codec;
@ -11,7 +11,8 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
type DItem = (FieldId, u8, f64, f64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?;
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (level, bytes) = bytes.split_first()?;
let (left, right) = if *level != 0 {
@ -23,7 +24,7 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
(left, left)
};
Some((*field_id, *level, left, right))
Some((field_id, *level, left, right))
}
}
@ -61,8 +62,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
16 // length
};
let mut bytes = Vec::with_capacity(len + 2);
bytes.push(*field_id);
let mut bytes = Vec::with_capacity(len + 3);
bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.push(*level);
bytes.extend_from_slice(&buffer[..len]);
Some(Cow::Owned(bytes))

View File

@ -1,14 +1,14 @@
use std::borrow::Cow;
use std::str;
use crate::FieldId;
use crate::{try_split_array_at, FieldId};
pub struct FacetValueStringCodec;
impl FacetValueStringCodec {
pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
out.reserve(value.len() + 1);
out.push(field_id);
out.reserve(value.len() + 2);
out.extend_from_slice(&field_id.to_be_bytes());
out.extend_from_slice(value.as_bytes());
}
}
@ -17,9 +17,10 @@ impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec {
type DItem = (FieldId, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?;
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let value = str::from_utf8(bytes).ok()?;
Some((*field_id, value))
Some((field_id, value))
}
}

View File

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::f64_into_bytes;
use crate::{DocumentId, FieldId};
use crate::{try_split_array_at, DocumentId, FieldId};
pub struct FieldDocIdFacetF64Codec;
@ -10,14 +10,15 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec {
type DItem = (FieldId, DocumentId, f64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?;
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (document_id_bytes, bytes) = bytes.split_at(4);
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
let document_id = u32::from_be_bytes(document_id_bytes);
// Checked range lookup: a key shorter than 16 remaining bytes makes the decoder
// return `None` instead of panicking on an out-of-bounds slice index.
let value = bytes.get(8..16)?.try_into().map(f64::from_be_bytes).ok()?;
Some((*field_id, document_id, value))
Some((field_id, document_id, value))
}
}
@ -25,8 +26,8 @@ impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec {
type EItem = (FieldId, DocumentId, f64);
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8);
bytes.push(*field_id);
let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8);
bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.extend_from_slice(&document_id.to_be_bytes());
let value_bytes = f64_into_bytes(*value)?;
bytes.extend_from_slice(&value_bytes);

View File

@ -1,8 +1,7 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
use crate::{DocumentId, FieldId};
use crate::{try_split_array_at, DocumentId, FieldId};
pub struct FieldDocIdFacetStringCodec;
@ -13,8 +12,8 @@ impl FieldDocIdFacetStringCodec {
value: &str,
out: &mut Vec<u8>,
) {
out.reserve(1 + 4 + value.len());
out.push(field_id);
out.reserve(2 + 4 + value.len());
out.extend_from_slice(&field_id.to_be_bytes());
out.extend_from_slice(&document_id.to_be_bytes());
out.extend_from_slice(value.as_bytes());
}
@ -24,11 +23,14 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
type DItem = (FieldId, DocumentId, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?;
let (document_id_bytes, bytes) = bytes.split_at(4);
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
let document_id = u32::from_be_bytes(document_id_bytes);
let value = str::from_utf8(bytes).ok()?;
Some((*field_id, document_id, value))
Some((field_id, document_id, value))
}
}

View File

@ -1,7 +1,6 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::FieldId;
use crate::{try_split_array_at, FieldId};
pub struct FieldIdWordCountCodec;
@ -9,7 +8,9 @@ impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
type DItem = (FieldId, u8);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?;
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let ([word_count], _nothing) = try_split_array_at(bytes)?;
Some((field_id, word_count))
}
}
@ -18,6 +19,9 @@ impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
type EItem = (FieldId, u8);
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
Some(Cow::Owned(vec![*field_id, *word_count]))
let mut bytes = Vec::with_capacity(2 + 1);
bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.push(*word_count);
Some(Cow::Owned(bytes))
}
}

View File

@ -1,19 +1,19 @@
use std::borrow::Cow;
use obkv::{KvReader, KvWriter};
use obkv::{KvReaderU16, KvWriterU16};
pub struct ObkvCodec;
impl<'a> heed::BytesDecode<'a> for ObkvCodec {
type DItem = KvReader<'a>;
type DItem = KvReaderU16<'a>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
Some(KvReader::new(bytes))
Some(KvReaderU16::new(bytes))
}
}
impl heed::BytesEncode<'_> for ObkvCodec {
type EItem = KvWriter<Vec<u8>>;
type EItem = KvWriterU16<Vec<u8>>;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
item.clone().into_inner().map(Cow::Owned).ok()

View File

@ -523,10 +523,11 @@ impl Index {
field_id: FieldId,
docids: &RoaringBitmap,
) -> heed::Result<()> {
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
.copy_from_slice(&field_id.to_be_bytes());
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
}
@ -536,10 +537,11 @@ impl Index {
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap> {
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
.copy_from_slice(&field_id.to_be_bytes());
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()),
@ -553,10 +555,11 @@ impl Index {
field_id: FieldId,
docids: &RoaringBitmap,
) -> heed::Result<()> {
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
.copy_from_slice(&field_id.to_be_bytes());
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
}
@ -569,7 +572,8 @@ impl Index {
// The key is the prefix followed by the two big-endian bytes of the u16 field id.
// With only one spare byte the `copy_from_slice` of `field_id.to_be_bytes()` below
// would panic because the source and destination lengths differ.
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
.copy_from_slice(&field_id.to_be_bytes());
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()),
@ -723,7 +727,7 @@ impl Index {
&self,
rtxn: &'t RoTxn,
ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> {
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
let mut documents = Vec::new();
for id in ids {
@ -741,7 +745,7 @@ impl Index {
pub fn all_documents<'t>(
&self,
rtxn: &'t RoTxn,
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReader<'t>)>>> {
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
Ok(self
.documents
.iter(rtxn)?

View File

@ -15,6 +15,7 @@ pub mod update;
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};
use std::hash::BuildHasherDefault;
use std::result::Result as StdResult;
@ -48,7 +49,7 @@ pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
pub type Attribute = u32;
pub type DocumentId = u32;
pub type FieldId = u8;
pub type FieldId = u16;
pub type Position = u32;
pub type FieldDistribution = BTreeMap<String, u64>;
@ -58,7 +59,7 @@ type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
pub fn obkv_to_json(
displayed_fields: &[FieldId],
fields_ids_map: &FieldsIdsMap,
obkv: obkv::KvReader,
obkv: obkv::KvReaderU16,
) -> Result<Map<String, Value>> {
displayed_fields
.iter()
@ -123,6 +124,26 @@ pub fn json_to_string(value: &Value) -> Option<String> {
}
}
/// Splits `slice` into the elements before `mid` and the elements from `mid` onwards.
///
/// Returns `None` when `mid` is greater than the length of the slice; `mid == len`
/// is accepted and yields an empty tail.
fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    // Both checked lookups succeed exactly when `mid <= slice.len()`.
    let head = slice.get(..mid)?;
    let tail = slice.get(mid..)?;
    Some((head, tail))
}
/// Splits off the first `N` elements of `slice` as an owned array and returns it
/// together with the remaining tail.
///
/// Returns `None` when the slice holds fewer than `N` elements, or when the
/// conversion of the head into `[T; N]` fails.
fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    let (prefix, rest) = match try_split_at(slice, N) {
        Some(parts) => parts,
        None => return None,
    };
    // `prefix` is exactly `N` elements long here; the conversion error is discarded.
    let array: [T; N] = prefix.try_into().ok()?;
    Some((array, rest))
}
#[cfg(test)]
mod tests {
use serde_json::json;

View File

@ -78,7 +78,7 @@ impl<'a> FacetDistribution<'a> {
K: fmt::Display,
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
{
let mut key_buffer = vec![field_id];
// Seed the key buffer with the big-endian bytes of the field id.
let mut key_buffer = field_id.to_be_bytes().to_vec();
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
    // Reset the buffer to the field id prefix only. `FieldId` is a `u16`, so
    // truncating to a single byte would drop half of the id and corrupt the key.
    key_buffer.truncate(std::mem::size_of::<FieldId>());
@ -157,7 +157,7 @@ impl<'a> FacetDistribution<'a> {
.index
.facet_id_string_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &[field_id])?
.prefix_iter(self.rtxn, &field_id.to_be_bytes())?
.remap_key_type::<FacetValueStringCodec>();
for result in iter {

View File

@ -187,7 +187,7 @@ impl<'t> FacetIter<'t> {
) -> heed::Result<Option<u8>> {
let level = db
.remap_types::<ByteSlice, DecodeIgnore>()
.prefix_iter(rtxn, &[fid][..])?
.prefix_iter(rtxn, &fid.to_be_bytes())?
.remap_key_type::<FacetLevelValueF64Codec>()
.last()
.transpose()?

View File

@ -430,8 +430,10 @@ where
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
F: Fn(K) -> DocumentId,
{
let mut iter =
db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>();
let mut iter = db
.remap_key_type::<ByteSlice>()
.prefix_iter_mut(wtxn, &field_id.to_be_bytes())?
.remap_key_type::<C>();
while let Some(result) = iter.next() {
let (key, ()) = result?;

View File

@ -15,7 +15,7 @@ use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::{
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
};
use crate::{Index, Result};
use crate::{FieldId, Index, Result};
pub struct Facets<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -119,7 +119,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
fn clear_field_number_levels<'t>(
wtxn: &'t mut heed::RwTxn,
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
field_id: u8,
field_id: FieldId,
) -> heed::Result<()> {
let left = (field_id, 1, f64::MIN, f64::MIN);
let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
@ -135,11 +135,11 @@ fn compute_facet_number_levels<'t>(
shrink_size: Option<u64>,
level_group_size: NonZeroUsize,
min_level_size: NonZeroUsize,
field_id: u8,
field_id: FieldId,
) -> Result<Reader<FileFuse>> {
let first_level_size = db
.remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &[field_id])?
.prefix_iter(rtxn, &field_id.to_be_bytes())?
.remap_types::<DecodeIgnore, DecodeIgnore>()
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
@ -196,11 +196,11 @@ fn compute_facet_number_levels<'t>(
fn compute_faceted_documents_ids(
rtxn: &heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8,
field_id: FieldId,
) -> Result<RoaringBitmap> {
let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &[field_id])? {
for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
let (_key, docids) = result?;
documents_ids |= docids;
}
@ -210,7 +210,7 @@ fn compute_faceted_documents_ids(
fn write_number_entry(
writer: &mut Writer<File>,
field_id: u8,
field_id: FieldId,
level: u8,
left: f64,
right: f64,

View File

@ -40,7 +40,7 @@ pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
Ok(values.first().unwrap().to_vec())
}
pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mut Vec<u8>) {
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};

View File

@ -7,6 +7,7 @@ use std::time::Instant;
use std::{cmp, iter};
use bstr::ByteSlice as _;
use concat_arrays::concat_arrays;
use fst::Set;
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
use heed::BytesEncode;
@ -776,7 +777,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
for ((fid, count), docids) in self.field_id_word_count_docids {
docids_buffer.clear();
CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer);
self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?;
let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]);
self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?;
}
let fst = builder.into_set();

View File

@ -626,7 +626,7 @@ mod test {
Some("tata".to_string()),
false,
);
assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
assert_eq!(result.unwrap(), (0, "toto".to_string()));
assert_eq!(fields_map.len(), 1);
}
@ -635,7 +635,7 @@ mod test {
let mut fields_map = FieldsIdsMap::new();
let result =
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
assert_eq!(result.unwrap(), (0, "tata".to_string()));
assert_eq!(fields_map.len(), 1);
}
@ -643,7 +643,7 @@ mod test {
fn should_return_default_if_both_are_none() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
assert_eq!(result.unwrap(), (0u8, "id".to_string()));
assert_eq!(result.unwrap(), (0, "id".to_string()));
assert_eq!(fields_map.len(), 1);
}