285: Support documents with at most 65536 fields r=Kerollmops a=Kerollmops

Fixes #248.

In this PR I updated the `obkv` crate; it now supports arbitrary key lengths, and therefore I was able to use a `u16` to represent the fields instead of a single byte. It was impressively easy to update the whole codebase 🍡 🍔

Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2021-07-06 16:44:51 +00:00 committed by GitHub
commit 4c9531bdf3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 146 additions and 67 deletions

16
Cargo.lock generated
View File

@ -341,6 +341,17 @@ dependencies = [
"unicode-width", "unicode-width",
] ]
[[package]]
name = "concat-arrays"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747"
dependencies = [
"proc-macro2 1.0.27",
"quote 1.0.9",
"syn 1.0.73",
]
[[package]] [[package]]
name = "convert_case" name = "convert_case"
version = "0.4.0" version = "0.4.0"
@ -1378,6 +1389,7 @@ dependencies = [
"bstr", "bstr",
"byteorder", "byteorder",
"chrono", "chrono",
"concat-arrays",
"csv", "csv",
"either", "either",
"flate2", "flate2",
@ -1609,9 +1621,9 @@ dependencies = [
[[package]] [[package]]
name = "obkv" name = "obkv"
version = "0.1.1" version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8" checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
[[package]] [[package]]
name = "once_cell" name = "once_cell"

View File

@ -7,7 +7,7 @@ use byte_unit::Byte;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use milli::facet::FacetType; use milli::facet::FacetType;
use milli::index::db_name::*; use milli::index::db_name::*;
use milli::{Index, TreeLevel}; use milli::{FieldId, Index, TreeLevel};
use structopt::StructOpt; use structopt::StructOpt;
use Command::*; use Command::*;
@ -322,7 +322,7 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow:
fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>(
rtxn: &'txn heed::RoTxn, rtxn: &'txn heed::RoTxn,
db: heed::Database<KC, DC>, db: heed::Database<KC, DC>,
field_id: u8, field_id: FieldId,
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>> ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
where where
KC: heed::BytesDecode<'txn>, KC: heed::BytesDecode<'txn>,
@ -330,7 +330,7 @@ where
{ {
let iter = db let iter = db
.remap_key_type::<heed::types::ByteSlice>() .remap_key_type::<heed::types::ByteSlice>()
.prefix_iter(&rtxn, &[field_id])? .prefix_iter(&rtxn, &field_id.to_be_bytes())?
.remap_key_type::<KC>(); .remap_key_type::<KC>();
Ok(Box::new(iter)) Ok(Box::new(iter))

View File

@ -8,6 +8,7 @@ edition = "2018"
bstr = "0.2.15" bstr = "0.2.15"
byteorder = "1.4.2" byteorder = "1.4.2"
chrono = { version = "0.4.19", features = ["serde"] } chrono = { version = "0.4.19", features = ["serde"] }
concat-arrays = "0.1.2"
csv = "1.1.5" csv = "1.1.5"
either = "1.6.1" either = "1.6.1"
flate2 = "1.0.20" flate2 = "1.0.20"
@ -20,7 +21,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
linked-hash-map = "0.5.4" linked-hash-map = "0.5.4"
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" }
memmap = "0.7.0" memmap = "0.7.0"
obkv = "0.1.1" obkv = "0.2.0"
once_cell = "1.5.2" once_cell = "1.5.2"
ordered-float = "2.1.1" ordered-float = "2.1.1"
rayon = "1.5.0" rayon = "1.5.0"

View File

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::convert::TryInto; use std::convert::TryInto;
use crate::facet::value_encoding::f64_into_bytes; use crate::facet::value_encoding::f64_into_bytes;
use crate::FieldId; use crate::{try_split_array_at, FieldId};
// TODO do not de/serialize right bound when level = 0 // TODO do not de/serialize right bound when level = 0
pub struct FacetLevelValueF64Codec; pub struct FacetLevelValueF64Codec;
@ -11,7 +11,8 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
type DItem = (FieldId, u8, f64, f64); type DItem = (FieldId, u8, f64, f64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?; let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (level, bytes) = bytes.split_first()?; let (level, bytes) = bytes.split_first()?;
let (left, right) = if *level != 0 { let (left, right) = if *level != 0 {
@ -23,7 +24,7 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
(left, left) (left, left)
}; };
Some((*field_id, *level, left, right)) Some((field_id, *level, left, right))
} }
} }
@ -61,8 +62,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
16 // length 16 // length
}; };
let mut bytes = Vec::with_capacity(len + 2); let mut bytes = Vec::with_capacity(len + 3);
bytes.push(*field_id); bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.push(*level); bytes.push(*level);
bytes.extend_from_slice(&buffer[..len]); bytes.extend_from_slice(&buffer[..len]);
Some(Cow::Owned(bytes)) Some(Cow::Owned(bytes))

View File

@ -1,14 +1,14 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::str; use std::str;
use crate::FieldId; use crate::{try_split_array_at, FieldId};
pub struct FacetValueStringCodec; pub struct FacetValueStringCodec;
impl FacetValueStringCodec { impl FacetValueStringCodec {
pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) { pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
out.reserve(value.len() + 1); out.reserve(value.len() + 2);
out.push(field_id); out.extend_from_slice(&field_id.to_be_bytes());
out.extend_from_slice(value.as_bytes()); out.extend_from_slice(value.as_bytes());
} }
} }
@ -17,9 +17,10 @@ impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec {
type DItem = (FieldId, &'a str); type DItem = (FieldId, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?; let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let value = str::from_utf8(bytes).ok()?; let value = str::from_utf8(bytes).ok()?;
Some((*field_id, value)) Some((field_id, value))
} }
} }

View File

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::convert::TryInto; use std::convert::TryInto;
use crate::facet::value_encoding::f64_into_bytes; use crate::facet::value_encoding::f64_into_bytes;
use crate::{DocumentId, FieldId}; use crate::{try_split_array_at, DocumentId, FieldId};
pub struct FieldDocIdFacetF64Codec; pub struct FieldDocIdFacetF64Codec;
@ -10,14 +10,15 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec {
type DItem = (FieldId, DocumentId, f64); type DItem = (FieldId, DocumentId, f64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?; let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (document_id_bytes, bytes) = bytes.split_at(4); let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; let document_id = u32::from_be_bytes(document_id_bytes);
let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?;
Some((*field_id, document_id, value)) Some((field_id, document_id, value))
} }
} }
@ -25,8 +26,8 @@ impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec {
type EItem = (FieldId, DocumentId, f64); type EItem = (FieldId, DocumentId, f64);
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> { fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8); let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8);
bytes.push(*field_id); bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.extend_from_slice(&document_id.to_be_bytes()); bytes.extend_from_slice(&document_id.to_be_bytes());
let value_bytes = f64_into_bytes(*value)?; let value_bytes = f64_into_bytes(*value)?;
bytes.extend_from_slice(&value_bytes); bytes.extend_from_slice(&value_bytes);

View File

@ -1,8 +1,7 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::convert::TryInto;
use std::str; use std::str;
use crate::{DocumentId, FieldId}; use crate::{try_split_array_at, DocumentId, FieldId};
pub struct FieldDocIdFacetStringCodec; pub struct FieldDocIdFacetStringCodec;
@ -13,8 +12,8 @@ impl FieldDocIdFacetStringCodec {
value: &str, value: &str,
out: &mut Vec<u8>, out: &mut Vec<u8>,
) { ) {
out.reserve(1 + 4 + value.len()); out.reserve(2 + 4 + value.len());
out.push(field_id); out.extend_from_slice(&field_id.to_be_bytes());
out.extend_from_slice(&document_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes());
out.extend_from_slice(value.as_bytes()); out.extend_from_slice(value.as_bytes());
} }
@ -24,11 +23,14 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
type DItem = (FieldId, DocumentId, &'a str); type DItem = (FieldId, DocumentId, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?; let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let (document_id_bytes, bytes) = bytes.split_at(4); let field_id = u16::from_be_bytes(field_id_bytes);
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
let document_id = u32::from_be_bytes(document_id_bytes);
let value = str::from_utf8(bytes).ok()?; let value = str::from_utf8(bytes).ok()?;
Some((*field_id, document_id, value)) Some((field_id, document_id, value))
} }
} }

View File

@ -1,7 +1,6 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::convert::TryInto;
use crate::FieldId; use crate::{try_split_array_at, FieldId};
pub struct FieldIdWordCountCodec; pub struct FieldIdWordCountCodec;
@ -9,7 +8,9 @@ impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
type DItem = (FieldId, u8); type DItem = (FieldId, u8);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?; let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let ([word_count], _nothing) = try_split_array_at(bytes)?;
Some((field_id, word_count)) Some((field_id, word_count))
} }
} }
@ -18,6 +19,9 @@ impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
type EItem = (FieldId, u8); type EItem = (FieldId, u8);
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> { fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
Some(Cow::Owned(vec![*field_id, *word_count])) let mut bytes = Vec::with_capacity(2 + 1);
bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.push(*word_count);
Some(Cow::Owned(bytes))
} }
} }

View File

@ -1,19 +1,19 @@
use std::borrow::Cow; use std::borrow::Cow;
use obkv::{KvReader, KvWriter}; use obkv::{KvReaderU16, KvWriterU16};
pub struct ObkvCodec; pub struct ObkvCodec;
impl<'a> heed::BytesDecode<'a> for ObkvCodec { impl<'a> heed::BytesDecode<'a> for ObkvCodec {
type DItem = KvReader<'a>; type DItem = KvReaderU16<'a>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
Some(KvReader::new(bytes)) Some(KvReaderU16::new(bytes))
} }
} }
impl heed::BytesEncode<'_> for ObkvCodec { impl heed::BytesEncode<'_> for ObkvCodec {
type EItem = KvWriter<Vec<u8>>; type EItem = KvWriterU16<Vec<u8>>;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> { fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
item.clone().into_inner().map(Cow::Owned).ok() item.clone().into_inner().map(Cow::Owned).ok()

View File

@ -523,10 +523,11 @@ impl Index {
field_id: FieldId, field_id: FieldId,
docids: &RoaringBitmap, docids: &RoaringBitmap,
) -> heed::Result<()> { ) -> heed::Result<()> {
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id; buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
.copy_from_slice(&field_id.to_be_bytes());
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
} }
@ -536,10 +537,11 @@ impl Index {
rtxn: &RoTxn, rtxn: &RoTxn,
field_id: FieldId, field_id: FieldId,
) -> heed::Result<RoaringBitmap> { ) -> heed::Result<RoaringBitmap> {
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id; buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
.copy_from_slice(&field_id.to_be_bytes());
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids), Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()), None => Ok(RoaringBitmap::new()),
@ -553,10 +555,11 @@ impl Index {
field_id: FieldId, field_id: FieldId,
docids: &RoaringBitmap, docids: &RoaringBitmap,
) -> heed::Result<()> { ) -> heed::Result<()> {
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id; buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
.copy_from_slice(&field_id.to_be_bytes());
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
} }
@ -569,7 +572,8 @@ impl Index {
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id; buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
.copy_from_slice(&field_id.to_be_bytes());
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids), Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()), None => Ok(RoaringBitmap::new()),
@ -723,7 +727,7 @@ impl Index {
&self, &self,
rtxn: &'t RoTxn, rtxn: &'t RoTxn,
ids: impl IntoIterator<Item = DocumentId>, ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> { ) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
let mut documents = Vec::new(); let mut documents = Vec::new();
for id in ids { for id in ids {
@ -741,7 +745,7 @@ impl Index {
pub fn all_documents<'t>( pub fn all_documents<'t>(
&self, &self,
rtxn: &'t RoTxn, rtxn: &'t RoTxn,
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReader<'t>)>>> { ) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
Ok(self Ok(self
.documents .documents
.iter(rtxn)? .iter(rtxn)?

View File

@ -15,6 +15,7 @@ pub mod update;
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap}; use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};
use std::hash::BuildHasherDefault; use std::hash::BuildHasherDefault;
use std::result::Result as StdResult; use std::result::Result as StdResult;
@ -48,7 +49,7 @@ pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
pub type Attribute = u32; pub type Attribute = u32;
pub type DocumentId = u32; pub type DocumentId = u32;
pub type FieldId = u8; pub type FieldId = u16;
pub type Position = u32; pub type Position = u32;
pub type FieldDistribution = BTreeMap<String, u64>; pub type FieldDistribution = BTreeMap<String, u64>;
@ -58,7 +59,7 @@ type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
pub fn obkv_to_json( pub fn obkv_to_json(
displayed_fields: &[FieldId], displayed_fields: &[FieldId],
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
obkv: obkv::KvReader, obkv: obkv::KvReaderU16,
) -> Result<Map<String, Value>> { ) -> Result<Map<String, Value>> {
displayed_fields displayed_fields
.iter() .iter()
@ -123,6 +124,26 @@ pub fn json_to_string(value: &Value) -> Option<String> {
} }
} }
/// Splits `slice` into the sub-slices `[..mid]` and `[mid..]`.
///
/// Returns `None` instead of panicking when `mid` is greater than `slice.len()`.
fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    // `split_at` panics when `mid > len`, so reject that case up front.
    if mid > slice.len() {
        return None;
    }
    Some(slice.split_at(mid))
}
/// Converts the first `N` elements of `slice` into an array and returns it
/// together with the remaining tail.
///
/// Returns `None` if `slice` holds fewer than `N` elements or if the
/// slice-to-array conversion fails.
fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    let (head, tail) = try_split_at(slice, N)?;
    match head.try_into() {
        Ok(array) => Some((array, tail)),
        Err(_) => None,
    }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use serde_json::json; use serde_json::json;

View File

@ -78,7 +78,7 @@ impl<'a> FacetDistribution<'a> {
K: fmt::Display, K: fmt::Display,
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
{ {
let mut key_buffer = vec![field_id]; let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
key_buffer.truncate(1); key_buffer.truncate(2);
@ -157,7 +157,7 @@ impl<'a> FacetDistribution<'a> {
.index .index
.facet_id_string_docids .facet_id_string_docids
.remap_key_type::<ByteSlice>() .remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &[field_id])? .prefix_iter(self.rtxn, &field_id.to_be_bytes())?
.remap_key_type::<FacetValueStringCodec>(); .remap_key_type::<FacetValueStringCodec>();
for result in iter { for result in iter {

View File

@ -187,7 +187,7 @@ impl<'t> FacetIter<'t> {
) -> heed::Result<Option<u8>> { ) -> heed::Result<Option<u8>> {
let level = db let level = db
.remap_types::<ByteSlice, DecodeIgnore>() .remap_types::<ByteSlice, DecodeIgnore>()
.prefix_iter(rtxn, &[fid][..])? .prefix_iter(rtxn, &fid.to_be_bytes())?
.remap_key_type::<FacetLevelValueF64Codec>() .remap_key_type::<FacetLevelValueF64Codec>()
.last() .last()
.transpose()? .transpose()?

View File

@ -430,8 +430,10 @@ where
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
F: Fn(K) -> DocumentId, F: Fn(K) -> DocumentId,
{ {
let mut iter = let mut iter = db
db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>(); .remap_key_type::<ByteSlice>()
.prefix_iter_mut(wtxn, &field_id.to_be_bytes())?
.remap_key_type::<C>();
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (key, ()) = result?; let (key, ()) = result?;

View File

@ -15,7 +15,7 @@ use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::{ use crate::update::index_documents::{
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
}; };
use crate::{Index, Result}; use crate::{FieldId, Index, Result};
pub struct Facets<'t, 'u, 'i> { pub struct Facets<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -119,7 +119,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
fn clear_field_number_levels<'t>( fn clear_field_number_levels<'t>(
wtxn: &'t mut heed::RwTxn, wtxn: &'t mut heed::RwTxn,
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
field_id: u8, field_id: FieldId,
) -> heed::Result<()> { ) -> heed::Result<()> {
let left = (field_id, 1, f64::MIN, f64::MIN); let left = (field_id, 1, f64::MIN, f64::MIN);
let right = (field_id, u8::MAX, f64::MAX, f64::MAX); let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
@ -135,11 +135,11 @@ fn compute_facet_number_levels<'t>(
shrink_size: Option<u64>, shrink_size: Option<u64>,
level_group_size: NonZeroUsize, level_group_size: NonZeroUsize,
min_level_size: NonZeroUsize, min_level_size: NonZeroUsize,
field_id: u8, field_id: FieldId,
) -> Result<Reader<FileFuse>> { ) -> Result<Reader<FileFuse>> {
let first_level_size = db let first_level_size = db
.remap_key_type::<ByteSlice>() .remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &[field_id])? .prefix_iter(rtxn, &field_id.to_be_bytes())?
.remap_types::<DecodeIgnore, DecodeIgnore>() .remap_types::<DecodeIgnore, DecodeIgnore>()
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
@ -196,11 +196,11 @@ fn compute_facet_number_levels<'t>(
fn compute_faceted_documents_ids( fn compute_faceted_documents_ids(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8, field_id: FieldId,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let mut documents_ids = RoaringBitmap::new(); let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &[field_id])? { for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
let (_key, docids) = result?; let (_key, docids) = result?;
documents_ids |= docids; documents_ids |= docids;
} }
@ -210,7 +210,7 @@ fn compute_faceted_documents_ids(
fn write_number_entry( fn write_number_entry(
writer: &mut Writer<File>, writer: &mut Writer<File>,
field_id: u8, field_id: FieldId,
level: u8, level: u8,
left: f64, left: f64,
right: f64, right: f64,

View File

@ -40,7 +40,7 @@ pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
Ok(values.first().unwrap().to_vec()) Ok(values.first().unwrap().to_vec())
} }
pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mut Vec<u8>) { pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
use itertools::merge_join_by; use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right}; use itertools::EitherOrBoth::{Both, Left, Right};

View File

@ -842,10 +842,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
mod tests { mod tests {
use std::io::Cursor; use std::io::Cursor;
use big_s::S;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use super::*; use super::*;
use crate::update::DeleteDocuments; use crate::update::DeleteDocuments;
use crate::HashMap;
#[test] #[test]
fn simple_document_replacement() { fn simple_document_replacement() {
@ -1352,4 +1354,30 @@ mod tests {
wtxn.commit().unwrap(); wtxn.commit().unwrap();
} }
/// Indexes a single JSON document carrying an `id` plus 1000 extra fields and
/// checks that the indexing and the commit both succeed.
#[test]
fn index_more_than_256_fields() {
    let path = tempfile::tempdir().unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(options, &path).unwrap();

    // One document: the `id` field plus 1000 numbered text fields.
    let mut big_object = HashMap::new();
    big_object.insert(S("id"), "wow");
    big_object.extend((0..1000).map(|i| (i.to_string(), "I am a text!")));
    let content = serde_json::to_string(&vec![big_object]).unwrap();

    let mut wtxn = index.write_txn().unwrap();
    let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
    builder.update_format(UpdateFormat::Json);
    builder.execute(Cursor::new(content), |_, _| ()).unwrap();
    wtxn.commit().unwrap();
}
} }

View File

@ -7,6 +7,7 @@ use std::time::Instant;
use std::{cmp, iter}; use std::{cmp, iter};
use bstr::ByteSlice as _; use bstr::ByteSlice as _;
use concat_arrays::concat_arrays;
use fst::Set; use fst::Set;
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
use heed::BytesEncode; use heed::BytesEncode;
@ -776,7 +777,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
for ((fid, count), docids) in self.field_id_word_count_docids { for ((fid, count), docids) in self.field_id_word_count_docids {
docids_buffer.clear(); docids_buffer.clear();
CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer); CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer);
self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?; let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]);
self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?;
} }
let fst = builder.into_set(); let fst = builder.into_set();

View File

@ -626,7 +626,7 @@ mod test {
Some("tata".to_string()), Some("tata".to_string()),
false, false,
); );
assert_eq!(result.unwrap(), (0u8, "toto".to_string())); assert_eq!(result.unwrap(), (0, "toto".to_string()));
assert_eq!(fields_map.len(), 1); assert_eq!(fields_map.len(), 1);
} }
@ -635,7 +635,7 @@ mod test {
let mut fields_map = FieldsIdsMap::new(); let mut fields_map = FieldsIdsMap::new();
let result = let result =
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
assert_eq!(result.unwrap(), (0u8, "tata".to_string())); assert_eq!(result.unwrap(), (0, "tata".to_string()));
assert_eq!(fields_map.len(), 1); assert_eq!(fields_map.len(), 1);
} }
@ -643,7 +643,7 @@ mod test {
fn should_return_default_if_both_are_none() { fn should_return_default_if_both_are_none() {
let mut fields_map = FieldsIdsMap::new(); let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(None, &mut fields_map, None, true); let result = compute_primary_key_pair(None, &mut fields_map, None, true);
assert_eq!(result.unwrap(), (0u8, "id".to_string())); assert_eq!(result.unwrap(), (0, "id".to_string()));
assert_eq!(fields_map.len(), 1); assert_eq!(fields_map.len(), 1);
} }