mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
Merge #285
285: Support documents with at most 65536 fields r=Kerollmops a=Kerollmops Fixes #248. In this PR I updated the `obkv` crate, it now supports arbitrary key length and therefore I was able to use an `u16` to represent the fields instead of a single byte. It was impressively easy to update the whole codebase 🍡 🍔 Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
commit
4c9531bdf3
16
Cargo.lock
generated
16
Cargo.lock
generated
@ -341,6 +341,17 @@ dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "concat-arrays"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747"
|
||||
dependencies = [
|
||||
"proc-macro2 1.0.27",
|
||||
"quote 1.0.9",
|
||||
"syn 1.0.73",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "convert_case"
|
||||
version = "0.4.0"
|
||||
@ -1378,6 +1389,7 @@ dependencies = [
|
||||
"bstr",
|
||||
"byteorder",
|
||||
"chrono",
|
||||
"concat-arrays",
|
||||
"csv",
|
||||
"either",
|
||||
"flate2",
|
||||
@ -1609,9 +1621,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "obkv"
|
||||
version = "0.1.1"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8"
|
||||
checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
|
@ -7,7 +7,7 @@ use byte_unit::Byte;
|
||||
use heed::EnvOpenOptions;
|
||||
use milli::facet::FacetType;
|
||||
use milli::index::db_name::*;
|
||||
use milli::{Index, TreeLevel};
|
||||
use milli::{FieldId, Index, TreeLevel};
|
||||
use structopt::StructOpt;
|
||||
use Command::*;
|
||||
|
||||
@ -322,7 +322,7 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow:
|
||||
fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>(
|
||||
rtxn: &'txn heed::RoTxn,
|
||||
db: heed::Database<KC, DC>,
|
||||
field_id: u8,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
|
||||
where
|
||||
KC: heed::BytesDecode<'txn>,
|
||||
@ -330,7 +330,7 @@ where
|
||||
{
|
||||
let iter = db
|
||||
.remap_key_type::<heed::types::ByteSlice>()
|
||||
.prefix_iter(&rtxn, &[field_id])?
|
||||
.prefix_iter(&rtxn, &field_id.to_be_bytes())?
|
||||
.remap_key_type::<KC>();
|
||||
|
||||
Ok(Box::new(iter))
|
||||
|
@ -8,6 +8,7 @@ edition = "2018"
|
||||
bstr = "0.2.15"
|
||||
byteorder = "1.4.2"
|
||||
chrono = { version = "0.4.19", features = ["serde"] }
|
||||
concat-arrays = "0.1.2"
|
||||
csv = "1.1.5"
|
||||
either = "1.6.1"
|
||||
flate2 = "1.0.20"
|
||||
@ -20,7 +21,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||
linked-hash-map = "0.5.4"
|
||||
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" }
|
||||
memmap = "0.7.0"
|
||||
obkv = "0.1.1"
|
||||
obkv = "0.2.0"
|
||||
once_cell = "1.5.2"
|
||||
ordered-float = "2.1.1"
|
||||
rayon = "1.5.0"
|
||||
|
@ -2,7 +2,7 @@ use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::FieldId;
|
||||
use crate::{try_split_array_at, FieldId};
|
||||
|
||||
// TODO do not de/serialize right bound when level = 0
|
||||
pub struct FacetLevelValueF64Codec;
|
||||
@ -11,7 +11,8 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
|
||||
type DItem = (FieldId, u8, f64, f64);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id, bytes) = bytes.split_first()?;
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
let (level, bytes) = bytes.split_first()?;
|
||||
|
||||
let (left, right) = if *level != 0 {
|
||||
@ -23,7 +24,7 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
|
||||
(left, left)
|
||||
};
|
||||
|
||||
Some((*field_id, *level, left, right))
|
||||
Some((field_id, *level, left, right))
|
||||
}
|
||||
}
|
||||
|
||||
@ -61,8 +62,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
|
||||
16 // length
|
||||
};
|
||||
|
||||
let mut bytes = Vec::with_capacity(len + 2);
|
||||
bytes.push(*field_id);
|
||||
let mut bytes = Vec::with_capacity(len + 3);
|
||||
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||
bytes.push(*level);
|
||||
bytes.extend_from_slice(&buffer[..len]);
|
||||
Some(Cow::Owned(bytes))
|
||||
|
@ -1,14 +1,14 @@
|
||||
use std::borrow::Cow;
|
||||
use std::str;
|
||||
|
||||
use crate::FieldId;
|
||||
use crate::{try_split_array_at, FieldId};
|
||||
|
||||
pub struct FacetValueStringCodec;
|
||||
|
||||
impl FacetValueStringCodec {
|
||||
pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
|
||||
out.reserve(value.len() + 1);
|
||||
out.push(field_id);
|
||||
out.reserve(value.len() + 2);
|
||||
out.extend_from_slice(&field_id.to_be_bytes());
|
||||
out.extend_from_slice(value.as_bytes());
|
||||
}
|
||||
}
|
||||
@ -17,9 +17,10 @@ impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec {
|
||||
type DItem = (FieldId, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id, bytes) = bytes.split_first()?;
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
let value = str::from_utf8(bytes).ok()?;
|
||||
Some((*field_id, value))
|
||||
Some((field_id, value))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2,7 +2,7 @@ use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::{DocumentId, FieldId};
|
||||
use crate::{try_split_array_at, DocumentId, FieldId};
|
||||
|
||||
pub struct FieldDocIdFacetF64Codec;
|
||||
|
||||
@ -10,14 +10,15 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec {
|
||||
type DItem = (FieldId, DocumentId, f64);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id, bytes) = bytes.split_first()?;
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, bytes) = bytes.split_at(4);
|
||||
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
||||
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?;
|
||||
|
||||
Some((*field_id, document_id, value))
|
||||
Some((field_id, document_id, value))
|
||||
}
|
||||
}
|
||||
|
||||
@ -25,8 +26,8 @@ impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec {
|
||||
type EItem = (FieldId, DocumentId, f64);
|
||||
|
||||
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8);
|
||||
bytes.push(*field_id);
|
||||
let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8);
|
||||
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||
bytes.extend_from_slice(&document_id.to_be_bytes());
|
||||
let value_bytes = f64_into_bytes(*value)?;
|
||||
bytes.extend_from_slice(&value_bytes);
|
||||
|
@ -1,8 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::str;
|
||||
|
||||
use crate::{DocumentId, FieldId};
|
||||
use crate::{try_split_array_at, DocumentId, FieldId};
|
||||
|
||||
pub struct FieldDocIdFacetStringCodec;
|
||||
|
||||
@ -13,8 +12,8 @@ impl FieldDocIdFacetStringCodec {
|
||||
value: &str,
|
||||
out: &mut Vec<u8>,
|
||||
) {
|
||||
out.reserve(1 + 4 + value.len());
|
||||
out.push(field_id);
|
||||
out.reserve(2 + 4 + value.len());
|
||||
out.extend_from_slice(&field_id.to_be_bytes());
|
||||
out.extend_from_slice(&document_id.to_be_bytes());
|
||||
out.extend_from_slice(value.as_bytes());
|
||||
}
|
||||
@ -24,11 +23,14 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
|
||||
type DItem = (FieldId, DocumentId, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id, bytes) = bytes.split_first()?;
|
||||
let (document_id_bytes, bytes) = bytes.split_at(4);
|
||||
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let value = str::from_utf8(bytes).ok()?;
|
||||
Some((*field_id, document_id, value))
|
||||
Some((field_id, document_id, value))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use crate::FieldId;
|
||||
use crate::{try_split_array_at, FieldId};
|
||||
|
||||
pub struct FieldIdWordCountCodec;
|
||||
|
||||
@ -9,7 +8,9 @@ impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
|
||||
type DItem = (FieldId, u8);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?;
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
let ([word_count], _nothing) = try_split_array_at(bytes)?;
|
||||
Some((field_id, word_count))
|
||||
}
|
||||
}
|
||||
@ -18,6 +19,9 @@ impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
|
||||
type EItem = (FieldId, u8);
|
||||
|
||||
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
Some(Cow::Owned(vec![*field_id, *word_count]))
|
||||
let mut bytes = Vec::with_capacity(2 + 1);
|
||||
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||
bytes.push(*word_count);
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -1,19 +1,19 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use obkv::{KvReaderU16, KvWriterU16};
|
||||
|
||||
pub struct ObkvCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for ObkvCodec {
|
||||
type DItem = KvReader<'a>;
|
||||
type DItem = KvReaderU16<'a>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
Some(KvReader::new(bytes))
|
||||
Some(KvReaderU16::new(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for ObkvCodec {
|
||||
type EItem = KvWriter<Vec<u8>>;
|
||||
type EItem = KvWriterU16<Vec<u8>>;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
item.clone().into_inner().map(Cow::Owned).ok()
|
||||
|
@ -523,10 +523,11 @@ impl Index {
|
||||
field_id: FieldId,
|
||||
docids: &RoaringBitmap,
|
||||
) -> heed::Result<()> {
|
||||
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
|
||||
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||
*buffer.last_mut().unwrap() = field_id;
|
||||
buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
|
||||
.copy_from_slice(&field_id.to_be_bytes());
|
||||
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
|
||||
}
|
||||
|
||||
@ -536,10 +537,11 @@ impl Index {
|
||||
rtxn: &RoTxn,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<RoaringBitmap> {
|
||||
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
|
||||
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||
*buffer.last_mut().unwrap() = field_id;
|
||||
buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
|
||||
.copy_from_slice(&field_id.to_be_bytes());
|
||||
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
@ -553,10 +555,11 @@ impl Index {
|
||||
field_id: FieldId,
|
||||
docids: &RoaringBitmap,
|
||||
) -> heed::Result<()> {
|
||||
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
|
||||
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||
*buffer.last_mut().unwrap() = field_id;
|
||||
buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
|
||||
.copy_from_slice(&field_id.to_be_bytes());
|
||||
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
|
||||
}
|
||||
|
||||
@ -569,7 +572,8 @@ impl Index {
|
||||
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||
*buffer.last_mut().unwrap() = field_id;
|
||||
buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
|
||||
.copy_from_slice(&field_id.to_be_bytes());
|
||||
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
@ -723,7 +727,7 @@ impl Index {
|
||||
&self,
|
||||
rtxn: &'t RoTxn,
|
||||
ids: impl IntoIterator<Item = DocumentId>,
|
||||
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> {
|
||||
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
|
||||
let mut documents = Vec::new();
|
||||
|
||||
for id in ids {
|
||||
@ -741,7 +745,7 @@ impl Index {
|
||||
pub fn all_documents<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn,
|
||||
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReader<'t>)>>> {
|
||||
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
|
||||
Ok(self
|
||||
.documents
|
||||
.iter(rtxn)?
|
||||
|
@ -15,6 +15,7 @@ pub mod update;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::hash::BuildHasherDefault;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
@ -48,7 +49,7 @@ pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
||||
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
||||
pub type Attribute = u32;
|
||||
pub type DocumentId = u32;
|
||||
pub type FieldId = u8;
|
||||
pub type FieldId = u16;
|
||||
pub type Position = u32;
|
||||
pub type FieldDistribution = BTreeMap<String, u64>;
|
||||
|
||||
@ -58,7 +59,7 @@ type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
|
||||
pub fn obkv_to_json(
|
||||
displayed_fields: &[FieldId],
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
obkv: obkv::KvReader,
|
||||
obkv: obkv::KvReaderU16,
|
||||
) -> Result<Map<String, Value>> {
|
||||
displayed_fields
|
||||
.iter()
|
||||
@ -123,6 +124,26 @@ pub fn json_to_string(value: &Value) -> Option<String> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
|
||||
fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
|
||||
if mid <= slice.len() {
|
||||
Some(slice.split_at(mid))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Divides one slice into an array and the tail at an index,
|
||||
/// returns `None` if `N` is out of bounds.
|
||||
fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
|
||||
where
|
||||
[T; N]: for<'a> TryFrom<&'a [T]>,
|
||||
{
|
||||
let (head, tail) = try_split_at(slice, N)?;
|
||||
let head = head.try_into().ok()?;
|
||||
Some((head, tail))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_json::json;
|
||||
|
@ -78,7 +78,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
K: fmt::Display,
|
||||
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
|
||||
{
|
||||
let mut key_buffer = vec![field_id];
|
||||
let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();
|
||||
|
||||
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
|
||||
key_buffer.truncate(1);
|
||||
@ -157,7 +157,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
.index
|
||||
.facet_id_string_docids
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(self.rtxn, &[field_id])?
|
||||
.prefix_iter(self.rtxn, &field_id.to_be_bytes())?
|
||||
.remap_key_type::<FacetValueStringCodec>();
|
||||
|
||||
for result in iter {
|
||||
|
@ -187,7 +187,7 @@ impl<'t> FacetIter<'t> {
|
||||
) -> heed::Result<Option<u8>> {
|
||||
let level = db
|
||||
.remap_types::<ByteSlice, DecodeIgnore>()
|
||||
.prefix_iter(rtxn, &[fid][..])?
|
||||
.prefix_iter(rtxn, &fid.to_be_bytes())?
|
||||
.remap_key_type::<FacetLevelValueF64Codec>()
|
||||
.last()
|
||||
.transpose()?
|
||||
|
@ -430,8 +430,10 @@ where
|
||||
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
|
||||
F: Fn(K) -> DocumentId,
|
||||
{
|
||||
let mut iter =
|
||||
db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>();
|
||||
let mut iter = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter_mut(wtxn, &field_id.to_be_bytes())?
|
||||
.remap_key_type::<C>();
|
||||
|
||||
while let Some(result) = iter.next() {
|
||||
let (key, ()) = result?;
|
||||
|
@ -15,7 +15,7 @@ use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::update::index_documents::{
|
||||
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
|
||||
};
|
||||
use crate::{Index, Result};
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
pub struct Facets<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -119,7 +119,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
fn clear_field_number_levels<'t>(
|
||||
wtxn: &'t mut heed::RwTxn,
|
||||
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
field_id: u8,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<()> {
|
||||
let left = (field_id, 1, f64::MIN, f64::MIN);
|
||||
let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
|
||||
@ -135,11 +135,11 @@ fn compute_facet_number_levels<'t>(
|
||||
shrink_size: Option<u64>,
|
||||
level_group_size: NonZeroUsize,
|
||||
min_level_size: NonZeroUsize,
|
||||
field_id: u8,
|
||||
field_id: FieldId,
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
let first_level_size = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(rtxn, &[field_id])?
|
||||
.prefix_iter(rtxn, &field_id.to_be_bytes())?
|
||||
.remap_types::<DecodeIgnore, DecodeIgnore>()
|
||||
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
||||
|
||||
@ -196,11 +196,11 @@ fn compute_facet_number_levels<'t>(
|
||||
fn compute_faceted_documents_ids(
|
||||
rtxn: &heed::RoTxn,
|
||||
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||
field_id: u8,
|
||||
field_id: FieldId,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
|
||||
for result in db.prefix_iter(rtxn, &[field_id])? {
|
||||
for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
|
||||
let (_key, docids) = result?;
|
||||
documents_ids |= docids;
|
||||
}
|
||||
@ -210,7 +210,7 @@ fn compute_faceted_documents_ids(
|
||||
|
||||
fn write_number_entry(
|
||||
writer: &mut Writer<File>,
|
||||
field_id: u8,
|
||||
field_id: FieldId,
|
||||
level: u8,
|
||||
left: f64,
|
||||
right: f64,
|
||||
|
@ -40,7 +40,7 @@ pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
Ok(values.first().unwrap().to_vec())
|
||||
}
|
||||
|
||||
pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mut Vec<u8>) {
|
||||
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
|
@ -842,10 +842,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
mod tests {
|
||||
use std::io::Cursor;
|
||||
|
||||
use big_s::S;
|
||||
use heed::EnvOpenOptions;
|
||||
|
||||
use super::*;
|
||||
use crate::update::DeleteDocuments;
|
||||
use crate::HashMap;
|
||||
|
||||
#[test]
|
||||
fn simple_document_replacement() {
|
||||
@ -1352,4 +1354,30 @@ mod tests {
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn index_more_than_256_fields() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let mut big_object = HashMap::new();
|
||||
big_object.insert(S("id"), "wow");
|
||||
for i in 0..1000 {
|
||||
let key = i.to_string();
|
||||
big_object.insert(key, "I am a text!");
|
||||
}
|
||||
|
||||
let content = vec![big_object];
|
||||
let content = serde_json::to_string(&content).unwrap();
|
||||
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(Cursor::new(content), |_, _| ()).unwrap();
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
@ -7,6 +7,7 @@ use std::time::Instant;
|
||||
use std::{cmp, iter};
|
||||
|
||||
use bstr::ByteSlice as _;
|
||||
use concat_arrays::concat_arrays;
|
||||
use fst::Set;
|
||||
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
|
||||
use heed::BytesEncode;
|
||||
@ -776,7 +777,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
for ((fid, count), docids) in self.field_id_word_count_docids {
|
||||
docids_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer);
|
||||
self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?;
|
||||
let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]);
|
||||
self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?;
|
||||
}
|
||||
|
||||
let fst = builder.into_set();
|
||||
|
@ -626,7 +626,7 @@ mod test {
|
||||
Some("tata".to_string()),
|
||||
false,
|
||||
);
|
||||
assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
|
||||
assert_eq!(result.unwrap(), (0, "toto".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
|
||||
@ -635,7 +635,7 @@ mod test {
|
||||
let mut fields_map = FieldsIdsMap::new();
|
||||
let result =
|
||||
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
|
||||
assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
|
||||
assert_eq!(result.unwrap(), (0, "tata".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
|
||||
@ -643,7 +643,7 @@ mod test {
|
||||
fn should_return_default_if_both_are_none() {
|
||||
let mut fields_map = FieldsIdsMap::new();
|
||||
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
|
||||
assert_eq!(result.unwrap(), (0u8, "id".to_string()));
|
||||
assert_eq!(result.unwrap(), (0, "id".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user