mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 21:34:27 +01:00
Merge #285
285: Support documents with at most 65536 fields r=Kerollmops a=Kerollmops Fixes #248. In this PR I updated the `obkv` crate; it now supports arbitrary key lengths, so I was able to use a `u16` to represent the field IDs instead of a single byte. It was impressively easy to update the whole codebase 🍡 🍔 Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
commit
4c9531bdf3
16
Cargo.lock
generated
16
Cargo.lock
generated
@ -341,6 +341,17 @@ dependencies = [
|
|||||||
"unicode-width",
|
"unicode-width",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "concat-arrays"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2 1.0.27",
|
||||||
|
"quote 1.0.9",
|
||||||
|
"syn 1.0.73",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "convert_case"
|
name = "convert_case"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
@ -1378,6 +1389,7 @@ dependencies = [
|
|||||||
"bstr",
|
"bstr",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"chrono",
|
"chrono",
|
||||||
|
"concat-arrays",
|
||||||
"csv",
|
"csv",
|
||||||
"either",
|
"either",
|
||||||
"flate2",
|
"flate2",
|
||||||
@ -1609,9 +1621,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "obkv"
|
name = "obkv"
|
||||||
version = "0.1.1"
|
version = "0.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8"
|
checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "once_cell"
|
name = "once_cell"
|
||||||
|
@ -7,7 +7,7 @@ use byte_unit::Byte;
|
|||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use milli::facet::FacetType;
|
use milli::facet::FacetType;
|
||||||
use milli::index::db_name::*;
|
use milli::index::db_name::*;
|
||||||
use milli::{Index, TreeLevel};
|
use milli::{FieldId, Index, TreeLevel};
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
use Command::*;
|
use Command::*;
|
||||||
|
|
||||||
@ -322,7 +322,7 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow:
|
|||||||
fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>(
|
fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>(
|
||||||
rtxn: &'txn heed::RoTxn,
|
rtxn: &'txn heed::RoTxn,
|
||||||
db: heed::Database<KC, DC>,
|
db: heed::Database<KC, DC>,
|
||||||
field_id: u8,
|
field_id: FieldId,
|
||||||
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
|
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
|
||||||
where
|
where
|
||||||
KC: heed::BytesDecode<'txn>,
|
KC: heed::BytesDecode<'txn>,
|
||||||
@ -330,7 +330,7 @@ where
|
|||||||
{
|
{
|
||||||
let iter = db
|
let iter = db
|
||||||
.remap_key_type::<heed::types::ByteSlice>()
|
.remap_key_type::<heed::types::ByteSlice>()
|
||||||
.prefix_iter(&rtxn, &[field_id])?
|
.prefix_iter(&rtxn, &field_id.to_be_bytes())?
|
||||||
.remap_key_type::<KC>();
|
.remap_key_type::<KC>();
|
||||||
|
|
||||||
Ok(Box::new(iter))
|
Ok(Box::new(iter))
|
||||||
|
@ -8,6 +8,7 @@ edition = "2018"
|
|||||||
bstr = "0.2.15"
|
bstr = "0.2.15"
|
||||||
byteorder = "1.4.2"
|
byteorder = "1.4.2"
|
||||||
chrono = { version = "0.4.19", features = ["serde"] }
|
chrono = { version = "0.4.19", features = ["serde"] }
|
||||||
|
concat-arrays = "0.1.2"
|
||||||
csv = "1.1.5"
|
csv = "1.1.5"
|
||||||
either = "1.6.1"
|
either = "1.6.1"
|
||||||
flate2 = "1.0.20"
|
flate2 = "1.0.20"
|
||||||
@ -20,7 +21,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
|||||||
linked-hash-map = "0.5.4"
|
linked-hash-map = "0.5.4"
|
||||||
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" }
|
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" }
|
||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
obkv = "0.1.1"
|
obkv = "0.2.0"
|
||||||
once_cell = "1.5.2"
|
once_cell = "1.5.2"
|
||||||
ordered-float = "2.1.1"
|
ordered-float = "2.1.1"
|
||||||
rayon = "1.5.0"
|
rayon = "1.5.0"
|
||||||
|
@ -2,7 +2,7 @@ use std::borrow::Cow;
|
|||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
|
|
||||||
use crate::facet::value_encoding::f64_into_bytes;
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
use crate::FieldId;
|
use crate::{try_split_array_at, FieldId};
|
||||||
|
|
||||||
// TODO do not de/serialize right bound when level = 0
|
// TODO do not de/serialize right bound when level = 0
|
||||||
pub struct FacetLevelValueF64Codec;
|
pub struct FacetLevelValueF64Codec;
|
||||||
@ -11,7 +11,8 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
|
|||||||
type DItem = (FieldId, u8, f64, f64);
|
type DItem = (FieldId, u8, f64, f64);
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let (field_id, bytes) = bytes.split_first()?;
|
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||||
|
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||||
let (level, bytes) = bytes.split_first()?;
|
let (level, bytes) = bytes.split_first()?;
|
||||||
|
|
||||||
let (left, right) = if *level != 0 {
|
let (left, right) = if *level != 0 {
|
||||||
@ -23,7 +24,7 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
|
|||||||
(left, left)
|
(left, left)
|
||||||
};
|
};
|
||||||
|
|
||||||
Some((*field_id, *level, left, right))
|
Some((field_id, *level, left, right))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -61,8 +62,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
|
|||||||
16 // length
|
16 // length
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut bytes = Vec::with_capacity(len + 2);
|
let mut bytes = Vec::with_capacity(len + 3);
|
||||||
bytes.push(*field_id);
|
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||||
bytes.push(*level);
|
bytes.push(*level);
|
||||||
bytes.extend_from_slice(&buffer[..len]);
|
bytes.extend_from_slice(&buffer[..len]);
|
||||||
Some(Cow::Owned(bytes))
|
Some(Cow::Owned(bytes))
|
||||||
|
@ -1,14 +1,14 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::str;
|
use std::str;
|
||||||
|
|
||||||
use crate::FieldId;
|
use crate::{try_split_array_at, FieldId};
|
||||||
|
|
||||||
pub struct FacetValueStringCodec;
|
pub struct FacetValueStringCodec;
|
||||||
|
|
||||||
impl FacetValueStringCodec {
|
impl FacetValueStringCodec {
|
||||||
pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
|
pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
|
||||||
out.reserve(value.len() + 1);
|
out.reserve(value.len() + 2);
|
||||||
out.push(field_id);
|
out.extend_from_slice(&field_id.to_be_bytes());
|
||||||
out.extend_from_slice(value.as_bytes());
|
out.extend_from_slice(value.as_bytes());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -17,9 +17,10 @@ impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec {
|
|||||||
type DItem = (FieldId, &'a str);
|
type DItem = (FieldId, &'a str);
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let (field_id, bytes) = bytes.split_first()?;
|
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||||
|
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||||
let value = str::from_utf8(bytes).ok()?;
|
let value = str::from_utf8(bytes).ok()?;
|
||||||
Some((*field_id, value))
|
Some((field_id, value))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ use std::borrow::Cow;
|
|||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
|
|
||||||
use crate::facet::value_encoding::f64_into_bytes;
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
use crate::{DocumentId, FieldId};
|
use crate::{try_split_array_at, DocumentId, FieldId};
|
||||||
|
|
||||||
pub struct FieldDocIdFacetF64Codec;
|
pub struct FieldDocIdFacetF64Codec;
|
||||||
|
|
||||||
@ -10,14 +10,15 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec {
|
|||||||
type DItem = (FieldId, DocumentId, f64);
|
type DItem = (FieldId, DocumentId, f64);
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let (field_id, bytes) = bytes.split_first()?;
|
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||||
|
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||||
|
|
||||||
let (document_id_bytes, bytes) = bytes.split_at(4);
|
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||||
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?;
|
let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?;
|
||||||
|
|
||||||
Some((*field_id, document_id, value))
|
Some((field_id, document_id, value))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -25,8 +26,8 @@ impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec {
|
|||||||
type EItem = (FieldId, DocumentId, f64);
|
type EItem = (FieldId, DocumentId, f64);
|
||||||
|
|
||||||
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8);
|
let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8);
|
||||||
bytes.push(*field_id);
|
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||||
bytes.extend_from_slice(&document_id.to_be_bytes());
|
bytes.extend_from_slice(&document_id.to_be_bytes());
|
||||||
let value_bytes = f64_into_bytes(*value)?;
|
let value_bytes = f64_into_bytes(*value)?;
|
||||||
bytes.extend_from_slice(&value_bytes);
|
bytes.extend_from_slice(&value_bytes);
|
||||||
|
@ -1,8 +1,7 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::convert::TryInto;
|
|
||||||
use std::str;
|
use std::str;
|
||||||
|
|
||||||
use crate::{DocumentId, FieldId};
|
use crate::{try_split_array_at, DocumentId, FieldId};
|
||||||
|
|
||||||
pub struct FieldDocIdFacetStringCodec;
|
pub struct FieldDocIdFacetStringCodec;
|
||||||
|
|
||||||
@ -13,8 +12,8 @@ impl FieldDocIdFacetStringCodec {
|
|||||||
value: &str,
|
value: &str,
|
||||||
out: &mut Vec<u8>,
|
out: &mut Vec<u8>,
|
||||||
) {
|
) {
|
||||||
out.reserve(1 + 4 + value.len());
|
out.reserve(2 + 4 + value.len());
|
||||||
out.push(field_id);
|
out.extend_from_slice(&field_id.to_be_bytes());
|
||||||
out.extend_from_slice(&document_id.to_be_bytes());
|
out.extend_from_slice(&document_id.to_be_bytes());
|
||||||
out.extend_from_slice(value.as_bytes());
|
out.extend_from_slice(value.as_bytes());
|
||||||
}
|
}
|
||||||
@ -24,11 +23,14 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
|
|||||||
type DItem = (FieldId, DocumentId, &'a str);
|
type DItem = (FieldId, DocumentId, &'a str);
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let (field_id, bytes) = bytes.split_first()?;
|
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||||
let (document_id_bytes, bytes) = bytes.split_at(4);
|
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||||
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
|
||||||
|
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||||
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
let value = str::from_utf8(bytes).ok()?;
|
let value = str::from_utf8(bytes).ok()?;
|
||||||
Some((*field_id, document_id, value))
|
Some((field_id, document_id, value))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::convert::TryInto;
|
|
||||||
|
|
||||||
use crate::FieldId;
|
use crate::{try_split_array_at, FieldId};
|
||||||
|
|
||||||
pub struct FieldIdWordCountCodec;
|
pub struct FieldIdWordCountCodec;
|
||||||
|
|
||||||
@ -9,7 +8,9 @@ impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
|
|||||||
type DItem = (FieldId, u8);
|
type DItem = (FieldId, u8);
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?;
|
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||||
|
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||||
|
let ([word_count], _nothing) = try_split_array_at(bytes)?;
|
||||||
Some((field_id, word_count))
|
Some((field_id, word_count))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -18,6 +19,9 @@ impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
|
|||||||
type EItem = (FieldId, u8);
|
type EItem = (FieldId, u8);
|
||||||
|
|
||||||
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
|
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
Some(Cow::Owned(vec![*field_id, *word_count]))
|
let mut bytes = Vec::with_capacity(2 + 1);
|
||||||
|
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
bytes.push(*word_count);
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,19 +1,19 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
use obkv::{KvReader, KvWriter};
|
use obkv::{KvReaderU16, KvWriterU16};
|
||||||
|
|
||||||
pub struct ObkvCodec;
|
pub struct ObkvCodec;
|
||||||
|
|
||||||
impl<'a> heed::BytesDecode<'a> for ObkvCodec {
|
impl<'a> heed::BytesDecode<'a> for ObkvCodec {
|
||||||
type DItem = KvReader<'a>;
|
type DItem = KvReaderU16<'a>;
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
Some(KvReader::new(bytes))
|
Some(KvReaderU16::new(bytes))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl heed::BytesEncode<'_> for ObkvCodec {
|
impl heed::BytesEncode<'_> for ObkvCodec {
|
||||||
type EItem = KvWriter<Vec<u8>>;
|
type EItem = KvWriterU16<Vec<u8>>;
|
||||||
|
|
||||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
item.clone().into_inner().map(Cow::Owned).ok()
|
item.clone().into_inner().map(Cow::Owned).ok()
|
||||||
|
@ -523,10 +523,11 @@ impl Index {
|
|||||||
field_id: FieldId,
|
field_id: FieldId,
|
||||||
docids: &RoaringBitmap,
|
docids: &RoaringBitmap,
|
||||||
) -> heed::Result<()> {
|
) -> heed::Result<()> {
|
||||||
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
|
||||||
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||||
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||||
*buffer.last_mut().unwrap() = field_id;
|
buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
|
||||||
|
.copy_from_slice(&field_id.to_be_bytes());
|
||||||
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
|
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -536,10 +537,11 @@ impl Index {
|
|||||||
rtxn: &RoTxn,
|
rtxn: &RoTxn,
|
||||||
field_id: FieldId,
|
field_id: FieldId,
|
||||||
) -> heed::Result<RoaringBitmap> {
|
) -> heed::Result<RoaringBitmap> {
|
||||||
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
|
||||||
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||||
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||||
*buffer.last_mut().unwrap() = field_id;
|
buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
|
||||||
|
.copy_from_slice(&field_id.to_be_bytes());
|
||||||
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
|
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
|
||||||
Some(docids) => Ok(docids),
|
Some(docids) => Ok(docids),
|
||||||
None => Ok(RoaringBitmap::new()),
|
None => Ok(RoaringBitmap::new()),
|
||||||
@ -553,10 +555,11 @@ impl Index {
|
|||||||
field_id: FieldId,
|
field_id: FieldId,
|
||||||
docids: &RoaringBitmap,
|
docids: &RoaringBitmap,
|
||||||
) -> heed::Result<()> {
|
) -> heed::Result<()> {
|
||||||
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2];
|
||||||
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||||
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||||
*buffer.last_mut().unwrap() = field_id;
|
buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
|
||||||
|
.copy_from_slice(&field_id.to_be_bytes());
|
||||||
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
|
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -569,7 +572,8 @@ impl Index {
|
|||||||
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||||
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||||
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||||
*buffer.last_mut().unwrap() = field_id;
|
buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..]
|
||||||
|
.copy_from_slice(&field_id.to_be_bytes());
|
||||||
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
|
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
|
||||||
Some(docids) => Ok(docids),
|
Some(docids) => Ok(docids),
|
||||||
None => Ok(RoaringBitmap::new()),
|
None => Ok(RoaringBitmap::new()),
|
||||||
@ -723,7 +727,7 @@ impl Index {
|
|||||||
&self,
|
&self,
|
||||||
rtxn: &'t RoTxn,
|
rtxn: &'t RoTxn,
|
||||||
ids: impl IntoIterator<Item = DocumentId>,
|
ids: impl IntoIterator<Item = DocumentId>,
|
||||||
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> {
|
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
|
|
||||||
for id in ids {
|
for id in ids {
|
||||||
@ -741,7 +745,7 @@ impl Index {
|
|||||||
pub fn all_documents<'t>(
|
pub fn all_documents<'t>(
|
||||||
&self,
|
&self,
|
||||||
rtxn: &'t RoTxn,
|
rtxn: &'t RoTxn,
|
||||||
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReader<'t>)>>> {
|
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
|
||||||
Ok(self
|
Ok(self
|
||||||
.documents
|
.documents
|
||||||
.iter(rtxn)?
|
.iter(rtxn)?
|
||||||
|
@ -15,6 +15,7 @@ pub mod update;
|
|||||||
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::{BTreeMap, HashMap};
|
use std::collections::{BTreeMap, HashMap};
|
||||||
|
use std::convert::{TryFrom, TryInto};
|
||||||
use std::hash::BuildHasherDefault;
|
use std::hash::BuildHasherDefault;
|
||||||
use std::result::Result as StdResult;
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
@ -48,7 +49,7 @@ pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
|||||||
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
||||||
pub type Attribute = u32;
|
pub type Attribute = u32;
|
||||||
pub type DocumentId = u32;
|
pub type DocumentId = u32;
|
||||||
pub type FieldId = u8;
|
pub type FieldId = u16;
|
||||||
pub type Position = u32;
|
pub type Position = u32;
|
||||||
pub type FieldDistribution = BTreeMap<String, u64>;
|
pub type FieldDistribution = BTreeMap<String, u64>;
|
||||||
|
|
||||||
@ -58,7 +59,7 @@ type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
|
|||||||
pub fn obkv_to_json(
|
pub fn obkv_to_json(
|
||||||
displayed_fields: &[FieldId],
|
displayed_fields: &[FieldId],
|
||||||
fields_ids_map: &FieldsIdsMap,
|
fields_ids_map: &FieldsIdsMap,
|
||||||
obkv: obkv::KvReader,
|
obkv: obkv::KvReaderU16,
|
||||||
) -> Result<Map<String, Value>> {
|
) -> Result<Map<String, Value>> {
|
||||||
displayed_fields
|
displayed_fields
|
||||||
.iter()
|
.iter()
|
||||||
@ -123,6 +124,26 @@ pub fn json_to_string(value: &Value) -> Option<String> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
|
||||||
|
fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
|
||||||
|
if mid <= slice.len() {
|
||||||
|
Some(slice.split_at(mid))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Divides one slice into an array and the tail at an index,
|
||||||
|
/// returns `None` if `N` is out of bounds.
|
||||||
|
fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
|
||||||
|
where
|
||||||
|
[T; N]: for<'a> TryFrom<&'a [T]>,
|
||||||
|
{
|
||||||
|
let (head, tail) = try_split_at(slice, N)?;
|
||||||
|
let head = head.try_into().ok()?;
|
||||||
|
Some((head, tail))
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
@ -78,7 +78,7 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
K: fmt::Display,
|
K: fmt::Display,
|
||||||
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
|
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
|
||||||
{
|
{
|
||||||
let mut key_buffer = vec![field_id];
|
let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();
|
||||||
|
|
||||||
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
|
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
|
||||||
key_buffer.truncate(1);
|
key_buffer.truncate(1);
|
||||||
@ -157,7 +157,7 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
.index
|
.index
|
||||||
.facet_id_string_docids
|
.facet_id_string_docids
|
||||||
.remap_key_type::<ByteSlice>()
|
.remap_key_type::<ByteSlice>()
|
||||||
.prefix_iter(self.rtxn, &[field_id])?
|
.prefix_iter(self.rtxn, &field_id.to_be_bytes())?
|
||||||
.remap_key_type::<FacetValueStringCodec>();
|
.remap_key_type::<FacetValueStringCodec>();
|
||||||
|
|
||||||
for result in iter {
|
for result in iter {
|
||||||
|
@ -187,7 +187,7 @@ impl<'t> FacetIter<'t> {
|
|||||||
) -> heed::Result<Option<u8>> {
|
) -> heed::Result<Option<u8>> {
|
||||||
let level = db
|
let level = db
|
||||||
.remap_types::<ByteSlice, DecodeIgnore>()
|
.remap_types::<ByteSlice, DecodeIgnore>()
|
||||||
.prefix_iter(rtxn, &[fid][..])?
|
.prefix_iter(rtxn, &fid.to_be_bytes())?
|
||||||
.remap_key_type::<FacetLevelValueF64Codec>()
|
.remap_key_type::<FacetLevelValueF64Codec>()
|
||||||
.last()
|
.last()
|
||||||
.transpose()?
|
.transpose()?
|
||||||
|
@ -430,8 +430,10 @@ where
|
|||||||
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
|
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
|
||||||
F: Fn(K) -> DocumentId,
|
F: Fn(K) -> DocumentId,
|
||||||
{
|
{
|
||||||
let mut iter =
|
let mut iter = db
|
||||||
db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>();
|
.remap_key_type::<ByteSlice>()
|
||||||
|
.prefix_iter_mut(wtxn, &field_id.to_be_bytes())?
|
||||||
|
.remap_key_type::<C>();
|
||||||
|
|
||||||
while let Some(result) = iter.next() {
|
while let Some(result) = iter.next() {
|
||||||
let (key, ()) = result?;
|
let (key, ()) = result?;
|
||||||
|
@ -15,7 +15,7 @@ use crate::heed_codec::CboRoaringBitmapCodec;
|
|||||||
use crate::update::index_documents::{
|
use crate::update::index_documents::{
|
||||||
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
|
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
|
||||||
};
|
};
|
||||||
use crate::{Index, Result};
|
use crate::{FieldId, Index, Result};
|
||||||
|
|
||||||
pub struct Facets<'t, 'u, 'i> {
|
pub struct Facets<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
@ -119,7 +119,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
|||||||
fn clear_field_number_levels<'t>(
|
fn clear_field_number_levels<'t>(
|
||||||
wtxn: &'t mut heed::RwTxn,
|
wtxn: &'t mut heed::RwTxn,
|
||||||
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||||
field_id: u8,
|
field_id: FieldId,
|
||||||
) -> heed::Result<()> {
|
) -> heed::Result<()> {
|
||||||
let left = (field_id, 1, f64::MIN, f64::MIN);
|
let left = (field_id, 1, f64::MIN, f64::MIN);
|
||||||
let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
|
let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
|
||||||
@ -135,11 +135,11 @@ fn compute_facet_number_levels<'t>(
|
|||||||
shrink_size: Option<u64>,
|
shrink_size: Option<u64>,
|
||||||
level_group_size: NonZeroUsize,
|
level_group_size: NonZeroUsize,
|
||||||
min_level_size: NonZeroUsize,
|
min_level_size: NonZeroUsize,
|
||||||
field_id: u8,
|
field_id: FieldId,
|
||||||
) -> Result<Reader<FileFuse>> {
|
) -> Result<Reader<FileFuse>> {
|
||||||
let first_level_size = db
|
let first_level_size = db
|
||||||
.remap_key_type::<ByteSlice>()
|
.remap_key_type::<ByteSlice>()
|
||||||
.prefix_iter(rtxn, &[field_id])?
|
.prefix_iter(rtxn, &field_id.to_be_bytes())?
|
||||||
.remap_types::<DecodeIgnore, DecodeIgnore>()
|
.remap_types::<DecodeIgnore, DecodeIgnore>()
|
||||||
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
||||||
|
|
||||||
@ -196,11 +196,11 @@ fn compute_facet_number_levels<'t>(
|
|||||||
fn compute_faceted_documents_ids(
|
fn compute_faceted_documents_ids(
|
||||||
rtxn: &heed::RoTxn,
|
rtxn: &heed::RoTxn,
|
||||||
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||||
field_id: u8,
|
field_id: FieldId,
|
||||||
) -> Result<RoaringBitmap> {
|
) -> Result<RoaringBitmap> {
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
|
|
||||||
for result in db.prefix_iter(rtxn, &[field_id])? {
|
for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
|
||||||
let (_key, docids) = result?;
|
let (_key, docids) = result?;
|
||||||
documents_ids |= docids;
|
documents_ids |= docids;
|
||||||
}
|
}
|
||||||
@ -210,7 +210,7 @@ fn compute_faceted_documents_ids(
|
|||||||
|
|
||||||
fn write_number_entry(
|
fn write_number_entry(
|
||||||
writer: &mut Writer<File>,
|
writer: &mut Writer<File>,
|
||||||
field_id: u8,
|
field_id: FieldId,
|
||||||
level: u8,
|
level: u8,
|
||||||
left: f64,
|
left: f64,
|
||||||
right: f64,
|
right: f64,
|
||||||
|
@ -40,7 +40,7 @@ pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
|||||||
Ok(values.first().unwrap().to_vec())
|
Ok(values.first().unwrap().to_vec())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mut Vec<u8>) {
|
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
|
||||||
use itertools::merge_join_by;
|
use itertools::merge_join_by;
|
||||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
|
@ -842,10 +842,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
|
||||||
|
use big_s::S;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::update::DeleteDocuments;
|
use crate::update::DeleteDocuments;
|
||||||
|
use crate::HashMap;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_document_replacement() {
|
fn simple_document_replacement() {
|
||||||
@ -1352,4 +1354,30 @@ mod tests {
|
|||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn index_more_than_256_fields() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
let mut big_object = HashMap::new();
|
||||||
|
big_object.insert(S("id"), "wow");
|
||||||
|
for i in 0..1000 {
|
||||||
|
let key = i.to_string();
|
||||||
|
big_object.insert(key, "I am a text!");
|
||||||
|
}
|
||||||
|
|
||||||
|
let content = vec![big_object];
|
||||||
|
let content = serde_json::to_string(&content).unwrap();
|
||||||
|
|
||||||
|
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||||
|
builder.update_format(UpdateFormat::Json);
|
||||||
|
builder.execute(Cursor::new(content), |_, _| ()).unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,6 +7,7 @@ use std::time::Instant;
|
|||||||
use std::{cmp, iter};
|
use std::{cmp, iter};
|
||||||
|
|
||||||
use bstr::ByteSlice as _;
|
use bstr::ByteSlice as _;
|
||||||
|
use concat_arrays::concat_arrays;
|
||||||
use fst::Set;
|
use fst::Set;
|
||||||
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
|
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
|
||||||
use heed::BytesEncode;
|
use heed::BytesEncode;
|
||||||
@ -776,7 +777,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
for ((fid, count), docids) in self.field_id_word_count_docids {
|
for ((fid, count), docids) in self.field_id_word_count_docids {
|
||||||
docids_buffer.clear();
|
docids_buffer.clear();
|
||||||
CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer);
|
CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer);
|
||||||
self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?;
|
let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]);
|
||||||
|
self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let fst = builder.into_set();
|
let fst = builder.into_set();
|
||||||
|
@ -626,7 +626,7 @@ mod test {
|
|||||||
Some("tata".to_string()),
|
Some("tata".to_string()),
|
||||||
false,
|
false,
|
||||||
);
|
);
|
||||||
assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
|
assert_eq!(result.unwrap(), (0, "toto".to_string()));
|
||||||
assert_eq!(fields_map.len(), 1);
|
assert_eq!(fields_map.len(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -635,7 +635,7 @@ mod test {
|
|||||||
let mut fields_map = FieldsIdsMap::new();
|
let mut fields_map = FieldsIdsMap::new();
|
||||||
let result =
|
let result =
|
||||||
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
|
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
|
||||||
assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
|
assert_eq!(result.unwrap(), (0, "tata".to_string()));
|
||||||
assert_eq!(fields_map.len(), 1);
|
assert_eq!(fields_map.len(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -643,7 +643,7 @@ mod test {
|
|||||||
fn should_return_default_if_both_are_none() {
|
fn should_return_default_if_both_are_none() {
|
||||||
let mut fields_map = FieldsIdsMap::new();
|
let mut fields_map = FieldsIdsMap::new();
|
||||||
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
|
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
|
||||||
assert_eq!(result.unwrap(), (0u8, "id".to_string()));
|
assert_eq!(result.unwrap(), (0, "id".to_string()));
|
||||||
assert_eq!(fields_map.len(), 1);
|
assert_eq!(fields_map.len(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user