5145: Use bumparaw-collections in Meilisearch/milli r=dureuill a=Kerollmops

This PR is related to #5078. It uses the now-published `bumparaw-collections` crate and (soon) makes the `RawMap` hasher non-random.

Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-12-10 15:51:01 +00:00 committed by GitHub
commit e974be9518
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 124 additions and 93 deletions

33
Cargo.lock generated
View File

@ -706,6 +706,20 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "bumparaw-collections"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ce682bdc86c2e25ef5cd95881d9d6a1902214eddf74cf9ffea88fe1464377e8"
dependencies = [
"allocator-api2",
"bitpacking",
"bumpalo",
"hashbrown 0.15.1",
"serde",
"serde_json",
]
[[package]] [[package]]
name = "byte-unit" name = "byte-unit"
version = "5.1.4" version = "5.1.4"
@ -2617,6 +2631,7 @@ dependencies = [
"big_s", "big_s",
"bincode", "bincode",
"bumpalo", "bumpalo",
"bumparaw-collections",
"crossbeam-channel", "crossbeam-channel",
"csv", "csv",
"derive_builder 0.20.0", "derive_builder 0.20.0",
@ -2631,7 +2646,6 @@ dependencies = [
"meilisearch-types", "meilisearch-types",
"memmap2", "memmap2",
"page_size", "page_size",
"raw-collections",
"rayon", "rayon",
"roaring", "roaring",
"serde", "serde",
@ -3549,6 +3563,7 @@ dependencies = [
"actix-web", "actix-web",
"anyhow", "anyhow",
"bumpalo", "bumpalo",
"bumparaw-collections",
"convert_case 0.6.0", "convert_case 0.6.0",
"csv", "csv",
"deserr", "deserr",
@ -3561,7 +3576,6 @@ dependencies = [
"meili-snap", "meili-snap",
"memmap2", "memmap2",
"milli", "milli",
"raw-collections",
"roaring", "roaring",
"serde", "serde",
"serde-cs", "serde-cs",
@ -3618,6 +3632,7 @@ dependencies = [
"bincode", "bincode",
"bstr", "bstr",
"bumpalo", "bumpalo",
"bumparaw-collections",
"bytemuck", "bytemuck",
"byteorder", "byteorder",
"candle-core", "candle-core",
@ -3656,7 +3671,6 @@ dependencies = [
"once_cell", "once_cell",
"ordered-float", "ordered-float",
"rand", "rand",
"raw-collections",
"rayon", "rayon",
"rayon-par-bridge", "rayon-par-bridge",
"rhai", "rhai",
@ -4487,19 +4501,6 @@ dependencies = [
"rand", "rand",
] ]
[[package]]
name = "raw-collections"
version = "0.1.0"
source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a"
dependencies = [
"allocator-api2",
"bitpacking",
"bumpalo",
"hashbrown 0.15.1",
"serde",
"serde_json",
]
[[package]] [[package]]
name = "raw-cpuid" name = "raw-cpuid"
version = "10.7.0" version = "10.7.0"

View File

@ -13,6 +13,8 @@ license.workspace = true
[dependencies] [dependencies]
anyhow = "1.0.86" anyhow = "1.0.86"
bincode = "1.3.3" bincode = "1.3.3"
bumpalo = "3.16.0"
bumparaw-collections = "0.1.2"
csv = "1.3.0" csv = "1.3.0"
derive_builder = "0.20.0" derive_builder = "0.20.0"
dump = { path = "../dump" } dump = { path = "../dump" }
@ -21,8 +23,8 @@ file-store = { path = "../file-store" }
flate2 = "1.0.30" flate2 = "1.0.30"
meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" } meilisearch-types = { path = "../meilisearch-types" }
memmap2 = "0.9.4"
page_size = "0.6.0" page_size = "0.6.0"
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
rayon = "1.10.0" rayon = "1.10.0"
roaring = { version = "0.10.7", features = ["serde"] } roaring = { version = "0.10.7", features = ["serde"] }
serde = { version = "1.0.204", features = ["derive"] } serde = { version = "1.0.204", features = ["derive"] }
@ -30,7 +32,6 @@ serde_json = { version = "1.0.120", features = ["preserve_order"] }
synchronoise = "1.0.1" synchronoise = "1.0.1"
tempfile = "3.10.1" tempfile = "3.10.1"
thiserror = "1.0.61" thiserror = "1.0.61"
memmap2 = "0.9.4"
time = { version = "0.3.36", features = [ time = { version = "0.3.36", features = [
"serde-well-known", "serde-well-known",
"formatting", "formatting",
@ -40,7 +41,6 @@ time = { version = "0.3.36", features = [
tracing = "0.1.40" tracing = "0.1.40"
ureq = "2.10.0" ureq = "2.10.0"
uuid = { version = "1.10.0", features = ["serde", "v4"] } uuid = { version = "1.10.0", features = ["serde", "v4"] }
bumpalo = "3.16.0"
[dev-dependencies] [dev-dependencies]
arroy = "0.5.0" arroy = "0.5.0"

View File

@ -24,7 +24,7 @@ flate2 = "1.0.30"
fst = "0.4.7" fst = "0.4.7"
memmap2 = "0.9.4" memmap2 = "0.9.4"
milli = { path = "../milli" } milli = { path = "../milli" }
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } bumparaw-collections = "0.1.2"
roaring = { version = "0.10.7", features = ["serde"] } roaring = { version = "0.10.7", features = ["serde"] }
serde = { version = "1.0.204", features = ["derive"] } serde = { version = "1.0.204", features = ["derive"] }
serde-cs = "0.2.4" serde-cs = "0.2.4"

View File

@ -4,10 +4,10 @@ use std::io::{self, BufWriter};
use std::marker::PhantomData; use std::marker::PhantomData;
use bumpalo::Bump; use bumpalo::Bump;
use bumparaw_collections::RawMap;
use memmap2::Mmap; use memmap2::Mmap;
use milli::documents::Error; use milli::documents::Error;
use milli::Object; use milli::Object;
use raw_collections::RawMap;
use serde::de::{SeqAccess, Visitor}; use serde::de::{SeqAccess, Visitor};
use serde::{Deserialize, Deserializer}; use serde::{Deserialize, Deserializer};
use serde_json::error::Category; use serde_json::error::Category;

View File

@ -91,8 +91,8 @@ ureq = { version = "2.10.0", features = ["json"] }
url = "2.5.2" url = "2.5.2"
rayon-par-bridge = "0.1.0" rayon-par-bridge = "0.1.0"
hashbrown = "0.15.0" hashbrown = "0.15.0"
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
bumpalo = "3.16.0" bumpalo = "3.16.0"
bumparaw-collections = "0.1.2"
thread_local = "1.1.8" thread_local = "1.1.8"
allocator-api2 = "0.2.18" allocator-api2 = "0.2.18"
rustc-hash = "2.0.0" rustc-hash = "2.0.0"

View File

@ -3,12 +3,13 @@ use std::collections::BTreeMap;
use std::fmt::{self, Debug}; use std::fmt::{self, Debug};
use bumpalo::Bump; use bumpalo::Bump;
use bumparaw_collections::{RawMap, RawVec, Value};
use liquid::model::{ use liquid::model::{
ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State, ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State,
Value as LiquidValue, Value as LiquidValue,
}; };
use liquid::{ObjectView, ValueView}; use liquid::{ObjectView, ValueView};
use raw_collections::{RawMap, RawVec}; use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue; use serde_json::value::RawValue;
use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd};
@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc
} }
impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> {
fn as_debug(&self) -> &dyn fmt::Debug { fn as_debug(&self) -> &dyn Debug {
self self
} }
fn render(&self) -> liquid::model::DisplayCow<'_> { fn render(&self) -> liquid::model::DisplayCow<'_> {
@ -243,14 +244,13 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc,
} }
} }
#[derive(Debug)]
struct ParseableValue<'doc> { struct ParseableValue<'doc> {
value: raw_collections::Value<'doc>, value: Value<'doc, FxBuildHasher>,
} }
impl<'doc> ParseableValue<'doc> { impl<'doc> ParseableValue<'doc> {
pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self { pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self {
let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap(); let value = Value::from_raw_value_and_hasher(value, FxBuildHasher, doc_alloc).unwrap();
Self { value } Self { value }
} }
@ -260,19 +260,19 @@ impl<'doc> ParseableValue<'doc> {
} }
// transparent newtype for implementing ValueView // transparent newtype for implementing ValueView
#[repr(transparent)]
#[derive(Debug)] #[derive(Debug)]
struct ParseableMap<'doc>(RawMap<'doc>); #[repr(transparent)]
struct ParseableMap<'doc>(RawMap<'doc, FxBuildHasher>);
// transparent newtype for implementing ValueView // transparent newtype for implementing ValueView
#[repr(transparent)]
#[derive(Debug)] #[derive(Debug)]
#[repr(transparent)]
struct ParseableArray<'doc>(RawVec<'doc>); struct ParseableArray<'doc>(RawVec<'doc>);
impl<'doc> ParseableMap<'doc> { impl<'doc> ParseableMap<'doc> {
pub fn as_parseable<'a>(map: &'a RawMap<'doc>) -> &'a ParseableMap<'doc> { pub fn as_parseable<'a>(map: &'a RawMap<'doc, FxBuildHasher>) -> &'a ParseableMap<'doc> {
// SAFETY: repr(transparent) // SAFETY: repr(transparent)
unsafe { &*(map as *const RawMap as *const Self) } unsafe { &*(map as *const RawMap<FxBuildHasher> as *const Self) }
} }
} }
@ -447,8 +447,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
} }
fn render(&self) -> DisplayCow<'_> { fn render(&self) -> DisplayCow<'_> {
use raw_collections::value::Number; use bumparaw_collections::value::Number;
use raw_collections::Value; use bumparaw_collections::Value;
match &self.value { match &self.value {
Value::Null => LiquidValue::Nil.render(), Value::Null => LiquidValue::Nil.render(),
Value::Bool(v) => v.render(), Value::Bool(v) => v.render(),
@ -464,8 +465,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
} }
fn source(&self) -> DisplayCow<'_> { fn source(&self) -> DisplayCow<'_> {
use raw_collections::value::Number; use bumparaw_collections::value::Number;
use raw_collections::Value; use bumparaw_collections::Value;
match &self.value { match &self.value {
Value::Null => LiquidValue::Nil.source(), Value::Null => LiquidValue::Nil.source(),
Value::Bool(v) => ValueView::source(v), Value::Bool(v) => ValueView::source(v),
@ -481,8 +483,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
} }
fn type_name(&self) -> &'static str { fn type_name(&self) -> &'static str {
use raw_collections::value::Number; use bumparaw_collections::value::Number;
use raw_collections::Value; use bumparaw_collections::Value;
match &self.value { match &self.value {
Value::Null => LiquidValue::Nil.type_name(), Value::Null => LiquidValue::Nil.type_name(),
Value::Bool(v) => v.type_name(), Value::Bool(v) => v.type_name(),
@ -498,7 +501,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
} }
fn query_state(&self, state: State) -> bool { fn query_state(&self, state: State) -> bool {
use raw_collections::Value; use bumparaw_collections::Value;
match &self.value { match &self.value {
Value::Null => ValueView::query_state(&LiquidValue::Nil, state), Value::Null => ValueView::query_state(&LiquidValue::Nil, state),
Value::Bool(v) => ValueView::query_state(v, state), Value::Bool(v) => ValueView::query_state(v, state),
@ -515,7 +519,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
} }
fn to_kstr(&self) -> KStringCow<'_> { fn to_kstr(&self) -> KStringCow<'_> {
use raw_collections::Value; use bumparaw_collections::Value;
match &self.value { match &self.value {
Value::Null => ValueView::to_kstr(&LiquidValue::Nil), Value::Null => ValueView::to_kstr(&LiquidValue::Nil),
Value::Bool(v) => ValueView::to_kstr(v), Value::Bool(v) => ValueView::to_kstr(v),
@ -527,12 +532,14 @@ impl<'doc> ValueView for ParseableValue<'doc> {
} }
fn to_value(&self) -> LiquidValue { fn to_value(&self) -> LiquidValue {
use raw_collections::Value; use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
match &self.value { match &self.value {
Value::Null => LiquidValue::Nil, Value::Null => LiquidValue::Nil,
Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)), Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)),
Value::Number(number) => match number { Value::Number(number) => match number {
raw_collections::value::Number::PosInt(number) => { Number::PosInt(number) => {
let number: i64 = match (*number).try_into() { let number: i64 = match (*number).try_into() {
Ok(number) => number, Ok(number) => number,
Err(_) => { Err(_) => {
@ -541,12 +548,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}; };
LiquidValue::Scalar(ScalarCow::new(number)) LiquidValue::Scalar(ScalarCow::new(number))
} }
raw_collections::value::Number::NegInt(number) => { Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)),
LiquidValue::Scalar(ScalarCow::new(*number)) Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)),
}
raw_collections::value::Number::Finite(number) => {
LiquidValue::Scalar(ScalarCow::new(*number))
}
}, },
Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())), Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())),
Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(), Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(),
@ -555,8 +558,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
} }
fn as_scalar(&self) -> Option<liquid::model::ScalarCow<'_>> { fn as_scalar(&self) -> Option<liquid::model::ScalarCow<'_>> {
use raw_collections::value::Number; use bumparaw_collections::value::Number;
use raw_collections::Value; use bumparaw_collections::Value;
match &self.value { match &self.value {
Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)), Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)),
Value::Number(number) => match number { Value::Number(number) => match number {
@ -576,34 +580,41 @@ impl<'doc> ValueView for ParseableValue<'doc> {
} }
fn is_scalar(&self) -> bool { fn is_scalar(&self) -> bool {
use raw_collections::Value; use bumparaw_collections::Value;
matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_)) matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_))
} }
fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> { fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> {
if let raw_collections::Value::Array(array) = &self.value { if let Value::Array(array) = &self.value {
return Some(ParseableArray::as_parseable(array) as _); return Some(ParseableArray::as_parseable(array) as _);
} }
None None
} }
fn is_array(&self) -> bool { fn is_array(&self) -> bool {
matches!(&self.value, raw_collections::Value::Array(_)) matches!(&self.value, bumparaw_collections::Value::Array(_))
} }
fn as_object(&self) -> Option<&dyn ObjectView> { fn as_object(&self) -> Option<&dyn ObjectView> {
if let raw_collections::Value::Object(object) = &self.value { if let Value::Object(object) = &self.value {
return Some(ParseableMap::as_parseable(object) as _); return Some(ParseableMap::as_parseable(object) as _);
} }
None None
} }
fn is_object(&self) -> bool { fn is_object(&self) -> bool {
matches!(&self.value, raw_collections::Value::Object(_)) matches!(&self.value, bumparaw_collections::Value::Object(_))
} }
fn is_nil(&self) -> bool { fn is_nil(&self) -> bool {
matches!(&self.value, raw_collections::Value::Null) matches!(&self.value, bumparaw_collections::Value::Null)
}
}
impl Debug for ParseableValue<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("ParseableValue").field("value", &self.value).finish()
} }
} }

View File

@ -1,7 +1,8 @@
use std::collections::{BTreeMap, BTreeSet}; use std::collections::{BTreeMap, BTreeSet};
use bumparaw_collections::RawMap;
use heed::RoTxn; use heed::RoTxn;
use raw_collections::RawMap; use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue; use serde_json::value::RawValue;
use super::vector_document::VectorDocument; use super::vector_document::VectorDocument;
@ -385,12 +386,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue);
#[derive(Debug)] #[derive(Debug)]
pub struct Versions<'doc> { pub struct Versions<'doc> {
data: RawMap<'doc>, data: RawMap<'doc, FxBuildHasher>,
} }
impl<'doc> Versions<'doc> { impl<'doc> Versions<'doc> {
pub fn multiple( pub fn multiple(
mut versions: impl Iterator<Item = Result<RawMap<'doc>>>, mut versions: impl Iterator<Item = Result<RawMap<'doc, FxBuildHasher>>>,
) -> Result<Option<Self>> { ) -> Result<Option<Self>> {
let Some(data) = versions.next() else { return Ok(None) }; let Some(data) = versions.next() else { return Ok(None) };
let mut data = data?; let mut data = data?;
@ -403,7 +404,7 @@ impl<'doc> Versions<'doc> {
Ok(Some(Self::single(data))) Ok(Some(Self::single(data)))
} }
pub fn single(version: RawMap<'doc>) -> Self { pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self {
Self { data: version } Self { data: version }
} }

View File

@ -69,12 +69,12 @@ use std::io::BufReader;
use std::{io, iter, mem}; use std::{io, iter, mem};
use bumpalo::Bump; use bumpalo::Bump;
use bumparaw_collections::bbbul::{BitPacker, BitPacker4x};
use bumparaw_collections::map::FrozenMap;
use bumparaw_collections::{Bbbul, FrozenBbbul};
use grenad::ReaderCursor; use grenad::ReaderCursor;
use hashbrown::hash_map::RawEntryMut; use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap; use hashbrown::HashMap;
use raw_collections::bbbul::{BitPacker, BitPacker4x};
use raw_collections::map::FrozenMap;
use raw_collections::{Bbbul, FrozenBbbul};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use rustc_hash::FxBuildHasher; use rustc_hash::FxBuildHasher;

View File

@ -176,9 +176,10 @@ pub fn tokenizer_builder<'a>(
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use bumpalo::Bump; use bumpalo::Bump;
use bumparaw_collections::RawMap;
use charabia::TokenizerBuilder; use charabia::TokenizerBuilder;
use meili_snap::snapshot; use meili_snap::snapshot;
use raw_collections::RawMap; use rustc_hash::FxBuildHasher;
use serde_json::json; use serde_json::json;
use serde_json::value::RawValue; use serde_json::value::RawValue;
@ -234,7 +235,7 @@ mod test {
let bump = Bump::new(); let bump = Bump::new();
let document: &RawValue = serde_json::from_str(&document).unwrap(); let document: &RawValue = serde_json::from_str(&document).unwrap();
let document = RawMap::from_raw_value(document, &bump).unwrap(); let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap();
let document = Versions::single(document); let document = Versions::single(document);
let document = DocumentFromVersions::new(&document); let document = DocumentFromVersions::new(&document);

View File

@ -1,6 +1,8 @@
use std::ops::ControlFlow; use std::ops::ControlFlow;
use bumpalo::Bump; use bumpalo::Bump;
use bumparaw_collections::RawVec;
use rustc_hash::FxBuildHasher;
use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde::de::{DeserializeSeed, Deserializer as _, Visitor};
use serde_json::value::RawValue; use serde_json::value::RawValue;
@ -360,7 +362,7 @@ impl<'a> DeserrRawValue<'a> {
} }
pub struct DeserrRawVec<'a> { pub struct DeserrRawVec<'a> {
vec: raw_collections::RawVec<'a>, vec: RawVec<'a>,
alloc: &'a Bump, alloc: &'a Bump,
} }
@ -379,7 +381,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> {
} }
pub struct DeserrRawVecIter<'a> { pub struct DeserrRawVecIter<'a> {
it: raw_collections::vec::iter::IntoIter<'a>, it: bumparaw_collections::vec::iter::IntoIter<'a>,
alloc: &'a Bump, alloc: &'a Bump,
} }
@ -393,7 +395,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> {
} }
pub struct DeserrRawMap<'a> { pub struct DeserrRawMap<'a> {
map: raw_collections::RawMap<'a>, map: bumparaw_collections::RawMap<'a, FxBuildHasher>,
alloc: &'a Bump, alloc: &'a Bump,
} }
@ -416,7 +418,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> {
} }
pub struct DeserrRawMapIter<'a> { pub struct DeserrRawMapIter<'a> {
it: raw_collections::map::iter::IntoIter<'a>, it: bumparaw_collections::map::iter::IntoIter<'a>,
alloc: &'a Bump, alloc: &'a Bump,
} }
@ -615,7 +617,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> {
where where
A: serde::de::SeqAccess<'de>, A: serde::de::SeqAccess<'de>,
{ {
let mut raw_vec = raw_collections::RawVec::new_in(self.alloc); let mut raw_vec = RawVec::new_in(self.alloc);
while let Some(next) = seq.next_element()? { while let Some(next) = seq.next_element()? {
raw_vec.push(next); raw_vec.push(next);
} }

View File

@ -1,10 +1,11 @@
use bumpalo::collections::CollectIn; use bumpalo::collections::CollectIn;
use bumpalo::Bump; use bumpalo::Bump;
use bumparaw_collections::RawMap;
use hashbrown::hash_map::Entry; use hashbrown::hash_map::Entry;
use heed::RoTxn; use heed::RoTxn;
use memmap2::Mmap; use memmap2::Mmap;
use raw_collections::RawMap;
use rayon::slice::ParallelSlice; use rayon::slice::ParallelSlice;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue; use serde_json::value::RawValue;
use serde_json::Deserializer; use serde_json::Deserializer;
@ -166,8 +167,9 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
// Only guess the primary key if it is the first document // Only guess the primary key if it is the first document
let retrieved_primary_key = if previous_offset == 0 { let retrieved_primary_key = if previous_offset == 0 {
let doc = let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer)
RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?; .map(Some)
.map_err(UserError::SerdeJson)?;
let result = retrieve_or_guess_primary_key( let result = retrieve_or_guess_primary_key(
rtxn, rtxn,
@ -545,8 +547,9 @@ impl MergeChanges for MergeDocumentForReplacement {
match operations.last() { match operations.last() {
Some(InnerDocOp::Addition(DocumentOffset { content })) => { Some(InnerDocOp::Addition(DocumentOffset { content })) => {
let document = serde_json::from_slice(content).unwrap(); let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) let document =
.map_err(UserError::SerdeJson)?; RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
if is_new { if is_new {
Ok(Some(DocumentChange::Insertion(Insertion::create( Ok(Some(DocumentChange::Insertion(Insertion::create(
@ -632,8 +635,9 @@ impl MergeChanges for MergeDocumentForUpdates {
} }
}; };
let document = serde_json::from_slice(content).unwrap(); let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) let document =
.map_err(UserError::SerdeJson)?; RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
Some(Versions::single(document)) Some(Versions::single(document))
} }
@ -647,8 +651,9 @@ impl MergeChanges for MergeDocumentForUpdates {
}; };
let document = serde_json::from_slice(content).unwrap(); let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) let document =
.map_err(UserError::SerdeJson)?; RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
Ok(document) Ok(document)
}); });
Versions::multiple(versions)? Versions::multiple(versions)?

View File

@ -4,6 +4,7 @@ use std::sync::{OnceLock, RwLock};
use std::thread::{self, Builder}; use std::thread::{self, Builder};
use big_s::S; use big_s::S;
use bumparaw_collections::RawMap;
use document_changes::{extract, DocumentChanges, IndexingContext, Progress}; use document_changes::{extract, DocumentChanges, IndexingContext, Progress};
pub use document_deletion::DocumentDeletion; pub use document_deletion::DocumentDeletion;
pub use document_operation::{DocumentOperation, PayloadStats}; pub use document_operation::{DocumentOperation, PayloadStats};
@ -13,7 +14,7 @@ use heed::{RoTxn, RwTxn};
use itertools::{merge_join_by, EitherOrBoth}; use itertools::{merge_join_by, EitherOrBoth};
pub use partial_dump::PartialDump; pub use partial_dump::PartialDump;
use rand::SeedableRng as _; use rand::SeedableRng as _;
use raw_collections::RawMap; use rustc_hash::FxBuildHasher;
use time::OffsetDateTime; use time::OffsetDateTime;
pub use update_by_function::UpdateByFunction; pub use update_by_function::UpdateByFunction;
@ -776,7 +777,7 @@ pub fn retrieve_or_guess_primary_key<'a>(
index: &Index, index: &Index,
new_fields_ids_map: &mut FieldsIdsMap, new_fields_ids_map: &mut FieldsIdsMap,
primary_key_from_op: Option<&'a str>, primary_key_from_op: Option<&'a str>,
first_document: Option<RawMap<'a>>, first_document: Option<RawMap<'a, FxBuildHasher>>,
) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> { ) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> {
// make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it.

View File

@ -1,6 +1,8 @@
use std::ops::DerefMut; use std::ops::DerefMut;
use bumparaw_collections::RawMap;
use rayon::iter::IndexedParallelIterator; use rayon::iter::IndexedParallelIterator;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue; use serde_json::value::RawValue;
use super::document_changes::{DocumentChangeContext, DocumentChanges}; use super::document_changes::{DocumentChangeContext, DocumentChanges};
@ -75,7 +77,7 @@ where
self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?;
let external_document_id = external_document_id.to_de(); let external_document_id = external_document_id.to_de();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(InternalError::SerdeJson)?; .map_err(InternalError::SerdeJson)?;
let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); let insertion = Insertion::create(docid, external_document_id, Versions::single(document));

View File

@ -1,8 +1,9 @@
use raw_collections::RawMap; use bumparaw_collections::RawMap;
use rayon::iter::IndexedParallelIterator; use rayon::iter::IndexedParallelIterator;
use rayon::slice::ParallelSlice as _; use rayon::slice::ParallelSlice as _;
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use rustc_hash::FxBuildHasher;
use super::document_changes::DocumentChangeContext; use super::document_changes::DocumentChangeContext;
use super::DocumentChanges; use super::DocumentChanges;
@ -160,8 +161,12 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
if document_id != new_document_id { if document_id != new_document_id {
Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey)) Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey))
} else { } else {
let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) let raw_new_doc = RawMap::from_raw_value_and_hasher(
.map_err(InternalError::SerdeJson)?; raw_new_doc,
FxBuildHasher,
doc_alloc,
)
.map_err(InternalError::SerdeJson)?;
Ok(Some(DocumentChange::Update(Update::create( Ok(Some(DocumentChange::Update(Update::create(
docid, docid,

View File

@ -1,9 +1,10 @@
use std::collections::BTreeSet; use std::collections::BTreeSet;
use bumpalo::Bump; use bumpalo::Bump;
use bumparaw_collections::RawMap;
use deserr::{Deserr, IntoValue}; use deserr::{Deserr, IntoValue};
use heed::RoTxn; use heed::RoTxn;
use raw_collections::RawMap; use rustc_hash::FxBuildHasher;
use serde::Serialize; use serde::Serialize;
use serde_json::value::RawValue; use serde_json::value::RawValue;
@ -84,7 +85,7 @@ pub struct VectorDocumentFromDb<'t> {
docid: DocumentId, docid: DocumentId,
embedding_config: Vec<IndexEmbeddingConfig>, embedding_config: Vec<IndexEmbeddingConfig>,
index: &'t Index, index: &'t Index,
vectors_field: Option<RawMap<'t>>, vectors_field: Option<RawMap<'t, FxBuildHasher>>,
rtxn: &'t RoTxn<'t>, rtxn: &'t RoTxn<'t>,
doc_alloc: &'t Bump, doc_alloc: &'t Bump,
} }
@ -102,9 +103,10 @@ impl<'t> VectorDocumentFromDb<'t> {
}; };
let vectors = document.vectors_field()?; let vectors = document.vectors_field()?;
let vectors_field = match vectors { let vectors_field = match vectors {
Some(vectors) => { Some(vectors) => Some(
Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?) RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc)
} .map_err(InternalError::SerdeJson)?,
),
None => None, None => None,
}; };
@ -220,7 +222,7 @@ fn entry_from_raw_value(
pub struct VectorDocumentFromVersions<'doc> { pub struct VectorDocumentFromVersions<'doc> {
external_document_id: &'doc str, external_document_id: &'doc str,
vectors: RawMap<'doc>, vectors: RawMap<'doc, FxBuildHasher>,
embedders: &'doc EmbeddingConfigs, embedders: &'doc EmbeddingConfigs,
} }
@ -233,8 +235,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
) -> Result<Option<Self>> { ) -> Result<Option<Self>> {
let document = DocumentFromVersions::new(versions); let document = DocumentFromVersions::new(versions);
if let Some(vectors_field) = document.vectors_field()? { if let Some(vectors_field) = document.vectors_field()? {
let vectors = let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump)
RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; .map_err(UserError::SerdeJson)?;
Ok(Some(Self { external_document_id, vectors, embedders })) Ok(Some(Self { external_document_id, vectors, embedders }))
} else { } else {
Ok(None) Ok(None)