Use bumparaw-collections in Meilisearch/milli

This commit is contained in:
Kerollmops 2024-12-10 11:12:27 +01:00
parent 1995040846
commit 89637bcaaf
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
15 changed files with 78 additions and 70 deletions

33
Cargo.lock generated
View File

@ -706,6 +706,20 @@ dependencies = [
"serde",
]
[[package]]
name = "bumparaw-collections"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7495aa71334069997d1b4ff536a4a01542981774a1654d4dfb00f29db3aedcef"
dependencies = [
"allocator-api2",
"bitpacking",
"bumpalo",
"hashbrown 0.15.1",
"serde",
"serde_json",
]
[[package]]
name = "byte-unit"
version = "5.1.4"
@ -2617,6 +2631,7 @@ dependencies = [
"big_s",
"bincode",
"bumpalo",
"bumparaw-collections",
"crossbeam-channel",
"csv",
"derive_builder 0.20.0",
@ -2631,7 +2646,6 @@ dependencies = [
"meilisearch-types",
"memmap2",
"page_size",
"raw-collections",
"rayon",
"roaring",
"serde",
@ -3549,6 +3563,7 @@ dependencies = [
"actix-web",
"anyhow",
"bumpalo",
"bumparaw-collections",
"convert_case 0.6.0",
"csv",
"deserr",
@ -3561,7 +3576,6 @@ dependencies = [
"meili-snap",
"memmap2",
"milli",
"raw-collections",
"roaring",
"serde",
"serde-cs",
@ -3618,6 +3632,7 @@ dependencies = [
"bincode",
"bstr",
"bumpalo",
"bumparaw-collections",
"bytemuck",
"byteorder",
"candle-core",
@ -3656,7 +3671,6 @@ dependencies = [
"once_cell",
"ordered-float",
"rand",
"raw-collections",
"rayon",
"rayon-par-bridge",
"rhai",
@ -4487,19 +4501,6 @@ dependencies = [
"rand",
]
[[package]]
name = "raw-collections"
version = "0.1.0"
source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a"
dependencies = [
"allocator-api2",
"bitpacking",
"bumpalo",
"hashbrown 0.15.1",
"serde",
"serde_json",
]
[[package]]
name = "raw-cpuid"
version = "10.7.0"

View File

@ -13,6 +13,8 @@ license.workspace = true
[dependencies]
anyhow = "1.0.86"
bincode = "1.3.3"
bumpalo = "3.16.0"
bumparaw-collections = "0.1.1"
csv = "1.3.0"
derive_builder = "0.20.0"
dump = { path = "../dump" }
@ -21,8 +23,8 @@ file-store = { path = "../file-store" }
flate2 = "1.0.30"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
memmap2 = "0.9.4"
page_size = "0.6.0"
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
rayon = "1.10.0"
roaring = { version = "0.10.7", features = ["serde"] }
serde = { version = "1.0.204", features = ["derive"] }
@ -30,7 +32,6 @@ serde_json = { version = "1.0.120", features = ["preserve_order"] }
synchronoise = "1.0.1"
tempfile = "3.10.1"
thiserror = "1.0.61"
memmap2 = "0.9.4"
time = { version = "0.3.36", features = [
"serde-well-known",
"formatting",
@ -40,7 +41,6 @@ time = { version = "0.3.36", features = [
tracing = "0.1.40"
ureq = "2.10.0"
uuid = { version = "1.10.0", features = ["serde", "v4"] }
bumpalo = "3.16.0"
[dev-dependencies]
arroy = "0.5.0"

View File

@ -24,7 +24,7 @@ flate2 = "1.0.30"
fst = "0.4.7"
memmap2 = "0.9.4"
milli = { path = "../milli" }
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
bumparaw-collections = "0.1.1"
roaring = { version = "0.10.7", features = ["serde"] }
serde = { version = "1.0.204", features = ["derive"] }
serde-cs = "0.2.4"

View File

@ -4,10 +4,10 @@ use std::io::{self, BufWriter};
use std::marker::PhantomData;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use memmap2::Mmap;
use milli::documents::Error;
use milli::Object;
use raw_collections::RawMap;
use serde::de::{SeqAccess, Visitor};
use serde::{Deserialize, Deserializer};
use serde_json::error::Category;

View File

@ -91,8 +91,8 @@ ureq = { version = "2.10.0", features = ["json"] }
url = "2.5.2"
rayon-par-bridge = "0.1.0"
hashbrown = "0.15.0"
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
bumpalo = "3.16.0"
bumparaw-collections = "0.1.1"
thread_local = "1.1.8"
allocator-api2 = "0.2.18"
rustc-hash = "2.0.0"

View File

@ -3,12 +3,12 @@ use std::collections::BTreeMap;
use std::fmt::{self, Debug};
use bumpalo::Bump;
use bumparaw_collections::{RawMap, RawVec, Value};
use liquid::model::{
ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State,
Value as LiquidValue,
};
use liquid::{ObjectView, ValueView};
use raw_collections::{RawMap, RawVec};
use serde_json::value::RawValue;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
@ -245,12 +245,12 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc,
#[derive(Debug)]
struct ParseableValue<'doc> {
value: raw_collections::Value<'doc>,
value: Value<'doc>,
}
impl<'doc> ParseableValue<'doc> {
pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self {
let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap();
let value = Value::from_raw_value(value, doc_alloc).unwrap();
Self { value }
}
@ -447,8 +447,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn render(&self) -> DisplayCow<'_> {
use raw_collections::value::Number;
use raw_collections::Value;
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
match &self.value {
Value::Null => LiquidValue::Nil.render(),
Value::Bool(v) => v.render(),
@ -464,8 +465,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn source(&self) -> DisplayCow<'_> {
use raw_collections::value::Number;
use raw_collections::Value;
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
match &self.value {
Value::Null => LiquidValue::Nil.source(),
Value::Bool(v) => ValueView::source(v),
@ -481,8 +483,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn type_name(&self) -> &'static str {
use raw_collections::value::Number;
use raw_collections::Value;
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
match &self.value {
Value::Null => LiquidValue::Nil.type_name(),
Value::Bool(v) => v.type_name(),
@ -498,7 +501,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn query_state(&self, state: State) -> bool {
use raw_collections::Value;
use bumparaw_collections::Value;
match &self.value {
Value::Null => ValueView::query_state(&LiquidValue::Nil, state),
Value::Bool(v) => ValueView::query_state(v, state),
@ -515,7 +519,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn to_kstr(&self) -> KStringCow<'_> {
use raw_collections::Value;
use bumparaw_collections::Value;
match &self.value {
Value::Null => ValueView::to_kstr(&LiquidValue::Nil),
Value::Bool(v) => ValueView::to_kstr(v),
@ -527,12 +532,14 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn to_value(&self) -> LiquidValue {
use raw_collections::Value;
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
match &self.value {
Value::Null => LiquidValue::Nil,
Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)),
Value::Number(number) => match number {
raw_collections::value::Number::PosInt(number) => {
Number::PosInt(number) => {
let number: i64 = match (*number).try_into() {
Ok(number) => number,
Err(_) => {
@ -541,12 +548,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
};
LiquidValue::Scalar(ScalarCow::new(number))
}
raw_collections::value::Number::NegInt(number) => {
LiquidValue::Scalar(ScalarCow::new(*number))
}
raw_collections::value::Number::Finite(number) => {
LiquidValue::Scalar(ScalarCow::new(*number))
}
Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)),
Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)),
},
Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())),
Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(),
@ -555,8 +558,9 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn as_scalar(&self) -> Option<liquid::model::ScalarCow<'_>> {
use raw_collections::value::Number;
use raw_collections::Value;
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
match &self.value {
Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)),
Value::Number(number) => match number {
@ -576,34 +580,35 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn is_scalar(&self) -> bool {
use raw_collections::Value;
use bumparaw_collections::Value;
matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_))
}
fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> {
if let raw_collections::Value::Array(array) = &self.value {
if let Value::Array(array) = &self.value {
return Some(ParseableArray::as_parseable(array) as _);
}
None
}
fn is_array(&self) -> bool {
matches!(&self.value, raw_collections::Value::Array(_))
matches!(&self.value, bumparaw_collections::Value::Array(_))
}
fn as_object(&self) -> Option<&dyn ObjectView> {
if let raw_collections::Value::Object(object) = &self.value {
if let Value::Object(object) = &self.value {
return Some(ParseableMap::as_parseable(object) as _);
}
None
}
fn is_object(&self) -> bool {
matches!(&self.value, raw_collections::Value::Object(_))
matches!(&self.value, bumparaw_collections::Value::Object(_))
}
fn is_nil(&self) -> bool {
matches!(&self.value, raw_collections::Value::Null)
matches!(&self.value, bumparaw_collections::Value::Null)
}
}

View File

@ -1,7 +1,7 @@
use std::collections::{BTreeMap, BTreeSet};
use bumparaw_collections::RawMap;
use heed::RoTxn;
use raw_collections::RawMap;
use serde_json::value::RawValue;
use super::vector_document::VectorDocument;

View File

@ -69,12 +69,12 @@ use std::io::BufReader;
use std::{io, iter, mem};
use bumpalo::Bump;
use bumparaw_collections::bbbul::{BitPacker, BitPacker4x};
use bumparaw_collections::map::FrozenMap;
use bumparaw_collections::{Bbbul, FrozenBbbul};
use grenad::ReaderCursor;
use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use raw_collections::bbbul::{BitPacker, BitPacker4x};
use raw_collections::map::FrozenMap;
use raw_collections::{Bbbul, FrozenBbbul};
use roaring::RoaringBitmap;
use rustc_hash::FxBuildHasher;

View File

@ -176,9 +176,9 @@ pub fn tokenizer_builder<'a>(
#[cfg(test)]
mod test {
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use charabia::TokenizerBuilder;
use meili_snap::snapshot;
use raw_collections::RawMap;
use serde_json::json;
use serde_json::value::RawValue;

View File

@ -1,6 +1,7 @@
use std::ops::ControlFlow;
use bumpalo::Bump;
use bumparaw_collections::RawVec;
use serde::de::{DeserializeSeed, Deserializer as _, Visitor};
use serde_json::value::RawValue;
@ -360,7 +361,7 @@ impl<'a> DeserrRawValue<'a> {
}
pub struct DeserrRawVec<'a> {
vec: raw_collections::RawVec<'a>,
vec: RawVec<'a>,
alloc: &'a Bump,
}
@ -379,7 +380,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> {
}
pub struct DeserrRawVecIter<'a> {
it: raw_collections::vec::iter::IntoIter<'a>,
it: bumparaw_collections::vec::iter::IntoIter<'a>,
alloc: &'a Bump,
}
@ -393,7 +394,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> {
}
pub struct DeserrRawMap<'a> {
map: raw_collections::RawMap<'a>,
map: bumparaw_collections::RawMap<'a>,
alloc: &'a Bump,
}
@ -416,7 +417,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> {
}
pub struct DeserrRawMapIter<'a> {
it: raw_collections::map::iter::IntoIter<'a>,
it: bumparaw_collections::map::iter::IntoIter<'a>,
alloc: &'a Bump,
}
@ -615,7 +616,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> {
where
A: serde::de::SeqAccess<'de>,
{
let mut raw_vec = raw_collections::RawVec::new_in(self.alloc);
let mut raw_vec = RawVec::new_in(self.alloc);
while let Some(next) = seq.next_element()? {
raw_vec.push(next);
}

View File

@ -1,9 +1,9 @@
use bumpalo::collections::CollectIn;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use hashbrown::hash_map::Entry;
use heed::RoTxn;
use memmap2::Mmap;
use raw_collections::RawMap;
use rayon::slice::ParallelSlice;
use serde_json::value::RawValue;
use serde_json::Deserializer;
@ -545,8 +545,8 @@ impl MergeChanges for MergeDocumentForReplacement {
match operations.last() {
Some(InnerDocOp::Addition(DocumentOffset { content })) => {
let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
let document =
RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?;
if is_new {
Ok(Some(DocumentChange::Insertion(Insertion::create(
@ -632,8 +632,8 @@ impl MergeChanges for MergeDocumentForUpdates {
}
};
let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
let document =
RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?;
Some(Versions::single(document))
}
@ -647,7 +647,7 @@ impl MergeChanges for MergeDocumentForUpdates {
};
let document = serde_json::from_slice(content).unwrap();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
let document = RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
Ok(document)
});

View File

@ -4,6 +4,7 @@ use std::sync::{OnceLock, RwLock};
use std::thread::{self, Builder};
use big_s::S;
use bumparaw_collections::RawMap;
use document_changes::{extract, DocumentChanges, IndexingContext, Progress};
pub use document_deletion::DocumentDeletion;
pub use document_operation::{DocumentOperation, PayloadStats};
@ -13,7 +14,6 @@ use heed::{RoTxn, RwTxn};
use itertools::{merge_join_by, EitherOrBoth};
pub use partial_dump::PartialDump;
use rand::SeedableRng as _;
use raw_collections::RawMap;
use time::OffsetDateTime;
pub use update_by_function::UpdateByFunction;

View File

@ -1,5 +1,6 @@
use std::ops::DerefMut;
use bumparaw_collections::RawMap;
use rayon::iter::IndexedParallelIterator;
use serde_json::value::RawValue;
@ -75,8 +76,8 @@ where
self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?;
let external_document_id = external_document_id.to_de();
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(InternalError::SerdeJson)?;
let document =
RawMap::from_raw_value(document, doc_alloc).map_err(InternalError::SerdeJson)?;
let insertion = Insertion::create(docid, external_document_id, Versions::single(document));
Ok(Some(DocumentChange::Insertion(insertion)))

View File

@ -1,4 +1,4 @@
use raw_collections::RawMap;
use bumparaw_collections::RawMap;
use rayon::iter::IndexedParallelIterator;
use rayon::slice::ParallelSlice as _;
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};

View File

@ -1,9 +1,9 @@
use std::collections::BTreeSet;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use deserr::{Deserr, IntoValue};
use heed::RoTxn;
use raw_collections::RawMap;
use serde::Serialize;
use serde_json::value::RawValue;