From 89637bcaafc43a353d825a7478b3c3b58111e5d8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 10 Dec 2024 11:12:27 +0100 Subject: [PATCH] Use bumparaw-collections in Meilisearch/milli --- Cargo.lock | 33 ++++++----- crates/index-scheduler/Cargo.toml | 6 +- crates/meilisearch-types/Cargo.toml | 2 +- .../meilisearch-types/src/document_formats.rs | 2 +- crates/milli/Cargo.toml | 2 +- crates/milli/src/prompt/document.rs | 59 ++++++++++--------- crates/milli/src/update/new/document.rs | 2 +- crates/milli/src/update/new/extract/cache.rs | 6 +- .../extract/searchable/tokenize_document.rs | 2 +- crates/milli/src/update/new/indexer/de.rs | 11 ++-- .../update/new/indexer/document_operation.rs | 12 ++-- crates/milli/src/update/new/indexer/mod.rs | 2 +- .../src/update/new/indexer/partial_dump.rs | 5 +- .../update/new/indexer/update_by_function.rs | 2 +- .../milli/src/update/new/vector_document.rs | 2 +- 15 files changed, 78 insertions(+), 70 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3c2fb711e..a57391bfc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -706,6 +706,20 @@ dependencies = [ "serde", ] +[[package]] +name = "bumparaw-collections" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7495aa71334069997d1b4ff536a4a01542981774a1654d4dfb00f29db3aedcef" +dependencies = [ + "allocator-api2", + "bitpacking", + "bumpalo", + "hashbrown 0.15.1", + "serde", + "serde_json", +] + [[package]] name = "byte-unit" version = "5.1.4" @@ -2617,6 +2631,7 @@ dependencies = [ "big_s", "bincode", "bumpalo", + "bumparaw-collections", "crossbeam-channel", "csv", "derive_builder 0.20.0", @@ -2631,7 +2646,6 @@ dependencies = [ "meilisearch-types", "memmap2", "page_size", - "raw-collections", "rayon", "roaring", "serde", @@ -3549,6 +3563,7 @@ dependencies = [ "actix-web", "anyhow", "bumpalo", + "bumparaw-collections", "convert_case 0.6.0", "csv", "deserr", @@ -3561,7 +3576,6 @@ dependencies = [ "meili-snap", "memmap2", "milli", - "raw-collections", "roaring", "serde", "serde-cs", @@ -3618,6 +3632,7 @@ dependencies = [ "bincode", "bstr", "bumpalo", + "bumparaw-collections", "bytemuck", "byteorder", "candle-core", @@ -3656,7 +3671,6 @@ dependencies = [ "once_cell", "ordered-float", "rand", - "raw-collections", "rayon", "rayon-par-bridge", "rhai", @@ -4487,19 +4501,6 @@ dependencies = [ "rand", ] -[[package]] -name = "raw-collections" -version = "0.1.0" -source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a" -dependencies = [ - "allocator-api2", - "bitpacking", - "bumpalo", - "hashbrown 0.15.1", - "serde", - "serde_json", -] - [[package]] name = "raw-cpuid" version = "10.7.0" diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index ad4c1b4b9..a2b9debec 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -13,6 +13,8 @@ license.workspace = true [dependencies] anyhow = "1.0.86" bincode = "1.3.3" +bumpalo = "3.16.0" +bumparaw-collections = "0.1.1" csv = "1.3.0" derive_builder = "0.20.0" dump = { path = "../dump" } @@ -21,8 +23,8 @@ file-store = { path = "../file-store" } flate2 = "1.0.30" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } +memmap2 = "0.9.4" page_size = "0.6.0" -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } rayon = "1.10.0" roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } @@ -30,7 +32,6 @@ serde_json = { version = "1.0.120", features = ["preserve_order"] } synchronoise = "1.0.1" tempfile = "3.10.1" thiserror = "1.0.61" -memmap2 = "0.9.4" time = { version = "0.3.36", features = [ "serde-well-known", "formatting", @@ -40,7 +41,6 @@ time = { version = "0.3.36", features = [ tracing = "0.1.40" ureq = "2.10.0" uuid = { version = "1.10.0", features = ["serde", "v4"] } -bumpalo = "3.16.0" [dev-dependencies] arroy = "0.5.0" diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index aca06a018..b91689ed7 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -24,7 +24,7 @@ flate2 = "1.0.30" fst = "0.4.7" memmap2 = "0.9.4" milli = { path = "../milli" } -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } +bumparaw-collections = "0.1.1" roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index 008be4022..c6e8ad907 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -4,10 +4,10 @@ use std::io::{self, BufWriter}; use std::marker::PhantomData; use bumpalo::Bump; +use bumparaw_collections::RawMap; use memmap2::Mmap; use milli::documents::Error; use milli::Object; -use raw_collections::RawMap; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 2a959b654..ae1edd168 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -91,8 +91,8 @@ ureq = { version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" hashbrown = "0.15.0" -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } bumpalo = "3.16.0" +bumparaw-collections = "0.1.1" thread_local = "1.1.8" allocator-api2 = "0.2.18" rustc-hash = "2.0.0" diff --git a/crates/milli/src/prompt/document.rs b/crates/milli/src/prompt/document.rs index dea7946da..5232b6788 100644 --- a/crates/milli/src/prompt/document.rs +++ b/crates/milli/src/prompt/document.rs @@ -3,12 +3,12 @@ use std::collections::BTreeMap; use std::fmt::{self, Debug}; use bumpalo::Bump; +use bumparaw_collections::{RawMap, RawVec, Value}; use liquid::model::{ ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State, Value as LiquidValue, }; use liquid::{ObjectView, ValueView}; -use raw_collections::{RawMap, RawVec}; use serde_json::value::RawValue; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; @@ -245,12 +245,12 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, #[derive(Debug)] struct ParseableValue<'doc> { - value: raw_collections::Value<'doc>, + value: Value<'doc>, } impl<'doc> ParseableValue<'doc> { pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self { - let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap(); + let value = Value::from_raw_value(value, doc_alloc).unwrap(); Self { value } } @@ -447,8 +447,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn render(&self) -> DisplayCow<'_> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.render(), Value::Bool(v) => v.render(), @@ -464,8 +465,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn source(&self) -> DisplayCow<'_> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.source(), Value::Bool(v) => ValueView::source(v), @@ -481,8 +483,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn type_name(&self) -> &'static str { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.type_name(), Value::Bool(v) => v.type_name(), @@ -498,7 +501,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn query_state(&self, state: State) -> bool { - use raw_collections::Value; + use bumparaw_collections::Value; + match &self.value { Value::Null => ValueView::query_state(&LiquidValue::Nil, state), Value::Bool(v) => ValueView::query_state(v, state), @@ -515,7 +519,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn to_kstr(&self) -> KStringCow<'_> { - use raw_collections::Value; + use bumparaw_collections::Value; + match &self.value { Value::Null => ValueView::to_kstr(&LiquidValue::Nil), Value::Bool(v) => ValueView::to_kstr(v), @@ -527,12 +532,14 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn to_value(&self) -> LiquidValue { - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil, Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)), Value::Number(number) => match number { - raw_collections::value::Number::PosInt(number) => { + Number::PosInt(number) => { let number: i64 = match (*number).try_into() { Ok(number) => number, Err(_) => { @@ -541,12 +548,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { }; LiquidValue::Scalar(ScalarCow::new(number)) } - raw_collections::value::Number::NegInt(number) => { - LiquidValue::Scalar(ScalarCow::new(*number)) - } - raw_collections::value::Number::Finite(number) => { - LiquidValue::Scalar(ScalarCow::new(*number)) - } + Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)), + Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)), }, Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())), Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(), @@ -555,8 +558,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn as_scalar(&self) -> Option> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)), Value::Number(number) => match number { @@ -576,34 +580,35 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn is_scalar(&self) -> bool { - use raw_collections::Value; + use bumparaw_collections::Value; + matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_)) } fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> { - if let raw_collections::Value::Array(array) = &self.value { + if let Value::Array(array) = &self.value { return Some(ParseableArray::as_parseable(array) as _); } None } fn is_array(&self) -> bool { - matches!(&self.value, raw_collections::Value::Array(_)) + matches!(&self.value, bumparaw_collections::Value::Array(_)) } fn as_object(&self) -> Option<&dyn ObjectView> { - if let raw_collections::Value::Object(object) = &self.value { + if let Value::Object(object) = &self.value { return Some(ParseableMap::as_parseable(object) as _); } None } fn is_object(&self) -> bool { - matches!(&self.value, raw_collections::Value::Object(_)) + matches!(&self.value, bumparaw_collections::Value::Object(_)) } fn is_nil(&self) -> bool { - matches!(&self.value, raw_collections::Value::Null) + matches!(&self.value, bumparaw_collections::Value::Null) } } diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index b1a2218f2..2beefc7d5 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -1,7 +1,7 @@ use std::collections::{BTreeMap, BTreeSet}; +use bumparaw_collections::RawMap; use heed::RoTxn; -use raw_collections::RawMap; use serde_json::value::RawValue; use super::vector_document::VectorDocument; diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 658a3127c..09ca60211 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -69,12 +69,12 @@ use std::io::BufReader; use std::{io, iter, mem}; use bumpalo::Bump; +use bumparaw_collections::bbbul::{BitPacker, BitPacker4x}; +use bumparaw_collections::map::FrozenMap; +use bumparaw_collections::{Bbbul, FrozenBbbul}; use grenad::ReaderCursor; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; -use raw_collections::bbbul::{BitPacker, BitPacker4x}; -use raw_collections::map::FrozenMap; -use raw_collections::{Bbbul, FrozenBbbul}; use roaring::RoaringBitmap; use rustc_hash::FxBuildHasher; diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index ffdce5b7e..3aa546272 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -176,9 +176,9 @@ pub fn tokenizer_builder<'a>( #[cfg(test)] mod test { use bumpalo::Bump; + use bumparaw_collections::RawMap; use charabia::TokenizerBuilder; use meili_snap::snapshot; - use raw_collections::RawMap; use serde_json::json; use serde_json::value::RawValue; diff --git a/crates/milli/src/update/new/indexer/de.rs b/crates/milli/src/update/new/indexer/de.rs index c9808360e..7fd983f29 100644 --- a/crates/milli/src/update/new/indexer/de.rs +++ b/crates/milli/src/update/new/indexer/de.rs @@ -1,6 +1,7 @@ use std::ops::ControlFlow; use bumpalo::Bump; +use bumparaw_collections::RawVec; use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde_json::value::RawValue; @@ -360,7 +361,7 @@ impl<'a> DeserrRawValue<'a> { } pub struct DeserrRawVec<'a> { - vec: raw_collections::RawVec<'a>, + vec: RawVec<'a>, alloc: &'a Bump, } @@ -379,7 +380,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> { } pub struct DeserrRawVecIter<'a> { - it: raw_collections::vec::iter::IntoIter<'a>, + it: bumparaw_collections::vec::iter::IntoIter<'a>, alloc: &'a Bump, } @@ -393,7 +394,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> { } pub struct DeserrRawMap<'a> { - map: raw_collections::RawMap<'a>, + map: bumparaw_collections::RawMap<'a>, alloc: &'a Bump, } @@ -416,7 +417,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> { } pub struct DeserrRawMapIter<'a> { - it: raw_collections::map::iter::IntoIter<'a>, + it: bumparaw_collections::map::iter::IntoIter<'a>, alloc: &'a Bump, } @@ -615,7 +616,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> { where A: serde::de::SeqAccess<'de>, { - let mut raw_vec = raw_collections::RawVec::new_in(self.alloc); + let mut raw_vec = RawVec::new_in(self.alloc); while let Some(next) = seq.next_element()? { raw_vec.push(next); } diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 2a381d5d1..139cef11b 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -1,9 +1,9 @@ use bumpalo::collections::CollectIn; use bumpalo::Bump; +use bumparaw_collections::RawMap; use hashbrown::hash_map::Entry; use heed::RoTxn; use memmap2::Mmap; -use raw_collections::RawMap; use rayon::slice::ParallelSlice; use serde_json::value::RawValue; use serde_json::Deserializer; @@ -545,8 +545,8 @@ impl MergeChanges for MergeDocumentForReplacement { match operations.last() { Some(InnerDocOp::Addition(DocumentOffset { content })) => { let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; if is_new { Ok(Some(DocumentChange::Insertion(Insertion::create( @@ -632,8 +632,8 @@ impl MergeChanges for MergeDocumentForUpdates { } }; let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; Some(Versions::single(document)) } @@ -647,7 +647,7 @@ impl MergeChanges for MergeDocumentForUpdates { }; let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + let document = RawMap::from_raw_value(document, doc_alloc) .map_err(UserError::SerdeJson)?; Ok(document) }); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 59088bd47..00041ecaf 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -4,6 +4,7 @@ use std::sync::{OnceLock, RwLock}; use std::thread::{self, Builder}; use big_s::S; +use bumparaw_collections::RawMap; use document_changes::{extract, DocumentChanges, IndexingContext, Progress}; pub use document_deletion::DocumentDeletion; pub use document_operation::{DocumentOperation, PayloadStats}; @@ -13,7 +14,6 @@ use heed::{RoTxn, RwTxn}; use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; use rand::SeedableRng as _; -use raw_collections::RawMap; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index 2cc653813..f687fda99 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -1,5 +1,6 @@ use std::ops::DerefMut; +use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; use serde_json::value::RawValue; @@ -75,8 +76,8 @@ where self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; let external_document_id = external_document_id.to_de(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(InternalError::SerdeJson)?; + let document = + RawMap::from_raw_value(document, doc_alloc).map_err(InternalError::SerdeJson)?; let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); Ok(Some(DocumentChange::Insertion(insertion))) diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index a8e3e38a8..59d7098e5 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -1,4 +1,4 @@ -use raw_collections::RawMap; +use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 319730db0..419c3dc05 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -1,9 +1,9 @@ use std::collections::BTreeSet; use bumpalo::Bump; +use bumparaw_collections::RawMap; use deserr::{Deserr, IntoValue}; use heed::RoTxn; -use raw_collections::RawMap; use serde::Serialize; use serde_json::value::RawValue;