From a751972c5726ff0a23dc433fd9f0702f88e153b9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 10 Dec 2024 14:25:53 +0100 Subject: [PATCH] Prefer using a stable than a random hash builder --- crates/milli/src/update/new/document.rs | 7 ++++--- .../new/extract/searchable/tokenize_document.rs | 3 ++- crates/milli/src/update/new/indexer/de.rs | 3 ++- .../update/new/indexer/document_operation.rs | 17 +++++++++++------ crates/milli/src/update/new/indexer/mod.rs | 3 ++- .../src/update/new/indexer/partial_dump.rs | 5 +++-- .../update/new/indexer/update_by_function.rs | 9 +++++++-- crates/milli/src/update/new/vector_document.rs | 16 +++++++++------- 8 files changed, 40 insertions(+), 23 deletions(-) diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index 2beefc7d5..930b0c078 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -2,6 +2,7 @@ use std::collections::{BTreeMap, BTreeSet}; use bumparaw_collections::RawMap; use heed::RoTxn; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use super::vector_document::VectorDocument; @@ -385,12 +386,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue); #[derive(Debug)] pub struct Versions<'doc> { - data: RawMap<'doc>, + data: RawMap<'doc, FxBuildHasher>, } impl<'doc> Versions<'doc> { pub fn multiple( - mut versions: impl Iterator<Item = Result<RawMap<'doc>>>, + mut versions: impl Iterator<Item = Result<RawMap<'doc, FxBuildHasher>>>, ) -> Result<Option<Self>> { let Some(data) = versions.next() else { return Ok(None) }; let mut data = data?; @@ -403,7 +404,7 @@ impl<'doc> Versions<'doc> { Ok(Some(Self::single(data))) } - pub fn single(version: RawMap<'doc>) -> Self { + pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self { Self { data: version } } diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index 3aa546272..1c1605b66 100644 ---
a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -179,6 +179,7 @@ mod test { use bumparaw_collections::RawMap; use charabia::TokenizerBuilder; use meili_snap::snapshot; + use rustc_hash::FxBuildHasher; use serde_json::json; use serde_json::value::RawValue; @@ -234,7 +235,7 @@ mod test { let bump = Bump::new(); let document: &RawValue = serde_json::from_str(&document).unwrap(); - let document = RawMap::from_raw_value(document, &bump).unwrap(); + let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap(); let document = Versions::single(document); let document = DocumentFromVersions::new(&document); diff --git a/crates/milli/src/update/new/indexer/de.rs b/crates/milli/src/update/new/indexer/de.rs index 7fd983f29..4d9fa40a1 100644 --- a/crates/milli/src/update/new/indexer/de.rs +++ b/crates/milli/src/update/new/indexer/de.rs @@ -2,6 +2,7 @@ use std::ops::ControlFlow; use bumpalo::Bump; use bumparaw_collections::RawVec; +use rustc_hash::FxBuildHasher; use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde_json::value::RawValue; @@ -394,7 +395,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> { } pub struct DeserrRawMap<'a> { - map: bumparaw_collections::RawMap<'a>, + map: bumparaw_collections::RawMap<'a, FxBuildHasher>, alloc: &'a Bump, } diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 139cef11b..0b7ec493e 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -5,6 +5,7 @@ use hashbrown::hash_map::Entry; use heed::RoTxn; use memmap2::Mmap; use rayon::slice::ParallelSlice; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use serde_json::Deserializer; @@ -166,8 +167,9 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>( // Only guess the primary 
key if it is the first document let retrieved_primary_key = if previous_offset == 0 { - let doc = - RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?; + let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer) + .map(Some) + .map_err(UserError::SerdeJson)?; let result = retrieve_or_guess_primary_key( rtxn, @@ -546,7 +548,8 @@ impl MergeChanges for MergeDocumentForReplacement { Some(InnerDocOp::Addition(DocumentOffset { content })) => { let document = serde_json::from_slice(content).unwrap(); let document = - RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; if is_new { Ok(Some(DocumentChange::Insertion(Insertion::create( @@ -633,7 +636,8 @@ impl MergeChanges for MergeDocumentForUpdates { }; let document = serde_json::from_slice(content).unwrap(); let document = - RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; Some(Versions::single(document)) } @@ -647,8 +651,9 @@ impl MergeChanges for MergeDocumentForUpdates { }; let document = serde_json::from_slice(content).unwrap(); - let document = RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; Ok(document) }); Versions::multiple(versions)? 
diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 00041ecaf..601645385 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -14,6 +14,7 @@ use heed::{RoTxn, RwTxn}; use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; use rand::SeedableRng as _; +use rustc_hash::FxBuildHasher; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; @@ -776,7 +777,7 @@ pub fn retrieve_or_guess_primary_key<'a>( index: &Index, new_fields_ids_map: &mut FieldsIdsMap, primary_key_from_op: Option<&'a str>, - first_document: Option<RawMap<'a>>, + first_document: Option<RawMap<'a, FxBuildHasher>>, ) -> Result, bool), UserError>> { // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index f687fda99..6e4abd898 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -2,6 +2,7 @@ use std::ops::DerefMut; use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use super::document_changes::{DocumentChangeContext, DocumentChanges}; @@ -76,8 +77,8 @@ where self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; let external_document_id = external_document_id.to_de(); - let document = - RawMap::from_raw_value(document, doc_alloc).map_err(InternalError::SerdeJson)?; + let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(InternalError::SerdeJson)?; let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); Ok(Some(DocumentChange::Insertion(insertion))) diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs
b/crates/milli/src/update/new/indexer/update_by_function.rs index 59d7098e5..3001648e6 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -3,6 +3,7 @@ use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; +use rustc_hash::FxBuildHasher; use super::document_changes::DocumentChangeContext; use super::DocumentChanges; @@ -160,8 +161,12 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { if document_id != new_document_id { Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey)) } else { - let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) - .map_err(InternalError::SerdeJson)?; + let raw_new_doc = RawMap::from_raw_value_and_hasher( + raw_new_doc, + FxBuildHasher, + doc_alloc, + ) + .map_err(InternalError::SerdeJson)?; Ok(Some(DocumentChange::Update(Update::create( docid, diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 419c3dc05..8d14a749d 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -4,6 +4,7 @@ use bumpalo::Bump; use bumparaw_collections::RawMap; use deserr::{Deserr, IntoValue}; use heed::RoTxn; +use rustc_hash::FxBuildHasher; use serde::Serialize; use serde_json::value::RawValue; @@ -84,7 +85,7 @@ pub struct VectorDocumentFromDb<'t> { docid: DocumentId, embedding_config: Vec, index: &'t Index, - vectors_field: Option<RawMap<'t>>, + vectors_field: Option<RawMap<'t, FxBuildHasher>>, rtxn: &'t RoTxn<'t>, doc_alloc: &'t Bump, } @@ -102,9 +103,10 @@ impl<'t> VectorDocumentFromDb<'t> { }; let vectors = document.vectors_field()?; let vectors_field = match vectors { - Some(vectors) => { - Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?)
- } + Some(vectors) => Some( + RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc) + .map_err(InternalError::SerdeJson)?, + ), None => None, }; @@ -220,7 +222,7 @@ fn entry_from_raw_value( pub struct VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, - vectors: RawMap<'doc>, + vectors: RawMap<'doc, FxBuildHasher>, embedders: &'doc EmbeddingConfigs, } @@ -233,8 +235,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> { ) -> Result<Option<Self>> { let document = DocumentFromVersions::new(versions); if let Some(vectors_field) = document.vectors_field()? { - let vectors = - RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; + let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump) + .map_err(UserError::SerdeJson)?; Ok(Some(Self { external_document_id, vectors, embedders })) } else { Ok(None)