diff --git a/Cargo.lock b/Cargo.lock index 899893044..3c0f0456e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1624,7 +1624,6 @@ dependencies = [ "sdset", "serde", "serde_json", - "siphasher", "slice-group-by", "structopt", "tempfile", diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index d9b88f89b..341c6336a 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -35,7 +35,6 @@ regex = "1.3.6" sdset = "0.4.0" serde = { version = "1.0.105", features = ["derive"] } serde_json = { version = "1.0.50", features = ["preserve_order"] } -siphasher = "0.3.2" slice-group-by = "0.2.6" unicase = "2.6.0" zerocopy = "0.3.0" diff --git a/meilisearch-core/src/store/main.rs b/meilisearch-core/src/store/main.rs index 9c3f89f39..0a23b82cb 100644 --- a/meilisearch-core/src/store/main.rs +++ b/meilisearch-core/src/store/main.rs @@ -85,10 +85,36 @@ impl Main { } } + pub fn merge_internal_ids(self, writer: &mut heed::RwTxn, new_ids: &sdset::Set) -> ZResult<()> { + use sdset::SetOperation; + + // We do an union of the old and new internal ids. + let internal_ids = self.internal_ids(writer)?; + let internal_ids = sdset::duo::Union::new(&new_ids, &internal_ids).into_set_buf(); + self.put_internal_ids(writer, &internal_ids) + } + pub fn put_user_ids(self, writer: &mut heed::RwTxn, ids: &fst::Map) -> ZResult<()> { self.main.put::<_, Str, ByteSlice>(writer, USER_IDS_KEY, ids.as_fst().as_bytes()) } + pub fn merge_user_ids(self, writer: &mut heed::RwTxn, new_ids: &fst::Map) -> ZResult<()> { + use fst::{Streamer, IntoStreamer}; + + let user_ids = self.user_ids(writer)?; + + // Do an union of the old and the new set of user ids. + let mut op = user_ids.op().add(new_ids.into_stream()).r#union(); + let mut build = fst::MapBuilder::memory(); + while let Some((userid, values)) = op.next() { + build.insert(userid, values[0].value).unwrap(); + } + let user_ids = build.into_inner().unwrap(); + + // TODO prefer using self.put_user_ids + self.main.put::<_, Str, ByteSlice>(writer, USER_IDS_KEY, user_ids.as_slice()) + } + pub fn user_ids(self, reader: &heed::RoTxn) -> ZResult { match self.main.get::<_, Str, ByteSlice>(reader, USER_IDS_KEY)? { Some(bytes) => { diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index d9d1af328..efafe3e1c 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, BTreeMap}; use fst::{set::OpBuilder, SetBuilder}; use indexmap::IndexMap; @@ -13,7 +13,7 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; use crate::serde::Deserializer; -use crate::store::{self, DocumentsFields, DocumentsFieldsCounts}; +use crate::store::{self, DocumentsFields, DocumentsFieldsCounts, DiscoverIds}; use crate::update::helpers::{index_value, value_to_number, extract_document_id}; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; use crate::{Error, MResult, RankedMap}; @@ -150,17 +150,26 @@ pub fn apply_addition<'a, 'b>( partial: bool ) -> MResult<()> { let mut documents_additions = HashMap::new(); + let mut new_user_ids = BTreeMap::new(); + let mut new_internal_ids = Vec::with_capacity(new_documents.len()); let mut schema = match index.main.schema(writer)? { Some(schema) => schema, None => return Err(Error::SchemaMissing), }; + // Retrieve the documents ids related structures + let user_ids = index.main.user_ids(writer)?; + let internal_ids = index.main.internal_ids(writer)?; + let mut available_ids = DiscoverIds::new(&internal_ids); + let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?; // 1. store documents ids for future deletion for mut document in new_documents { - let document_id = extract_document_id(&primary_key, &document)?; + let (document_id, userid) = extract_document_id(&primary_key, &document, &user_ids, &mut available_ids)?; + new_user_ids.insert(userid, document_id.0); + new_internal_ids.push(document_id); if partial { let mut deserializer = Deserializer { @@ -233,6 +242,11 @@ pub fn apply_addition<'a, 'b>( index.main.put_schema(writer, &schema)?; + let new_user_ids = fst::Map::from_iter(new_user_ids)?; + let new_internal_ids = sdset::SetBuf::from_dirty(new_internal_ids); + index.main.merge_user_ids(writer, &new_user_ids)?; + index.main.merge_internal_ids(writer, &new_internal_ids)?; + Ok(()) } diff --git a/meilisearch-core/src/update/documents_deletion.rs b/meilisearch-core/src/update/documents_deletion.rs index 4526d053d..bfca8b360 100644 --- a/meilisearch-core/src/update/documents_deletion.rs +++ b/meilisearch-core/src/update/documents_deletion.rs @@ -71,7 +71,10 @@ pub fn apply_documents_deletion( writer: &mut heed::RwTxn, index: &store::Index, deletion: Vec, -) -> MResult<()> { +) -> MResult<()> +{ + unimplemented!("When we delete documents we must ask for user ids instead of internal ones"); + let schema = match index.main.schema(writer)? { Some(schema) => schema, None => return Err(Error::SchemaMissing), diff --git a/meilisearch-core/src/update/helpers.rs b/meilisearch-core/src/update/helpers.rs index d17bea3b2..7f9a7b634 100644 --- a/meilisearch-core/src/update/helpers.rs +++ b/meilisearch-core/src/update/helpers.rs @@ -1,16 +1,15 @@ use std::fmt::Write as _; -use std::hash::{Hash, Hasher}; use indexmap::IndexMap; use meilisearch_schema::IndexedPos; use meilisearch_types::DocumentId; use ordered_float::OrderedFloat; use serde_json::Value; -use siphasher::sip::SipHasher; +use crate::Number; use crate::raw_indexer::RawIndexer; use crate::serde::SerializerError; -use crate::Number; +use crate::store::DiscoverIds; /// Returns the number of words indexed or `None` if the type is unindexable. pub fn index_value( @@ -96,28 +95,43 @@ pub fn value_to_number(value: &Value) -> Option { } } -/// Validates a string representation to be a correct document id and -/// returns the hash of the given type, this is the way we produce documents ids. -pub fn compute_document_id(string: &str) -> Result { - if string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') { - let mut s = SipHasher::new(); - string.hash(&mut s); - Ok(DocumentId(s.finish())) +/// Validates a string representation to be a correct document id and returns +/// the corresponding id or generate a new one, this is the way we produce documents ids. +pub fn discover_document_id( + userid: &str, + user_ids: &fst::Map, + available_ids: &mut DiscoverIds<'_>, +) -> Result +{ + if userid.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') { + match user_ids.get(userid) { + Some(internal_id) => Ok(DocumentId(internal_id)), + None => { + let internal_id = available_ids.next().expect("no more ids available"); + Ok(internal_id) + }, + } } else { Err(SerializerError::InvalidDocumentIdFormat) } } /// Extracts and validates the document id of a document. -pub fn extract_document_id(primary_key: &str, document: &IndexMap) -> Result { +pub fn extract_document_id( + primary_key: &str, + document: &IndexMap, + user_ids: &fst::Map, + available_ids: &mut DiscoverIds<'_>, +) -> Result<(DocumentId, String), SerializerError> +{ match document.get(primary_key) { Some(value) => { - let string = match value { + let userid = match value { Value::Number(number) => number.to_string(), Value::String(string) => string.clone(), _ => return Err(SerializerError::InvalidDocumentIdFormat), }; - compute_document_id(&string) + discover_document_id(&userid, user_ids, available_ids).map(|id| (id, userid)) } None => Err(SerializerError::DocumentIdNotFound), } diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs index 5b7c33c9d..55bdc9edc 100644 --- a/meilisearch-core/src/update/mod.rs +++ b/meilisearch-core/src/update/mod.rs @@ -9,7 +9,7 @@ pub use self::clear_all::{apply_clear_all, push_clear_all}; pub use self::customs_update::{apply_customs_update, push_customs_update}; pub use self::documents_addition::{apply_documents_addition, apply_documents_partial_addition, DocumentsAddition}; pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion}; -pub use self::helpers::{index_value, value_to_string, value_to_number, compute_document_id, extract_document_id}; +pub use self::helpers::{index_value, value_to_string, value_to_number, discover_document_id, extract_document_id}; pub use self::settings_update::{apply_settings_update, push_settings_update}; use std::cmp;