diff --git a/Cargo.lock b/Cargo.lock
index 899893044..3c0f0456e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1624,7 +1624,6 @@ dependencies = [
  "sdset",
  "serde",
  "serde_json",
- "siphasher",
  "slice-group-by",
  "structopt",
  "tempfile",
diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml
index d9b88f89b..341c6336a 100644
--- a/meilisearch-core/Cargo.toml
+++ b/meilisearch-core/Cargo.toml
@@ -35,7 +35,6 @@ regex = "1.3.6"
 sdset = "0.4.0"
 serde = { version = "1.0.105", features = ["derive"] }
 serde_json = { version = "1.0.50", features = ["preserve_order"] }
-siphasher = "0.3.2"
 slice-group-by = "0.2.6"
 unicase = "2.6.0"
 zerocopy = "0.3.0"
diff --git a/meilisearch-core/src/database.rs b/meilisearch-core/src/database.rs
index d7f78dca1..5672c3c1e 100644
--- a/meilisearch-core/src/database.rs
+++ b/meilisearch-core/src/database.rs
@@ -775,12 +775,12 @@ mod tests {
         assert!(document.is_none());
 
         let document: Option<IgnoredAny> = index
-            .document(&reader, None, DocumentId(7_900_334_843_754_999_545))
+            .document(&reader, None, DocumentId(0))
             .unwrap();
         assert!(document.is_some());
 
         let document: Option<IgnoredAny> = index
-            .document(&reader, None, DocumentId(8_367_468_610_878_465_872))
+            .document(&reader, None, DocumentId(1))
             .unwrap();
         assert!(document.is_some());
     }
@@ -855,12 +855,12 @@ mod tests {
         assert!(document.is_none());
 
         let document: Option<IgnoredAny> = index
-            .document(&reader, None, DocumentId(7_900_334_843_754_999_545))
+            .document(&reader, None, DocumentId(0))
             .unwrap();
         assert!(document.is_some());
 
         let document: Option<IgnoredAny> = index
-            .document(&reader, None, DocumentId(8_367_468_610_878_465_872))
+            .document(&reader, None, DocumentId(1))
             .unwrap();
         assert!(document.is_some());
 
@@ -897,7 +897,7 @@ mod tests {
         let reader = db.main_read_txn().unwrap();
 
         let document: Option<serde_json::Value> = index
-            .document(&reader, None, DocumentId(7_900_334_843_754_999_545))
+            .document(&reader, None, DocumentId(0))
             .unwrap();
 
         let new_doc1 = serde_json::json!({
@@ -908,7 +908,7 @@ mod tests {
         assert_eq!(document, Some(new_doc1));
 
         let document: Option<serde_json::Value> = index
-            .document(&reader, None, DocumentId(8_367_468_610_878_465_872))
+            .document(&reader, None, DocumentId(1))
             .unwrap();
 
         let new_doc2 = serde_json::json!({
@@ -1080,14 +1080,14 @@ mod tests {
         assert_matches!(
             iter.next(),
             Some(Document {
-                id: DocumentId(7_900_334_843_754_999_545),
+                id: DocumentId(0),
                 ..
             })
         );
         assert_matches!(
             iter.next(),
             Some(Document {
-                id: DocumentId(8_367_468_610_878_465_872),
+                id: DocumentId(1),
                 ..
             })
         );
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 38a1a3c34..1a4b0d2e6 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -191,6 +191,6 @@ mod tests {
 
     #[test]
     fn docindex_mem_size() {
-        assert_eq!(mem::size_of::<DocIndex>(), 16);
+        assert_eq!(mem::size_of::<DocIndex>(), 12);
     }
 }
diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index 1f1ef28e3..d1b35273d 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -228,7 +228,7 @@ mod tests {
         builder.into_inner().and_then(Set::from_bytes).unwrap()
     }
 
-    const fn doc_index(document_id: u64, word_index: u16) -> DocIndex {
+    const fn doc_index(document_id: u32, word_index: u16) -> DocIndex {
         DocIndex {
             document_id: DocumentId(document_id),
             attribute: 0,
@@ -238,7 +238,7 @@ mod tests {
         }
     }
 
-    const fn doc_char_index(document_id: u64, word_index: u16, char_index: u16) -> DocIndex {
+    const fn doc_char_index(document_id: u32, word_index: u16, char_index: u16) -> DocIndex {
         DocIndex {
             document_id: DocumentId(document_id),
             attribute: 0,
diff --git a/meilisearch-core/src/store/docs_words.rs b/meilisearch-core/src/store/docs_words.rs
index 0ae153b3f..1e33a9bb0 100644
--- a/meilisearch-core/src/store/docs_words.rs
+++ b/meilisearch-core/src/store/docs_words.rs
@@ -1,4 +1,4 @@
-use super::BEU64;
+use super::BEU32;
 use crate::database::MainT;
 use crate::DocumentId;
 use heed::types::{ByteSlice, OwnedType};
@@ -7,7 +7,7 @@ use std::sync::Arc;
 
 #[derive(Copy, Clone)]
 pub struct DocsWords {
-    pub(crate) docs_words: heed::Database<OwnedType<BEU64>, ByteSlice>,
+    pub(crate) docs_words: heed::Database<OwnedType<BEU32>, ByteSlice>,
 }
 
 impl DocsWords {
@@ -17,13 +17,13 @@ impl DocsWords {
         document_id: DocumentId,
         words: &fst::Set,
     ) -> ZResult<()> {
-        let document_id = BEU64::new(document_id.0);
+        let document_id = BEU32::new(document_id.0);
         let bytes = words.as_fst().as_bytes();
         self.docs_words.put(writer, &document_id, bytes)
     }
 
     pub fn del_doc_words(self, writer: &mut heed::RwTxn<MainT>, document_id: DocumentId) -> ZResult<bool> {
-        let document_id = BEU64::new(document_id.0);
+        let document_id = BEU32::new(document_id.0);
         self.docs_words.delete(writer, &document_id)
     }
@@ -36,7 +36,7 @@ impl DocsWords {
         reader: &heed::RoTxn<MainT>,
         document_id: DocumentId,
     ) -> ZResult<Option<fst::Set>> {
-        let document_id = BEU64::new(document_id.0);
+        let document_id = BEU32::new(document_id.0);
         match self.docs_words.get(reader, &document_id)? {
             Some(bytes) => {
                 let len = bytes.len();
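Note: every docid-keyed LMDB store in this diff moves from an 8-byte BEU64 key to a 4-byte BEU32 key. The big-endian encoding is what keeps LMDB's lexicographic byte ordering aligned with numeric docid order. A minimal sketch of that property, assuming the zerocopy 0.3 / byteorder types this crate already depends on:

// Big-endian integer keys sort byte-wise in the same order as the
// integers themselves, so LMDB range scans stay correct after the
// switch from BEU64 to BEU32.
use zerocopy::{AsBytes, U32};
use byteorder::BigEndian;

type BEU32 = U32<BigEndian>;

fn main() {
    let small = BEU32::new(1);   // bytes: [0, 0, 0, 1]
    let large = BEU32::new(256); // bytes: [0, 0, 1, 0]
    assert!(small.as_bytes() < large.as_bytes());
}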
diff --git a/meilisearch-core/src/store/documents_ids.rs b/meilisearch-core/src/store/documents_ids.rs
new file mode 100644
index 000000000..a0628e9e2
--- /dev/null
+++ b/meilisearch-core/src/store/documents_ids.rs
@@ -0,0 +1,75 @@
+use std::borrow::Cow;
+
+use heed::{BytesDecode, BytesEncode};
+use sdset::Set;
+
+use crate::DocumentId;
+use super::cow_set::CowSet;
+
+pub struct DocumentsIds;
+
+impl BytesEncode<'_> for DocumentsIds {
+    type EItem = Set<DocumentId>;
+
+    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
+        CowSet::bytes_encode(item)
+    }
+}
+
+impl<'a> BytesDecode<'a> for DocumentsIds {
+    type DItem = Cow<'a, Set<DocumentId>>;
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        CowSet::bytes_decode(bytes)
+    }
+}
+
+pub struct DiscoverIds<'a> {
+    ids_iter: std::slice::Iter<'a, DocumentId>,
+    left_id: Option<u32>,
+    right_id: Option<u32>,
+    available_range: std::ops::Range<u32>,
+}
+
+impl DiscoverIds<'_> {
+    pub fn new(ids: &Set<DocumentId>) -> DiscoverIds {
+        let mut ids_iter = ids.iter();
+        let right_id = ids_iter.next().map(|id| id.0);
+        let available_range = 0..right_id.unwrap_or(u32::max_value());
+        DiscoverIds { ids_iter, left_id: None, right_id, available_range }
+    }
+}
+
+impl Iterator for DiscoverIds<'_> {
+    type Item = DocumentId;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            match self.available_range.next() {
+                // The available range gives us a new id, we return it.
+                Some(id) => return Some(DocumentId(id)),
+                // The available range is exhausted, we need to find the next one.
+                None if self.available_range.end == u32::max_value() => return None,
+                None => loop {
+                    self.left_id = self.right_id.take();
+                    self.right_id = self.ids_iter.next().map(|id| id.0);
+                    match (self.left_id, self.right_id) {
+                        // We found a gap in the used ids, we can yield all ids
+                        // until the end of the gap.
+                        (Some(l), Some(r)) => if l.saturating_add(1) != r {
+                            self.available_range = (l + 1)..r;
+                            break;
+                        },
+                        // The last used id has been reached, we can use all ids
+                        // until u32 MAX.
+                        (Some(l), None) => {
+                            self.available_range = l.saturating_add(1)..u32::max_value();
+                            break;
+                        },
+                        _ => (),
+                    }
+                },
+            }
+        }
+    }
+}
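Note: DiscoverIds is what makes 32-bit internal ids sustainable: it walks the sorted set of used ids and yields the holes first, so ids freed by deletions are recycled before fresh ones are allocated. A minimal usage sketch, assuming the re-exports introduced later in this diff (meilisearch_core::store::DiscoverIds, meilisearch_core::DocumentId):

// With internal ids {0, 1, 4} already taken, the iterator first fills
// the gap (2, 3) and then continues past the highest used id (5, 6, ...).
use meilisearch_core::store::DiscoverIds;
use meilisearch_core::DocumentId;
use sdset::Set;

fn main() {
    let used = [DocumentId(0), DocumentId(1), DocumentId(4)];
    let used = Set::new(&used).unwrap(); // slice must be sorted and deduplicated
    let free: Vec<u32> = DiscoverIds::new(used).take(4).map(|id| id.0).collect();
    assert_eq!(free, vec![2, 3, 5, 6]);
}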
"number-of-documents"; const RANKED_MAP_KEY: &str = "ranked-map"; +const RANKING_RULES_KEY: &str = "ranking-rules"; const SCHEMA_KEY: &str = "schema"; +const STOP_WORDS_KEY: &str = "stop-words"; +const SYNONYMS_KEY: &str = "synonyms"; const UPDATED_AT_KEY: &str = "updated-at"; const WORDS_KEY: &str = "words"; @@ -71,9 +74,90 @@ impl Main { self.main.get::<_, Str, SerdeDatetime>(reader, UPDATED_AT_KEY) } + pub fn put_internal_docids(self, writer: &mut heed::RwTxn, ids: &sdset::Set) -> ZResult<()> { + self.main.put::<_, Str, DocumentsIds>(writer, INTERNAL_DOCIDS_KEY, ids) + } + + pub fn internal_docids<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult>> { + match self.main.get::<_, Str, DocumentsIds>(reader, INTERNAL_DOCIDS_KEY)? { + Some(ids) => Ok(ids), + None => Ok(Cow::default()), + } + } + + pub fn merge_internal_docids(self, writer: &mut heed::RwTxn, new_ids: &sdset::Set) -> ZResult<()> { + use sdset::SetOperation; + + // We do an union of the old and new internal ids. + let internal_docids = self.internal_docids(writer)?; + let internal_docids = sdset::duo::Union::new(&internal_docids, new_ids).into_set_buf(); + self.put_internal_docids(writer, &internal_docids) + } + + pub fn remove_internal_docids(self, writer: &mut heed::RwTxn, ids: &sdset::Set) -> ZResult<()> { + use sdset::SetOperation; + + // We do a difference of the old and new internal ids. + let internal_docids = self.internal_docids(writer)?; + let internal_docids = sdset::duo::Difference::new(&internal_docids, ids).into_set_buf(); + self.put_internal_docids(writer, &internal_docids) + } + + pub fn put_external_docids(self, writer: &mut heed::RwTxn, ids: &fst::Map) -> ZResult<()> { + self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, ids.as_fst().as_bytes()) + } + + pub fn merge_external_docids(self, writer: &mut heed::RwTxn, new_docids: &fst::Map) -> ZResult<()> { + use fst::{Streamer, IntoStreamer}; + + // Do an union of the old and the new set of external docids. + let external_docids = self.external_docids(writer)?; + let mut op = external_docids.op().add(new_docids.into_stream()).r#union(); + let mut build = fst::MapBuilder::memory(); + while let Some((docid, values)) = op.next() { + build.insert(docid, values[0].value).unwrap(); + } + let external_docids = build.into_inner().unwrap(); + + // TODO prefer using self.put_user_ids + self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice()) + } + + pub fn remove_external_docids(self, writer: &mut heed::RwTxn, ids: &fst::Map) -> ZResult<()> { + use fst::{Streamer, IntoStreamer}; + + // Do an union of the old and the new set of external docids. + let external_docids = self.external_docids(writer)?; + let mut op = external_docids.op().add(ids.into_stream()).difference(); + let mut build = fst::MapBuilder::memory(); + while let Some((docid, values)) = op.next() { + build.insert(docid, values[0].value).unwrap(); + } + let external_docids = build.into_inner().unwrap(); + + // TODO prefer using self.put_external_docids + self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice()) + } + + pub fn external_docids(self, reader: &heed::RoTxn) -> ZResult { + match self.main.get::<_, Str, ByteSlice>(reader, EXTERNAL_DOCIDS_KEY)? 
{ + Some(bytes) => { + let len = bytes.len(); + let bytes = Arc::new(bytes.to_owned()); + let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); + Ok(fst::Map::from(fst)) + }, + None => Ok(fst::Map::default()), + } + } + + pub fn external_to_internal_docid(self, reader: &heed::RoTxn, external_docid: &str) -> ZResult> { + let external_ids = self.external_docids(reader)?; + Ok(external_ids.get(external_docid).map(|id| DocumentId(id as u32))) + } + pub fn put_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> { - let bytes = fst.as_fst().as_bytes(); - self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, bytes) + self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, fst.as_fst().as_bytes()) } pub unsafe fn static_words_fst(self, reader: &heed::RoTxn) -> ZResult> { @@ -82,7 +166,7 @@ impl Main { let bytes: &'static [u8] = std::mem::transmute(bytes); let set = fst::Set::from_static_slice(bytes).unwrap(); Ok(Some(set)) - } + }, None => Ok(None), } } @@ -94,7 +178,7 @@ impl Main { let bytes = Arc::new(bytes.to_owned()); let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); Ok(Some(fst::Set::from(fst))) - } + }, None => Ok(None), } } @@ -193,15 +277,15 @@ impl Main { } pub fn attributes_for_faceting<'txn>(&self, reader: &'txn heed::RoTxn) -> ZResult>>> { - self.main.get::<_, Str, CowSet>(reader, ATTRIBUTES_FOR_FACETING) + self.main.get::<_, Str, CowSet>(reader, ATTRIBUTES_FOR_FACETING_KEY) } pub fn put_attributes_for_faceting(self, writer: &mut heed::RwTxn, attributes: &Set) -> ZResult<()> { - self.main.put::<_, Str, CowSet>(writer, ATTRIBUTES_FOR_FACETING, attributes) + self.main.put::<_, Str, CowSet>(writer, ATTRIBUTES_FOR_FACETING_KEY, attributes) } pub fn delete_attributes_for_faceting(self, writer: &mut heed::RwTxn) -> ZResult { - self.main.delete::<_, Str>(writer, ATTRIBUTES_FOR_FACETING) + self.main.delete::<_, Str>(writer, ATTRIBUTES_FOR_FACETING_KEY) } pub fn ranking_rules(&self, reader: &heed::RoTxn) -> ZResult>> { diff --git a/meilisearch-core/src/store/mod.rs b/meilisearch-core/src/store/mod.rs index 6448a3441..d19d3f7d3 100644 --- a/meilisearch-core/src/store/mod.rs +++ b/meilisearch-core/src/store/mod.rs @@ -1,26 +1,27 @@ mod cow_set; mod docs_words; -mod prefix_documents_cache; -mod prefix_postings_lists_cache; +mod documents_ids; mod documents_fields; mod documents_fields_counts; +mod facets; mod main; mod postings_lists; +mod prefix_documents_cache; +mod prefix_postings_lists_cache; mod synonyms; mod updates; mod updates_results; -mod facets; +pub use self::cow_set::CowSet; pub use self::docs_words::DocsWords; -pub use self::facets::Facets; -pub use self::prefix_documents_cache::PrefixDocumentsCache; -pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache; pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields}; -pub use self::documents_fields_counts::{ - DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter, -}; +pub use self::documents_fields_counts::{DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter}; +pub use self::documents_ids::{DocumentsIds, DiscoverIds}; +pub use self::facets::Facets; pub use self::main::Main; pub use self::postings_lists::PostingsLists; +pub use self::prefix_documents_cache::PrefixDocumentsCache; +pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache; pub use self::synonyms::Synonyms; pub use self::updates::Updates; pub use self::updates_results::UpdatesResults; @@ -44,20 +45,21 @@ use crate::serde::Deserializer; use 
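Note: because fst maps are immutable, merge_external_docids and remove_external_docids both stream a set operation over the stored map and rebuild it from scratch. A standalone sketch of the same union pattern, assuming the fst 0.3-era API used by this crate (Map::from_iter, Map::from_bytes):

// Union two external-docid maps, keeping the first value on conflict,
// exactly as merge_external_docids does above.
use fst::{IntoStreamer, Streamer, Map, MapBuilder};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let old = Map::from_iter(vec![("doc-1", 0u64), ("doc-2", 1)])?;
    let new = Map::from_iter(vec![("doc-3", 2u64)])?;

    let mut op = old.op().add(new.into_stream()).union();
    let mut build = MapBuilder::memory();
    while let Some((key, values)) = op.next() {
        // On a key present in both maps, the first indexed value wins.
        build.insert(key, values[0].value)?;
    }
    let merged = Map::from_bytes(build.into_inner()?)?;
    assert_eq!(merged.get("doc-3"), Some(2));
    Ok(())
}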
diff --git a/meilisearch-core/src/store/mod.rs b/meilisearch-core/src/store/mod.rs
index 6448a3441..d19d3f7d3 100644
--- a/meilisearch-core/src/store/mod.rs
+++ b/meilisearch-core/src/store/mod.rs
@@ -1,26 +1,27 @@
 mod cow_set;
 mod docs_words;
-mod prefix_documents_cache;
-mod prefix_postings_lists_cache;
+mod documents_ids;
 mod documents_fields;
 mod documents_fields_counts;
+mod facets;
 mod main;
 mod postings_lists;
+mod prefix_documents_cache;
+mod prefix_postings_lists_cache;
 mod synonyms;
 mod updates;
 mod updates_results;
-mod facets;
 
+pub use self::cow_set::CowSet;
 pub use self::docs_words::DocsWords;
-pub use self::facets::Facets;
-pub use self::prefix_documents_cache::PrefixDocumentsCache;
-pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache;
 pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
-pub use self::documents_fields_counts::{
-    DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
-};
+pub use self::documents_fields_counts::{DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter};
+pub use self::documents_ids::{DocumentsIds, DiscoverIds};
+pub use self::facets::Facets;
 pub use self::main::Main;
 pub use self::postings_lists::PostingsLists;
+pub use self::prefix_documents_cache::PrefixDocumentsCache;
+pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache;
 pub use self::synonyms::Synonyms;
 pub use self::updates::Updates;
 pub use self::updates_results::UpdatesResults;
@@ -44,20 +45,21 @@ use crate::serde::Deserializer;
 use crate::settings::SettingsUpdate;
 use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
 
+type BEU32 = zerocopy::U32<byteorder::BigEndian>;
 type BEU64 = zerocopy::U64<byteorder::BigEndian>;
 pub type BEU16 = zerocopy::U16<byteorder::BigEndian>;
 
 #[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
 #[repr(C)]
 pub struct DocumentFieldIndexedKey {
-    docid: BEU64,
+    docid: BEU32,
     indexed_pos: BEU16,
 }
 
 impl DocumentFieldIndexedKey {
     fn new(docid: DocumentId, indexed_pos: IndexedPos) -> DocumentFieldIndexedKey {
         DocumentFieldIndexedKey {
-            docid: BEU64::new(docid.0),
+            docid: BEU32::new(docid.0),
             indexed_pos: BEU16::new(indexed_pos.0),
         }
     }
@@ -66,14 +68,14 @@ impl DocumentFieldIndexedKey {
 #[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
 #[repr(C)]
 pub struct DocumentFieldStoredKey {
-    docid: BEU64,
+    docid: BEU32,
     field_id: BEU16,
 }
 
 impl DocumentFieldStoredKey {
     fn new(docid: DocumentId, field_id: FieldId) -> DocumentFieldStoredKey {
         DocumentFieldStoredKey {
-            docid: BEU64::new(docid.0),
+            docid: BEU32::new(docid.0),
             field_id: BEU16::new(field_id.0),
         }
     }
@@ -97,7 +99,7 @@ impl<'a> BytesEncode<'a> for PostingsCodec {
 
         let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
 
-        let docids_len = item.docids.len();
+        let docids_len = item.docids.len() as u64;
         buffer.extend_from_slice(&docids_len.to_be_bytes());
         buffer.extend_from_slice(item.docids.as_bytes());
         buffer.extend_from_slice(item.matches.as_bytes());
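Note on the `as u64` cast in PostingsCodec: `len()` returns `usize`, and `usize::to_be_bytes()` yields 4 bytes on 32-bit targets but 8 on 64-bit ones, so the cast pins the on-disk length prefix to a portable 8 bytes. A self-contained check:

// The length prefix is always 8 bytes once the usize is cast to u64,
// whatever the platform's pointer width.
fn main() {
    let len: usize = 42;
    let prefix = (len as u64).to_be_bytes();
    assert_eq!(prefix.len(), 8);

    // Decoding side: read back the fixed-size prefix.
    let decoded = u64::from_be_bytes(prefix);
    assert_eq!(decoded as usize, len);
}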
diff --git a/meilisearch-core/src/store/prefix_documents_cache.rs b/meilisearch-core/src/store/prefix_documents_cache.rs
index 8cde30e95..2bb8700dc 100644
--- a/meilisearch-core/src/store/prefix_documents_cache.rs
+++ b/meilisearch-core/src/store/prefix_documents_cache.rs
@@ -4,7 +4,7 @@ use heed::types::{OwnedType, CowSlice};
 use heed::Result as ZResult;
 use zerocopy::{AsBytes, FromBytes};
 
-use super::BEU64;
+use super::{BEU64, BEU32};
 use crate::{DocumentId, Highlight};
 use crate::database::MainT;
 
@@ -13,15 +13,15 @@ use crate::database::MainT;
 pub struct PrefixKey {
     prefix: [u8; 4],
     index: BEU64,
-    docid: BEU64,
+    docid: BEU32,
 }
 
 impl PrefixKey {
-    pub fn new(prefix: [u8; 4], index: u64, docid: u64) -> PrefixKey {
+    pub fn new(prefix: [u8; 4], index: u64, docid: u32) -> PrefixKey {
         PrefixKey {
             prefix,
             index: BEU64::new(index),
-            docid: BEU64::new(docid),
+            docid: BEU32::new(docid),
         }
     }
 }
@@ -54,7 +54,7 @@ impl PrefixDocumentsCache {
         prefix: [u8; 4],
     ) -> ZResult<PrefixDocumentsIter<'txn>> {
         let start = PrefixKey::new(prefix, 0, 0);
-        let end = PrefixKey::new(prefix, u64::max_value(), u64::max_value());
+        let end = PrefixKey::new(prefix, u64::max_value(), u32::max_value());
         let iter = self.prefix_documents_cache.range(reader, &(start..=end))?;
         Ok(PrefixDocumentsIter { iter })
     }
diff --git a/meilisearch-core/src/update/clear_all.rs b/meilisearch-core/src/update/clear_all.rs
index 0c52f5190..a4c5780b9 100644
--- a/meilisearch-core/src/update/clear_all.rs
+++ b/meilisearch-core/src/update/clear_all.rs
@@ -7,6 +7,8 @@ pub fn apply_clear_all(
     index: &store::Index,
 ) -> MResult<()> {
     index.main.put_words_fst(writer, &fst::Set::default())?;
+    index.main.put_external_docids(writer, &fst::Map::default())?;
+    index.main.put_internal_docids(writer, &sdset::SetBuf::default())?;
     index.main.put_ranked_map(writer, &RankedMap::default())?;
     index.main.put_number_of_documents(writer, |_| 0)?;
     index.documents_fields.clear(writer)?;
diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs
index d9d1af328..6858f6d2c 100644
--- a/meilisearch-core/src/update/documents_addition.rs
+++ b/meilisearch-core/src/update/documents_addition.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{HashMap, BTreeMap};
 
 use fst::{set::OpBuilder, SetBuilder};
 use indexmap::IndexMap;
@@ -13,7 +13,7 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter};
 use crate::facets;
 use crate::raw_indexer::RawIndexer;
 use crate::serde::Deserializer;
-use crate::store::{self, DocumentsFields, DocumentsFieldsCounts};
+use crate::store::{self, DocumentsFields, DocumentsFieldsCounts, DiscoverIds};
 use crate::update::helpers::{index_value, value_to_number, extract_document_id};
 use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update};
 use crate::{Error, MResult, RankedMap};
@@ -148,23 +148,40 @@ pub fn apply_addition<'a, 'b>(
     index: &store::Index,
     new_documents: Vec<IndexMap<String, Value>>,
     partial: bool
-) -> MResult<()> {
-    let mut documents_additions = HashMap::new();
-
+) -> MResult<()>
+{
     let mut schema = match index.main.schema(writer)? {
         Some(schema) => schema,
         None => return Err(Error::SchemaMissing),
     };
 
+    // Retrieve the documents ids related structures
+    let external_docids = index.main.external_docids(writer)?;
+    let internal_docids = index.main.internal_docids(writer)?;
+    let mut available_ids = DiscoverIds::new(&internal_docids);
+
     let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?;
 
     // 1. store documents ids for future deletion
+    let mut documents_additions = HashMap::new();
+    let mut new_external_docids = BTreeMap::new();
+    let mut new_internal_docids = Vec::with_capacity(new_documents.len());
+
     for mut document in new_documents {
-        let document_id = extract_document_id(&primary_key, &document)?;
+        let (internal_docid, external_docid) =
+            extract_document_id(
+                &primary_key,
+                &document,
+                &external_docids,
+                &mut available_ids,
+            )?;
+
+        new_external_docids.insert(external_docid, internal_docid.0);
+        new_internal_docids.push(internal_docid);
 
         if partial {
             let mut deserializer = Deserializer {
-                document_id,
+                document_id: internal_docid,
                 reader: writer,
                 documents_fields: index.documents_fields,
                 schema: &schema,
@@ -178,12 +195,12 @@ pub fn apply_addition<'a, 'b>(
             }
         }
 
-        documents_additions.insert(document_id, document);
+        documents_additions.insert(internal_docid, document);
     }
 
-    // 2. remove the documents posting lists
+    // 2. remove the documents postings lists
     let number_of_inserted_documents = documents_additions.len();
-    let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
+    let documents_ids = new_external_docids.iter().map(|(id, _)| id.clone()).collect();
     apply_documents_deletion(writer, index, documents_ids)?;
 
     let mut ranked_map = match index.main.ranked_map(writer)? {
@@ -233,6 +250,11 @@ pub fn apply_addition<'a, 'b>(
 
     index.main.put_schema(writer, &schema)?;
 
+    let new_external_docids = fst::Map::from_iter(new_external_docids.iter().map(|(ext, id)| (ext, *id as u64)))?;
+    let new_internal_docids = sdset::SetBuf::from_dirty(new_internal_docids);
+    index.main.merge_external_docids(writer, &new_external_docids)?;
+    index.main.merge_internal_docids(writer, &new_internal_docids)?;
+
     Ok(())
 }
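Note: apply_addition buffers the new (external, internal) pairs in a BTreeMap rather than a HashMap because fst::Map::from_iter requires keys in ascending byte order, and BTreeMap iteration yields exactly that. A minimal sketch of the constraint:

// BTreeMap iteration is sorted ("apple" before "zebra"), so building the
// fst cannot fail with an out-of-order key error.
use std::collections::BTreeMap;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut new_external_docids = BTreeMap::new();
    new_external_docids.insert("zebra".to_string(), 0u32);
    new_external_docids.insert("apple".to_string(), 1u32);

    let map = fst::Map::from_iter(
        new_external_docids.iter().map(|(ext, id)| (ext, *id as u64)),
    )?;
    assert_eq!(map.get("apple"), Some(1));
    Ok(())
}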
diff --git a/meilisearch-core/src/update/documents_deletion.rs b/meilisearch-core/src/update/documents_deletion.rs
index 4526d053d..a45021cae 100644
--- a/meilisearch-core/src/update/documents_deletion.rs
+++ b/meilisearch-core/src/update/documents_deletion.rs
@@ -14,7 +14,7 @@ pub struct DocumentsDeletion {
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     updates_notifier: UpdateEventsEmitter,
-    documents: Vec<DocumentId>,
+    external_docids: Vec<String>,
 }
 
 impl DocumentsDeletion {
@@ -27,12 +27,12 @@ impl DocumentsDeletion {
             updates_store,
             updates_results_store,
             updates_notifier,
-            documents: Vec::new(),
+            external_docids: Vec::new(),
         }
     }
 
-    pub fn delete_document_by_id(&mut self, document_id: DocumentId) {
-        self.documents.push(document_id);
+    pub fn delete_document_by_external_docid(&mut self, document_id: String) {
+        self.external_docids.push(document_id);
     }
 
     pub fn finalize(self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64> {
@@ -41,15 +41,15 @@ impl DocumentsDeletion {
             writer,
             self.updates_store,
             self.updates_results_store,
-            self.documents,
+            self.external_docids,
         )?;
         Ok(update_id)
     }
 }
 
-impl Extend<DocumentId> for DocumentsDeletion {
-    fn extend<T: IntoIterator<Item = DocumentId>>(&mut self, iter: T) {
-        self.documents.extend(iter)
+impl Extend<String> for DocumentsDeletion {
+    fn extend<T: IntoIterator<Item = String>>(&mut self, iter: T) {
+        self.external_docids.extend(iter)
     }
 }
 
@@ -57,11 +57,11 @@ pub fn push_documents_deletion(
     writer: &mut heed::RwTxn<UpdateT>,
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
-    deletion: Vec<DocumentId>,
+    external_docids: Vec<String>,
 ) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
-    let update = Update::documents_deletion(deletion);
+    let update = Update::documents_deletion(external_docids);
     updates_store.put_update(writer, last_update_id, &update)?;
 
     Ok(last_update_id)
@@ -70,8 +70,24 @@ pub fn push_documents_deletion(
 pub fn apply_documents_deletion(
     writer: &mut heed::RwTxn<MainT>,
     index: &store::Index,
-    deletion: Vec<DocumentId>,
-) -> MResult<()> {
+    external_docids: Vec<String>,
+) -> MResult<()>
+{
+    let (external_docids, internal_docids) = {
+        let new_external_docids = SetBuf::from_dirty(external_docids);
+        let mut internal_docids = Vec::new();
+
+        let old_external_docids = index.main.external_docids(writer)?;
+        for external_docid in new_external_docids.as_slice() {
+            if let Some(id) = old_external_docids.get(external_docid) {
+                internal_docids.push(DocumentId(id as u32));
+            }
+        }
+
+        let new_external_docids = fst::Map::from_iter(new_external_docids.into_iter().map(|k| (k, 0))).unwrap();
+        (new_external_docids, SetBuf::from_dirty(internal_docids))
+    };
+
     let schema = match index.main.schema(writer)? {
         Some(schema) => schema,
         None => return Err(Error::SchemaMissing),
     };
@@ -84,16 +100,15 @@ pub fn apply_documents_deletion(
 
     // facet filters deletion
     if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
-        let facet_map = facets::facet_map_from_docids(writer, &index, &deletion, &attributes_for_facetting)?;
+        let facet_map = facets::facet_map_from_docids(writer, &index, &internal_docids, &attributes_for_facetting)?;
         index.facets.remove(writer, facet_map)?;
     }
 
     // collect the ranked attributes according to the schema
     let ranked_fields = schema.ranked();
 
-    let idset = SetBuf::from_dirty(deletion);
     let mut words_document_ids = HashMap::new();
-    for id in idset {
+    for id in internal_docids.iter().cloned() {
         // remove all the ranked attributes from the ranked_map
         for ranked_attr in ranked_fields {
             ranked_map.remove(id, *ranked_attr);
@@ -163,6 +178,10 @@ pub fn apply_documents_deletion(
     index.main.put_ranked_map(writer, &ranked_map)?;
     index.main.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
 
+    // We apply the changes to the external and internal ids
+    index.main.remove_external_docids(writer, &external_docids)?;
+    index.main.remove_internal_docids(writer, &internal_docids)?;
+
     compute_short_prefixes(writer, index)?;
 
     Ok(())
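Note: deletion is now expressed in external (user-facing) docids and resolved to internal ids through the stored map before any postings are touched; external ids with no mapping are silently skipped. A small sketch of that resolution step, using an in-memory fst::Map in place of the one read from the store and assuming meilisearch_types::DocumentId as declared at the end of this diff:

// Resolve requested external docids to internal ids, dropping unknown ones,
// as apply_documents_deletion does above.
use meilisearch_types::DocumentId;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let old_external_docids = fst::Map::from_iter(vec![("doc-1", 0u64), ("doc-2", 1)])?;
    let requested = vec!["doc-2".to_string(), "unknown".to_string()];

    let mut internal_docids = Vec::new();
    for external_docid in &requested {
        if let Some(id) = old_external_docids.get(external_docid) {
            internal_docids.push(DocumentId(id as u32));
        }
    }
    assert_eq!(internal_docids, vec![DocumentId(1)]); // "unknown" was skipped
    Ok(())
}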
diff --git a/meilisearch-core/src/update/helpers.rs b/meilisearch-core/src/update/helpers.rs
index d17bea3b2..c7bd05cec 100644
--- a/meilisearch-core/src/update/helpers.rs
+++ b/meilisearch-core/src/update/helpers.rs
@@ -1,16 +1,15 @@
 use std::fmt::Write as _;
-use std::hash::{Hash, Hasher};
 
 use indexmap::IndexMap;
 use meilisearch_schema::IndexedPos;
 use meilisearch_types::DocumentId;
 use ordered_float::OrderedFloat;
 use serde_json::Value;
-use siphasher::sip::SipHasher;
 
+use crate::Number;
 use crate::raw_indexer::RawIndexer;
 use crate::serde::SerializerError;
-use crate::Number;
+use crate::store::DiscoverIds;
 
 /// Returns the number of words indexed or `None` if the type is unindexable.
 pub fn index_value(
@@ -96,28 +95,43 @@ pub fn value_to_number(value: &Value) -> Option<Number> {
     }
 }
 
-/// Validates a string representation to be a correct document id and
-/// returns the hash of the given type, this is the way we produce documents ids.
-pub fn compute_document_id(string: &str) -> Result<DocumentId, SerializerError> {
-    if string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') {
-        let mut s = SipHasher::new();
-        string.hash(&mut s);
-        Ok(DocumentId(s.finish()))
+/// Validates a string representation to be a correct document id and returns
+/// the corresponding id or generates a new one; this is the way we produce document ids.
+pub fn discover_document_id(
+    docid: &str,
+    external_docids: &fst::Map,
+    available_docids: &mut DiscoverIds<'_>,
+) -> Result<DocumentId, SerializerError>
+{
+    if docid.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') {
+        match external_docids.get(docid) {
+            Some(id) => Ok(DocumentId(id as u32)),
+            None => {
+                let internal_id = available_docids.next().expect("no more ids available");
+                Ok(internal_id)
+            },
+        }
     } else {
         Err(SerializerError::InvalidDocumentIdFormat)
     }
 }
 
 /// Extracts and validates the document id of a document.
-pub fn extract_document_id(primary_key: &str, document: &IndexMap<String, Value>) -> Result<DocumentId, SerializerError> {
+pub fn extract_document_id(
+    primary_key: &str,
+    document: &IndexMap<String, Value>,
+    external_docids: &fst::Map,
+    available_docids: &mut DiscoverIds<'_>,
+) -> Result<(DocumentId, String), SerializerError>
+{
     match document.get(primary_key) {
         Some(value) => {
-            let string = match value {
+            let docid = match value {
                 Value::Number(number) => number.to_string(),
                 Value::String(string) => string.clone(),
                 _ => return Err(SerializerError::InvalidDocumentIdFormat),
             };
-            compute_document_id(&string)
+            discover_document_id(&docid, external_docids, available_docids).map(|id| (id, docid))
         }
         None => Err(SerializerError::DocumentIdNotFound),
     }
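Note: discover_document_id keeps the old docid validation rule (ASCII alphanumerics plus '-' and '_') and only changes how the id is produced: a known external docid reuses its mapped internal id, an unknown one takes the next free id from DiscoverIds. The predicate itself, checked standalone:

// The same character rule as in discover_document_id above.
fn is_valid_docid(docid: &str) -> bool {
    docid.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_')
}

fn main() {
    assert!(is_valid_docid("abc-123_DEF"));
    assert!(!is_valid_docid("no spaces"));
    assert!(!is_valid_docid("no/slash"));
}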
diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs
index 5b7c33c9d..d2d771030 100644
--- a/meilisearch-core/src/update/mod.rs
+++ b/meilisearch-core/src/update/mod.rs
@@ -9,7 +9,7 @@ pub use self::clear_all::{apply_clear_all, push_clear_all};
 pub use self::customs_update::{apply_customs_update, push_customs_update};
 pub use self::documents_addition::{apply_documents_addition, apply_documents_partial_addition, DocumentsAddition};
 pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
-pub use self::helpers::{index_value, value_to_string, value_to_number, compute_document_id, extract_document_id};
+pub use self::helpers::{index_value, value_to_string, value_to_number, discover_document_id, extract_document_id};
 pub use self::settings_update::{apply_settings_update, push_settings_update};
 
 use std::cmp;
@@ -24,7 +24,7 @@ use sdset::Set;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 
-use crate::{store, DocumentId, MResult};
+use crate::{store, MResult};
 use crate::database::{MainT, UpdateT};
 use crate::settings::SettingsUpdate;
 
@@ -63,7 +63,7 @@ impl Update {
         }
     }
 
-    fn documents_deletion(data: Vec<DocumentId>) -> Update {
+    fn documents_deletion(data: Vec<String>) -> Update {
         Update {
             data: UpdateData::DocumentsDeletion(data),
             enqueued_at: Utc::now(),
@@ -84,7 +84,7 @@ pub enum UpdateData {
     Customs(Vec<u8>),
     DocumentsAddition(Vec<IndexMap<String, Value>>),
     DocumentsPartial(Vec<IndexMap<String, Value>>),
-    DocumentsDeletion(Vec<DocumentId>),
+    DocumentsDeletion(Vec<String>),
     Settings(SettingsUpdate)
 }
diff --git a/meilisearch-http/src/error.rs b/meilisearch-http/src/error.rs
index 295931454..27626c80b 100644
--- a/meilisearch-http/src/error.rs
+++ b/meilisearch-http/src/error.rs
@@ -22,7 +22,7 @@ pub enum ResponseError {
     NotFound(String),
     OpenIndex(String),
     FilterParsing(String),
-    RetrieveDocument(u64, String),
+    RetrieveDocument(u32, String),
     SearchDocuments(String),
     PayloadTooLarge,
     UnsupportedMediaType,
@@ -116,7 +116,7 @@ impl ResponseError {
         ResponseError::Maintenance
     }
 
-    pub fn retrieve_document(doc_id: u64, err: impl fmt::Display) -> ResponseError {
+    pub fn retrieve_document(doc_id: u32, err: impl fmt::Display) -> ResponseError {
         ResponseError::RetrieveDocument(doc_id, err.to_string())
     }
diff --git a/meilisearch-http/src/routes/document.rs b/meilisearch-http/src/routes/document.rs
index 3b71e656a..eca88a590 100644
--- a/meilisearch-http/src/routes/document.rs
+++ b/meilisearch-http/src/routes/document.rs
@@ -3,7 +3,7 @@ use std::collections::{BTreeSet, HashSet};
 use actix_web::{web, HttpResponse};
 use actix_web_macros::{delete, get, post, put};
 use indexmap::IndexMap;
-use meilisearch_core::{update, Error};
+use meilisearch_core::update;
 use serde::Deserialize;
 use serde_json::Value;
 
@@ -43,11 +43,13 @@ async fn get_document(
         .open_index(&path.index_uid)
         .ok_or(ResponseError::index_not_found(&path.index_uid))?;
 
-    let document_id = update::compute_document_id(&path.document_id).map_err(Error::Serializer)?;
-
     let reader = data.db.main_read_txn()?;
 
+    let internal_id = index.main
+        .external_to_internal_docid(&reader, &path.document_id)?
+        .ok_or(ResponseError::document_not_found(&path.document_id))?;
+
     let response: Document = index
-        .document(&reader, None, document_id)?
+        .document(&reader, None, internal_id)?
         .ok_or(ResponseError::document_not_found(&path.document_id))?;
 
     Ok(HttpResponse::Ok().json(response))
@@ -66,12 +68,10 @@ async fn delete_document(
         .open_index(&path.index_uid)
         .ok_or(ResponseError::index_not_found(&path.index_uid))?;
 
-    let document_id = update::compute_document_id(&path.document_id).map_err(Error::Serializer)?;
-
     let mut update_writer = data.db.update_write_txn()?;
 
     let mut documents_deletion = index.documents_deletion();
-    documents_deletion.delete_document_by_id(document_id);
+    documents_deletion.delete_document_by_external_docid(path.document_id.clone());
 
     let update_id = documents_deletion.finalize(&mut update_writer)?;
 
@@ -239,8 +239,7 @@ async fn delete_documents(
 
     for document_id in body.into_inner() {
         let document_id = update::value_to_string(&document_id);
-        let document_id = update::compute_document_id(&document_id).map_err(Error::Serializer)?;
-        documents_deletion.delete_document_by_id(document_id);
+        documents_deletion.delete_document_by_external_docid(document_id);
     }
 
     let update_id = documents_deletion.finalize(&mut writer)?;
diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs
index fac361415..3e14521e0 100644
--- a/meilisearch-types/src/lib.rs
+++ b/meilisearch-types/src/lib.rs
@@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize};
 #[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[repr(C)]
-pub struct DocumentId(pub u64);
+pub struct DocumentId(pub u32);
 
 /// This structure represent the position of a word
 /// in a document and its attributes.
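Note: the earlier docindex_mem_size change (16 to 12 bytes) follows directly from this u32 DocumentId: in a #[repr(C)] DocIndex the u64 id forced 8-byte alignment and padding, while a u32 id packs flush against the four u16 fields. A standalone mirror of that layout (the field list is assumed from meilisearch-types, not shown in this diff):

// u32 at offset 0..4, then four u16 fields at 4..12; alignment is 4,
// 12 is a multiple of 4, so there is no padding.
use std::mem;

#[repr(C)]
struct DocumentId(u32);

#[repr(C)]
struct DocIndex {
    document_id: DocumentId,
    attribute: u16,
    word_index: u16,
    char_index: u16,
    char_length: u16,
}

fn main() {
    assert_eq!(mem::size_of::<DocIndex>(), 12);
}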