From 9420edadf400c7bf87af981fe34e051137196548 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 8 Jan 2020 14:43:36 +0100
Subject: [PATCH] Introduce the Postings type to decorrelate the DocumentIds

---
Reviewer notes (ignored by `git am`, not part of the commit message):

This copy of the patch was rebuilt from a whitespace-collapsed extract in
which generic arguments and the author address had been stripped.
  - Generic arguments on added/removed lines are restored from the code
    around them (variable names, imports, the `Postings` field types).
  - Context-line generics (`zerocopy::U64<byteorder::BigEndian>`,
    `heed::RwTxn<MainT>`, `heed::RoTxn<MainT>`, `ZResult<bool>`) are
    presumed from the imports in scope -- confirm against the tree.
  - Leading whitespace is conventional rustfmt indentation; use
    `git apply --ignore-whitespace` if a context line does not match.
  - The author address is not recoverable from the extract.

Changes relative to the extracted patch, all in store/mod.rs:
  - PostingsCodec::bytes_encode writes the docids length as a u64, which
    is what bytes_decode reads back (usize::to_be_bytes is only 4 bytes
    on 32-bit targets).
  - PostingsCodec::bytes_decode returns None on truncated or corrupted
    input instead of panicking in split_at / slice indexing, multiplies
    with checked_mul, and drops the unused docindex_size local.
  - from_bytes_to_set states why its unsafe copy is sound.

The diffstat and the last store/mod.rs hunk header are updated for these
changes; the blob ids on the `index` lines still describe the extracted
patch.

 meilisearch-core/src/bucket_sort.rs          |  8 +-
 meilisearch-core/src/store/mod.rs            | 97 +++++++++++++++++++-
 meilisearch-core/src/store/postings_lists.rs | 35 +++++---
 3 files changed, 121 insertions(+), 19 deletions(-)

diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index 4d8dfe9c0..b9c13ed35 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -635,12 +635,12 @@ fn fetch_matches<'txn, 'tag>(
             let is_exact = *is_exact && distance == 0 && input.len() == query.len();
 
             let before_postings_lists_fetching = Instant::now();
-            if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
-                postings_lists_original_length += postings_list.len();
+            if let Some(Postings { docids, matches }) = postings_lists_store.postings_list(reader, input)? {
+                postings_lists_original_length += matches.len();
 
                 let input = Rc::from(input);
-                let postings_list = Rc::new(postings_list);
-                let postings_list_view = PostingsListView::original(input, postings_list);
+                let matches = Rc::new(matches);
+                let postings_list_view = PostingsListView::original(input, matches);
 
                 let mut offset = 0;
                 for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
diff --git a/meilisearch-core/src/store/mod.rs b/meilisearch-core/src/store/mod.rs
index 9d24afb93..8027dc220 100644
--- a/meilisearch-core/src/store/mod.rs
+++ b/meilisearch-core/src/store/mod.rs
@@ -22,10 +22,15 @@ pub use self::synonyms::Synonyms;
 pub use self::updates::Updates;
 pub use self::updates_results::UpdatesResults;
 
+use std::borrow::Cow;
 use std::collections::HashSet;
+use std::convert::TryInto;
+use std::{mem, ptr};
 
 use heed::Result as ZResult;
+use heed::{BytesEncode, BytesDecode};
 use meilisearch_schema::{Schema, SchemaAttr};
+use sdset::{Set, SetBuf};
 use serde::de::{self, Deserialize};
 use zerocopy::{AsBytes, FromBytes};
 
@@ -33,7 +38,7 @@ use crate::criterion::Criteria;
 use crate::database::{UpdateEvent, UpdateEventsEmitter};
 use crate::database::{MainT, UpdateT};
 use crate::serde::Deserializer;
-use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
+use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
 
 type BEU64 = zerocopy::U64<byteorder::BigEndian>;
 type BEU16 = zerocopy::U16<byteorder::BigEndian>;
@@ -54,6 +59,96 @@ impl DocumentAttrKey {
     }
 }
 
+#[derive(Debug)]
+pub struct Postings<'a> {
+    pub docids: Cow<'a, Set<DocumentId>>,
+    pub matches: Cow<'a, Set<DocIndex>>,
+}
+
+struct PostingsCodec;
+
+impl<'a> BytesEncode<'a> for PostingsCodec {
+    type EItem = Postings<'a>;
+
+    fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+        let u64_size = mem::size_of::<u64>();
+        let docids_size = item.docids.len() * mem::size_of::<DocumentId>();
+        let matches_size = item.matches.len() * mem::size_of::<DocIndex>();
+
+        let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
+
+        // the length prefix is always written as a u64 because this is
+        // what `bytes_decode` reads back, whatever the width of `usize`
+        let docids_len = item.docids.len() as u64;
+        buffer.extend_from_slice(&docids_len.to_be_bytes());
+        buffer.extend_from_slice(item.docids.as_bytes());
+        buffer.extend_from_slice(item.matches.as_bytes());
+
+        Some(Cow::Owned(buffer))
+    }
+}
+
+fn aligned_to(bytes: &[u8], align: usize) -> bool {
+    (bytes as *const _ as *const () as usize) % align == 0
+}
+
+fn from_bytes_to_set<'a, T: 'a>(bytes: &'a [u8]) -> Option<Cow<'a, Set<T>>>
+where T: Clone + FromBytes
+{
+    match zerocopy::LayoutVerified::<_, [T]>::new_slice(bytes) {
+        Some(layout) => Some(Cow::Borrowed(Set::new_unchecked(layout.into_slice()))),
+        None => {
+            let len = bytes.len();
+            let elem_size = mem::size_of::<T>();
+
+            // ensure that it is the alignment that is wrong
+            // and the length is valid
+            if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::<T>()) {
+                let elems = len / elem_size;
+                let mut vec = Vec::<T>::with_capacity(elems);
+
+                // SAFETY: `vec` has room for `elems` values of `T`, `len` is exactly
+                // `elems * elem_size` bytes and `T: FromBytes` accepts any bit pattern.
+                unsafe {
+                    let dst = vec.as_mut_ptr() as *mut u8;
+                    ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len);
+                    vec.set_len(elems);
+                }
+
+                return Some(Cow::Owned(SetBuf::new_unchecked(vec)));
+            }
+
+            None
+        }
+    }
+}
+
+impl<'a> BytesDecode<'a> for PostingsCodec {
+    type DItem = Postings<'a>;
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let u64_size = mem::size_of::<u64>();
+        let docid_size = mem::size_of::<DocumentId>();
+
+        // every split below is checked: a truncated or corrupted value
+        // must be reported as `None` instead of panicking
+        let len_bytes = bytes.get(..u64_size)?;
+        let bytes = bytes.get(u64_size..)?;
+
+        let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)?;
+        let docids_len: usize = docids_len.try_into().ok()?;
+        let docids_size = docids_len.checked_mul(docid_size)?;
+
+        let docids_bytes = bytes.get(..docids_size)?;
+        let matches_bytes = bytes.get(docids_size..)?;
+
+        let docids = from_bytes_to_set(docids_bytes)?;
+        let matches = from_bytes_to_set(matches_bytes)?;
+
+        Some(Postings { docids, matches })
+    }
+}
+
 fn main_name(name: &str) -> String {
     format!("store-{}", name)
 }
diff --git a/meilisearch-core/src/store/postings_lists.rs b/meilisearch-core/src/store/postings_lists.rs
index 7e6c3ed71..7d3a29438 100644
--- a/meilisearch-core/src/store/postings_lists.rs
+++ b/meilisearch-core/src/store/postings_lists.rs
@@ -1,13 +1,19 @@
-use crate::DocIndex;
-use crate::database::MainT;
-use heed::types::{ByteSlice, CowSlice};
-use heed::Result as ZResult;
-use sdset::{Set, SetBuf};
 use std::borrow::Cow;
+use std::convert::TryInto;
+use std::{mem, ptr};
+
+use heed::Result as ZResult;
+use heed::types::{ByteSlice, CowSlice};
+use sdset::{Set, SetBuf};
+use slice_group_by::GroupBy;
+
+use crate::database::MainT;
+use crate::{DocIndex, DocumentId};
+use crate::store::{Postings, PostingsCodec};
 
 #[derive(Copy, Clone)]
 pub struct PostingsLists {
-    pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
+    pub(crate) postings_lists: heed::Database<ByteSlice, PostingsCodec>,
 }
 
 impl PostingsLists {
@@ -15,9 +21,14 @@ impl PostingsLists {
         self,
         writer: &mut heed::RwTxn<MainT>,
         word: &[u8],
-        words_indexes: &Set<DocIndex>,
+        matches: &Set<DocIndex>,
     ) -> ZResult<()> {
-        self.postings_lists.put(writer, word, words_indexes)
+        let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
+        let docids = Cow::Owned(SetBuf::new_unchecked(docids));
+        let matches = Cow::Borrowed(matches);
+        let postings = Postings { docids, matches };
+
+        self.postings_lists.put(writer, word, &postings)
     }
 
     pub fn del_postings_list(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
@@ -32,11 +43,7 @@ impl PostingsLists {
         self,
         reader: &'txn heed::RoTxn<MainT>,
         word: &[u8],
-    ) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
-        match self.postings_lists.get(reader, word)? {
-            Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
-            Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
-            None => Ok(None),
-        }
+    ) -> ZResult<Option<Postings<'txn>>> {
+        self.postings_lists.get(reader, word)
     }
 }