mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
Introduce the Postings type to decorrelate the DocumentIds
This commit is contained in:
parent
d724a7659e
commit
9420edadf4
@ -635,12 +635,12 @@ fn fetch_matches<'txn, 'tag>(
|
|||||||
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
|
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
|
||||||
|
|
||||||
let before_postings_lists_fetching = Instant::now();
|
let before_postings_lists_fetching = Instant::now();
|
||||||
if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
|
if let Some(Postings { docids, matches }) = postings_lists_store.postings_list(reader, input)? {
|
||||||
postings_lists_original_length += postings_list.len();
|
postings_lists_original_length += matches.len();
|
||||||
|
|
||||||
let input = Rc::from(input);
|
let input = Rc::from(input);
|
||||||
let postings_list = Rc::new(postings_list);
|
let matches = Rc::new(matches);
|
||||||
let postings_list_view = PostingsListView::original(input, postings_list);
|
let postings_list_view = PostingsListView::original(input, matches);
|
||||||
|
|
||||||
let mut offset = 0;
|
let mut offset = 0;
|
||||||
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
|
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
|
||||||
|
@ -22,10 +22,15 @@ pub use self::synonyms::Synonyms;
|
|||||||
pub use self::updates::Updates;
|
pub use self::updates::Updates;
|
||||||
pub use self::updates_results::UpdatesResults;
|
pub use self::updates_results::UpdatesResults;
|
||||||
|
|
||||||
|
use std::borrow::Cow;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
use std::{mem, ptr};
|
||||||
|
|
||||||
use heed::Result as ZResult;
|
use heed::Result as ZResult;
|
||||||
|
use heed::{BytesEncode, BytesDecode};
|
||||||
use meilisearch_schema::{Schema, SchemaAttr};
|
use meilisearch_schema::{Schema, SchemaAttr};
|
||||||
|
use sdset::{Set, SetBuf};
|
||||||
use serde::de::{self, Deserialize};
|
use serde::de::{self, Deserialize};
|
||||||
use zerocopy::{AsBytes, FromBytes};
|
use zerocopy::{AsBytes, FromBytes};
|
||||||
|
|
||||||
@ -33,7 +38,7 @@ use crate::criterion::Criteria;
|
|||||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||||
use crate::database::{MainT, UpdateT};
|
use crate::database::{MainT, UpdateT};
|
||||||
use crate::serde::Deserializer;
|
use crate::serde::Deserializer;
|
||||||
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
|
use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
|
||||||
|
|
||||||
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
||||||
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
||||||
@ -54,6 +59,88 @@ impl DocumentAttrKey {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Postings<'a> {
|
||||||
|
pub docids: Cow<'a, Set<DocumentId>>,
|
||||||
|
pub matches: Cow<'a, Set<DocIndex>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PostingsCodec;
|
||||||
|
|
||||||
|
impl<'a> BytesEncode<'a> for PostingsCodec {
|
||||||
|
type EItem = Postings<'a>;
|
||||||
|
|
||||||
|
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
|
||||||
|
let u64_size = mem::size_of::<u64>();
|
||||||
|
let docids_size = item.docids.len() * mem::size_of::<DocumentId>();
|
||||||
|
let matches_size = item.matches.len() * mem::size_of::<DocIndex>();
|
||||||
|
|
||||||
|
let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
|
||||||
|
|
||||||
|
let docids_len = item.docids.len();
|
||||||
|
buffer.extend_from_slice(&docids_len.to_be_bytes());
|
||||||
|
buffer.extend_from_slice(item.docids.as_bytes());
|
||||||
|
buffer.extend_from_slice(item.matches.as_bytes());
|
||||||
|
|
||||||
|
Some(Cow::Owned(buffer))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when the address of the first byte of `bytes` is a
/// multiple of `align`.
fn aligned_to(bytes: &[u8], align: usize) -> bool {
    let address = bytes.as_ptr() as usize;
    address % align == 0
}
|
||||||
|
|
||||||
|
fn from_bytes_to_set<'a, T: 'a>(bytes: &'a [u8]) -> Option<Cow<'a, Set<T>>>
|
||||||
|
where T: Clone + FromBytes
|
||||||
|
{
|
||||||
|
match zerocopy::LayoutVerified::<_, [T]>::new_slice(bytes) {
|
||||||
|
Some(layout) => Some(Cow::Borrowed(Set::new_unchecked(layout.into_slice()))),
|
||||||
|
None => {
|
||||||
|
let len = bytes.len();
|
||||||
|
let elem_size = mem::size_of::<T>();
|
||||||
|
|
||||||
|
// ensure that it is the alignment that is wrong
|
||||||
|
// and the length is valid
|
||||||
|
if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::<T>()) {
|
||||||
|
let elems = len / elem_size;
|
||||||
|
let mut vec = Vec::<T>::with_capacity(elems);
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let dst = vec.as_mut_ptr() as *mut u8;
|
||||||
|
ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len);
|
||||||
|
vec.set_len(elems);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Some(Cow::Owned(SetBuf::new_unchecked(vec)));
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> BytesDecode<'a> for PostingsCodec {
|
||||||
|
type DItem = Postings<'a>;
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let u64_size = mem::size_of::<u64>();
|
||||||
|
let docid_size = mem::size_of::<DocumentId>();
|
||||||
|
let docindex_size = mem::size_of::<DocIndex>();
|
||||||
|
|
||||||
|
let (len_bytes, bytes) = bytes.split_at(u64_size);
|
||||||
|
let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)? as usize;
|
||||||
|
let docids_size = docids_len * docid_size;
|
||||||
|
|
||||||
|
let docids_bytes = &bytes[..docids_size];
|
||||||
|
let matches_bytes = &bytes[docids_size..];
|
||||||
|
|
||||||
|
let docids = from_bytes_to_set(docids_bytes)?;
|
||||||
|
let matches = from_bytes_to_set(matches_bytes)?;
|
||||||
|
|
||||||
|
Some(Postings { docids, matches })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Builds a store name by prefixing `name` with `store-`.
fn main_name(name: &str) -> String {
    let mut store_name = String::from("store-");
    store_name.push_str(name);
    store_name
}
|
||||||
|
@ -1,13 +1,19 @@
|
|||||||
use crate::DocIndex;
|
|
||||||
use crate::database::MainT;
|
|
||||||
use heed::types::{ByteSlice, CowSlice};
|
|
||||||
use heed::Result as ZResult;
|
|
||||||
use sdset::{Set, SetBuf};
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
use std::{mem, ptr};
|
||||||
|
|
||||||
|
use heed::Result as ZResult;
|
||||||
|
use heed::types::{ByteSlice, CowSlice};
|
||||||
|
use sdset::{Set, SetBuf};
|
||||||
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
|
use crate::database::MainT;
|
||||||
|
use crate::{DocIndex, DocumentId};
|
||||||
|
use crate::store::{Postings, PostingsCodec};
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
pub struct PostingsLists {
|
pub struct PostingsLists {
|
||||||
pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
|
pub(crate) postings_lists: heed::Database<ByteSlice, PostingsCodec>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PostingsLists {
|
impl PostingsLists {
|
||||||
@ -15,9 +21,14 @@ impl PostingsLists {
|
|||||||
self,
|
self,
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
word: &[u8],
|
word: &[u8],
|
||||||
words_indexes: &Set<DocIndex>,
|
matches: &Set<DocIndex>,
|
||||||
) -> ZResult<()> {
|
) -> ZResult<()> {
|
||||||
self.postings_lists.put(writer, word, words_indexes)
|
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
|
||||||
|
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
|
||||||
|
let matches = Cow::Borrowed(matches);
|
||||||
|
let postings = Postings { docids, matches };
|
||||||
|
|
||||||
|
self.postings_lists.put(writer, word, &postings)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn del_postings_list(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
|
pub fn del_postings_list(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
|
||||||
@ -32,11 +43,7 @@ impl PostingsLists {
|
|||||||
self,
|
self,
|
||||||
reader: &'txn heed::RoTxn<MainT>,
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
word: &[u8],
|
word: &[u8],
|
||||||
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
|
) -> ZResult<Option<Postings<'txn>>> {
|
||||||
match self.postings_lists.get(reader, word)? {
|
self.postings_lists.get(reader, word)
|
||||||
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
|
|
||||||
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
|
|
||||||
None => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user