From 81c573ec92fae7806590b7b6f051ae39733d56a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 8 Jan 2020 15:30:43 +0100 Subject: [PATCH] Add the raw document IDs to the postings lists --- meilisearch-core/src/bucket_sort.rs | 11 ++-- meilisearch-core/src/query_tree.rs | 55 +++++++++---------- meilisearch-core/src/store/mod.rs | 5 +- meilisearch-core/src/store/postings_lists.rs | 6 +- .../src/store/prefix_postings_lists_cache.rs | 25 +++++---- .../src/update/documents_addition.rs | 7 +-- .../src/update/documents_deletion.rs | 4 +- 7 files changed, 54 insertions(+), 59 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index b9c13ed35..15ab54991 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -30,6 +30,7 @@ use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; use crate::{store, Document, DocumentId, MResult}; use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult}; use crate::query_tree::Context as QTContext; +use crate::store::Postings; pub fn bucket_sort<'c, FI>( reader: &heed::RoTxn, @@ -569,12 +570,12 @@ fn fetch_matches<'txn, 'tag>( number_of_words += 1; let before_postings_lists_fetching = Instant::now(); - if let Some(postings_list) = pplc_store.prefix_postings_list(reader, prefix)? { + if let Some(postings) = pplc_store.prefix_postings_list(reader, prefix)? { debug!("Found cached postings list for {:?}", query); - postings_lists_original_length += postings_list.len(); + postings_lists_original_length += postings.matches.len(); let input = Rc::from(&prefix[..]); - let postings_list = Rc::new(postings_list); + let postings_list = Rc::new(postings.matches); let postings_list_view = PostingsListView::original(input, postings_list); let mut offset = 0; @@ -751,11 +752,11 @@ fn split_best_frequency<'a>( let left_freq = postings_lists_store .postings_list(reader, left.as_ref())? - .map_or(0, |i| i.len()); + .map_or(0, |p| p.docids.len()); let right_freq = postings_lists_store .postings_list(reader, right.as_ref())? - .map_or(0, |i| i.len()); + .map_or(0, |p| p.docids.len()); let min_freq = cmp::min(left_freq, right_freq); if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index 1e6cc1305..bef94ff4b 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -107,8 +107,14 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn, ctx: &Context, word: &' for (i, _) in chars { let (left, right) = word.split_at(i); - let left_freq = ctx.postings_lists.postings_list(reader, left.as_bytes())?.map(|pl| pl.len()).unwrap_or(0); - let right_freq = ctx.postings_lists.postings_list(reader, right.as_bytes())?.map(|pl| pl.len()).unwrap_or(0); + let left_freq = ctx.postings_lists + .postings_list(reader, left.as_bytes())? + .map(|p| p.docids.len()) + .unwrap_or(0); + let right_freq = ctx.postings_lists + .postings_list(reader, right.as_bytes())? + .map(|p| p.docids.len()) + .unwrap_or(0); let min_freq = cmp::min(left_freq, right_freq); if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { @@ -208,12 +214,12 @@ pub fn create_query_tree(reader: &heed::RoTxn, ctx: &Context, query: &str } pub struct QueryResult<'o, 'txn> { - pub docids: SetBuf, + pub docids: Cow<'txn, Set>, pub queries: HashMap<&'o Query, Cow<'txn, Set>>, } pub type Postings<'o, 'txn> = HashMap<&'o Query, Cow<'txn, Set>>; -pub type Cache<'o, 'c> = HashMap<&'o Operation, SetBuf>; +pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set>>; pub fn traverse_query_tree<'o, 'txn>( reader: &'txn heed::RoTxn, @@ -228,7 +234,7 @@ pub fn traverse_query_tree<'o, 'txn>( postings: &mut Postings<'o, 'txn>, depth: usize, operations: &'o [Operation], - ) -> MResult> + ) -> MResult>> { println!("{:1$}AND", "", depth * 2); @@ -257,7 +263,7 @@ pub fn traverse_query_tree<'o, 'txn>( println!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2); - Ok(docids) + Ok(Cow::Owned(docids)) } fn execute_or<'o, 'txn>( @@ -267,7 +273,7 @@ pub fn traverse_query_tree<'o, 'txn>( postings: &mut Postings<'o, 'txn>, depth: usize, operations: &'o [Operation], - ) -> MResult> + ) -> MResult>> { println!("{:1$}OR", "", depth * 2); @@ -294,7 +300,7 @@ pub fn traverse_query_tree<'o, 'txn>( println!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2); - Ok(docids) + Ok(Cow::Owned(docids)) } fn execute_query<'o, 'txn>( @@ -303,7 +309,7 @@ pub fn traverse_query_tree<'o, 'txn>( postings: &mut Postings<'o, 'txn>, depth: usize, query: &'o Query, - ) -> MResult> + ) -> MResult>> { let before = Instant::now(); @@ -313,14 +319,7 @@ pub fn traverse_query_tree<'o, 'txn>( if *prefix && word.len() == 1 { let prefix = [word.as_bytes()[0], 0, 0, 0]; let matches = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default(); - - let before = Instant::now(); - let mut docids: Vec<_> = matches.into_iter().map(|m| m.document_id).collect(); - docids.dedup(); - let docids = SetBuf::new(docids).unwrap(); - println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2); - - docids + matches.docids } else { let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) }; @@ -333,8 +332,8 @@ pub fn traverse_query_tree<'o, 'txn>( let mut docids = Vec::new(); while let Some(input) = stream.next() { - if let Some(matches) = ctx.postings_lists.postings_list(reader, input)? { - docids.extend(matches.iter().map(|d| d.document_id)) + if let Some(postings) = ctx.postings_lists.postings_list(reader, input)? { + docids.extend_from_slice(&postings.docids); } } @@ -342,7 +341,7 @@ pub fn traverse_query_tree<'o, 'txn>( let docids = SetBuf::from_dirty(docids); println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2); - docids + Cow::Owned(docids) } }, QueryKind::Exact(word) => { @@ -358,16 +357,12 @@ pub fn traverse_query_tree<'o, 'txn>( let mut docids = Vec::new(); while let Some(input) = stream.next() { - if let Some(matches) = ctx.postings_lists.postings_list(reader, input)? { - docids.extend(matches.iter().map(|d| d.document_id)) + if let Some(postings) = ctx.postings_lists.postings_list(reader, input)? { + docids.extend_from_slice(&postings.docids); } } - let before = Instant::now(); - let docids = SetBuf::from_dirty(docids); - println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2); - - docids + Cow::Owned(SetBuf::from_dirty(docids)) }, QueryKind::Phrase(words) => { // TODO support prefix and non-prefix exact DFA @@ -375,7 +370,7 @@ pub fn traverse_query_tree<'o, 'txn>( let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default(); let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default(); - let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| { + let iter = merge_join_by(first.matches.as_slice(), second.matches.as_slice(), |a, b| { let x = (a.document_id, a.attribute, (a.word_index as u32) + 1); let y = (b.document_id, b.attribute, b.word_index as u32); x.cmp(&y) @@ -394,10 +389,10 @@ pub fn traverse_query_tree<'o, 'txn>( println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2); println!("{:2$}matches {:?}", "", matches, depth * 2); - docids + Cow::Owned(docids) } else { println!("{:2$}{:?} skipped", "", words, depth * 2); - SetBuf::default() + Cow::default() } }, }; diff --git a/meilisearch-core/src/store/mod.rs b/meilisearch-core/src/store/mod.rs index 8027dc220..6bc12231e 100644 --- a/meilisearch-core/src/store/mod.rs +++ b/meilisearch-core/src/store/mod.rs @@ -59,13 +59,13 @@ impl DocumentAttrKey { } } -#[derive(Debug)] +#[derive(Default, Debug)] pub struct Postings<'a> { pub docids: Cow<'a, Set>, pub matches: Cow<'a, Set>, } -struct PostingsCodec; +pub struct PostingsCodec; impl<'a> BytesEncode<'a> for PostingsCodec { type EItem = Postings<'a>; @@ -125,7 +125,6 @@ impl<'a> BytesDecode<'a> for PostingsCodec { fn bytes_decode(bytes: &'a [u8]) -> Option { let u64_size = mem::size_of::(); let docid_size = mem::size_of::(); - let docindex_size = mem::size_of::(); let (len_bytes, bytes) = bytes.split_at(u64_size); let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)? as usize; diff --git a/meilisearch-core/src/store/postings_lists.rs b/meilisearch-core/src/store/postings_lists.rs index 7d3a29438..3cf1a6a1f 100644 --- a/meilisearch-core/src/store/postings_lists.rs +++ b/meilisearch-core/src/store/postings_lists.rs @@ -1,14 +1,12 @@ use std::borrow::Cow; -use std::convert::TryInto; -use std::{mem, ptr}; use heed::Result as ZResult; -use heed::types::{ByteSlice, CowSlice}; +use heed::types::ByteSlice; use sdset::{Set, SetBuf}; use slice_group_by::GroupBy; use crate::database::MainT; -use crate::{DocIndex, DocumentId}; +use crate::DocIndex; use crate::store::{Postings, PostingsCodec}; #[derive(Copy, Clone)] diff --git a/meilisearch-core/src/store/prefix_postings_lists_cache.rs b/meilisearch-core/src/store/prefix_postings_lists_cache.rs index 9c99a8f91..bc0c58f52 100644 --- a/meilisearch-core/src/store/prefix_postings_lists_cache.rs +++ b/meilisearch-core/src/store/prefix_postings_lists_cache.rs @@ -1,15 +1,17 @@ use std::borrow::Cow; use heed::Result as ZResult; -use heed::types::{OwnedType, CowSlice}; +use heed::types::OwnedType; use sdset::{Set, SetBuf}; +use slice_group_by::GroupBy; -use crate::DocIndex; use crate::database::MainT; +use crate::DocIndex; +use crate::store::{PostingsCodec, Postings}; #[derive(Copy, Clone)] pub struct PrefixPostingsListsCache { - pub(crate) prefix_postings_lists_cache: heed::Database, CowSlice>, + pub(crate) prefix_postings_lists_cache: heed::Database, PostingsCodec>, } impl PrefixPostingsListsCache { @@ -17,10 +19,15 @@ impl PrefixPostingsListsCache { self, writer: &mut heed::RwTxn, prefix: [u8; 4], - postings_list: &Set, + matches: &Set, ) -> ZResult<()> { - self.prefix_postings_lists_cache.put(writer, &prefix, postings_list) + let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect(); + let docids = Cow::Owned(SetBuf::new_unchecked(docids)); + let matches = Cow::Borrowed(matches); + let postings = Postings { docids, matches }; + + self.prefix_postings_lists_cache.put(writer, &prefix, &postings) } pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> { @@ -31,12 +38,8 @@ impl PrefixPostingsListsCache { self, reader: &'txn heed::RoTxn, prefix: [u8; 4], - ) -> ZResult>>> + ) -> ZResult>> { - match self.prefix_postings_lists_cache.get(reader, &prefix)? { - Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))), - Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))), - None => Ok(None), - } + self.prefix_postings_lists_cache.get(reader, &prefix) } } diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index c77ff012a..f7b0abe24 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -1,8 +1,7 @@ use std::collections::HashMap; -use std::borrow::Cow; use fst::{set::OpBuilder, SetBuilder, IntoStreamer, Streamer}; -use sdset::{duo::Union, SetOperation, Set, SetBuf}; +use sdset::{duo::Union, SetOperation, Set}; use serde::{Deserialize, Serialize}; use log::debug; @@ -201,7 +200,7 @@ pub fn apply_documents_addition<'a, 'b>( // compute prefixes and store those in the PrefixPostingsListsCache. let mut stream = words_fst.into_stream(); while let Some(input) = stream.next() { - if let Some(postings_list) = postings_lists_store.postings_list(writer, input)?.map(Cow::into_owned) { + if let Some(postings_list) = postings_lists_store.postings_list(writer, input)?.map(|p| p.matches.into_owned()) { let prefix = &input[..1]; let mut arr = [0; 4]; @@ -453,7 +452,7 @@ pub fn write_documents_addition_index( delta_words_builder.insert(&word).unwrap(); let set = match postings_lists_store.postings_list(writer, &word)? { - Some(set) => Union::new(&set, &delta_set).into_set_buf(), + Some(postings) => Union::new(&postings.matches, &delta_set).into_set_buf(), None => delta_set, }; diff --git a/meilisearch-core/src/update/documents_deletion.rs b/meilisearch-core/src/update/documents_deletion.rs index fec6d3ae7..ba3e3f062 100644 --- a/meilisearch-core/src/update/documents_deletion.rs +++ b/meilisearch-core/src/update/documents_deletion.rs @@ -142,8 +142,8 @@ pub fn apply_documents_deletion( for (word, document_ids) in words_document_ids { let document_ids = SetBuf::from_dirty(document_ids); - if let Some(doc_indexes) = postings_lists_store.postings_list(writer, &word)? { - let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id); + if let Some(postings) = postings_lists_store.postings_list(writer, &word)? { + let op = DifferenceByKey::new(&postings.matches, &document_ids, |d| d.document_id, |id| *id); let doc_indexes = op.into_set_buf(); if !doc_indexes.is_empty() {