From 8d71112dcb7924edd8f3a064af3fd213a6c86185 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 10 Dec 2019 12:19:38 +0100
Subject: [PATCH] Rewrite the phrase query postings lists

This simplified the multiword_rewrite_matches function a little bit.
---
 meilisearch-core/src/bucket_sort.rs | 131 +++++++++++++++++++++-------
 meilisearch-core/src/criterion2.rs  |  80 +----------------
 2 files changed, 103 insertions(+), 108 deletions(-)

diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index fcbe5a262..ccbf1e756 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -15,8 +15,9 @@ use levenshtein_automata::DFA;
 use log::debug;
 use meilisearch_tokenizer::{is_cjk, split_query_string};
 use meilisearch_types::{DocIndex, Highlight};
-use sdset::Set;
+use sdset::{Set, SetBuf};
 use slice_group_by::{GroupBy, GroupByMut};
+use itertools::EitherOrBoth;
 
 use crate::automaton::NGRAMS;
 use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
@@ -61,7 +62,7 @@ pub fn bucket_sort<'c>(
     let mut raw_documents = Vec::new();
     for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
         prefiltered_documents += 1;
-        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) {
+        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
             raw_documents.push(raw_document);
         }
     }
@@ -78,7 +79,7 @@ pub fn bucket_sort<'c>(
 
     let criteria = [
         Box::new(Typo) as Box<dyn Criterion>,
-        Box::new(Words) as Box<dyn Criterion>,
+        Box::new(Words),
         Box::new(Proximity),
         Box::new(Attribute),
         Box::new(WordsPosition),
@@ -154,13 +155,11 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
     fn new<'txn>(
         raw_matches: &'a mut [BareMatch<'tag>],
         automatons: &[QueryWordAutomaton],
-        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
+        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
     ) -> Option<RawDocument<'a, 'tag>>
     {
         raw_matches.sort_unstable_by_key(|m| m.query_index);
 
-        // debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches);
-
         let mut previous_word = None;
         for i in 0..raw_matches.len() {
             let a = &raw_matches[i];
@@ -168,10 +167,17 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
 
             match auta.phrase_query {
                 Some((0, _)) => {
-                    previous_word = Some(a.query_index);
-                    let b = raw_matches.get(i + 1)?;
+                    let b = match raw_matches.get(i + 1) {
+                        Some(b) => b,
+                        None => {
+                            postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
+                            continue;
+                        }
+                    };
+
                     if a.query_index + 1 != b.query_index {
-                        return None;
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
+                        continue
                     }
 
                     let pla = &postings_lists[a.postings_list];
@@ -181,11 +187,31 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
                         a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
                     });
 
-                    if !iter.any(|eb| eb.is_both()) { return None }
+                    let mut newa = Vec::new();
+                    let mut newb = Vec::new();
+
+                    for eb in iter {
+                        if let EitherOrBoth::Both(a, b) = eb {
+                            newa.push(*a);
+                            newb.push(*b);
+                        }
+                    }
+
+
+                    if !newa.is_empty() {
+                        previous_word = Some(a.query_index);
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
+                        postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
+
+                    } else {
+                        // TODO use SetBuf::default when merged
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
+                        postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
+                    }
                 },
                 Some((1, _)) => {
                     if previous_word.take() != Some(a.query_index - 1) {
-                        return None;
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
                     }
                 },
                 Some((_, _)) => unreachable!(),
@@ -193,6 +219,10 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
             }
         }
 
+        if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
+            return None
+        }
+
         Some(RawDocument {
             raw_matches,
             processed_matches: Vec::new(),
@@ -231,50 +261,84 @@ pub struct SimpleMatch {
 }
 
 #[derive(Clone)]
-pub struct PostingsListView<'txn> {
-    input: Rc<[u8]>,
-    postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
-    offset: usize,
-    len: usize,
+pub enum PostingsListView<'txn> {
+    Original {
+        input: Rc<[u8]>,
+        postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
+        offset: usize,
+        len: usize,
+    },
+    Rewritten {
+        input: Rc<[u8]>,
+        postings_list: SetBuf<DocIndex>,
+    },
 }
 
 impl fmt::Debug for PostingsListView<'_> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("PostingsListView")
-            .field("input", &std::str::from_utf8(&self.input).unwrap())
+            .field("input", &std::str::from_utf8(&self.input()).unwrap())
             .field("postings_list", &self.as_ref())
             .finish()
     }
 }
 
 impl<'txn> PostingsListView<'txn> {
-    pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
+    pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
         let len = postings_list.len();
-        PostingsListView { input, postings_list, offset: 0, len }
+        PostingsListView::Original { input, postings_list, offset: 0, len }
+    }
+
+    pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
+        PostingsListView::Rewritten { input, postings_list }
+    }
+
+    pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
+        *self = match self {
+            PostingsListView::Original { input, .. } => {
+                PostingsListView::Rewritten { input: input.clone(), postings_list }
+            },
+            PostingsListView::Rewritten { input, .. } => {
+                PostingsListView::Rewritten { input: input.clone(), postings_list }
+            },
+        };
     }
 
     pub fn len(&self) -> usize {
-        self.len
+        match self {
+            PostingsListView::Original { len, .. } => *len,
+            PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
+        }
     }
 
     pub fn input(&self) -> &[u8] {
-        &self.input
+        match self {
+            PostingsListView::Original { ref input, .. } => input,
+            PostingsListView::Rewritten { ref input, .. } => input,
+        }
     }
 
-    pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> {
-        assert!(offset + len <= self.len);
-        PostingsListView {
-            input: self.input.clone(),
-            postings_list: self.postings_list.clone(),
-            offset: self.offset + offset,
-            len: len,
+    pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
+        match self {
+            PostingsListView::Original { input, postings_list, offset, len } => {
+                assert!(range_offset + range_len <= *len);
+                PostingsListView::Original {
+                    input: input.clone(),
+                    postings_list: postings_list.clone(),
+                    offset: offset + range_offset,
+                    len: range_len,
+                }
+            },
+            PostingsListView::Rewritten { .. } => {
+                panic!("Cannot create a range on a rewritten postings list view");
+            }
         }
     }
 }
 
 impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
     fn as_ref(&self) -> &Set<DocIndex> {
-        Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
+        self
     }
 }
 
@@ -282,7 +346,12 @@ impl Deref for PostingsListView<'_> {
     type Target = Set<DocIndex>;
 
     fn deref(&self) -> &Set<DocIndex> {
-        Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
+        match *self {
+            PostingsListView::Original { ref postings_list, offset, len, .. } => {
+                Set::new_unchecked(&postings_list[offset..offset + len])
+            },
+            PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
+        }
     }
 }
 
@@ -335,7 +404,7 @@ fn fetch_matches<'txn, 'tag>(
 
             let input = Rc::from(input);
             let postings_list = Rc::new(postings_list);
-            let postings_list_view = PostingsListView::new(input, postings_list);
+            let postings_list_view = PostingsListView::original(input, postings_list);
 
             let mut offset = 0;
             for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs
index 3bfbe76ea..a82dbf123 100644
--- a/meilisearch-core/src/criterion2.rs
+++ b/meilisearch-core/src/criterion2.rs
@@ -52,38 +52,9 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
     for document in documents {
         if !document.processed_distances.is_empty() { continue }
 
-        // debug!("{:?}", document.raw_matches[0].document_id);
-
         let mut processed = Vec::new();
-        let mut raw_matches = document.raw_matches.iter().peekable();
-        while let Some(m) = raw_matches.next() {
-
-            // let automaton = &automatons[m.query_index as usize];
-
-            // debug!("{:?} {:?}", m, automaton);
-            // debug!("{:?}", &postings_lists[m.postings_list]);
-
-            // match automaton.phrase_query {
-            //     Some((0, len)) => {
-            //         match raw_matches.peek() {
-            //             Some(BareMatch { query_index, .. }) => {
-            //                 if *query_index != m.query_index + 1 {
-            //                     raw_matches.next();
-            //                     continue
-            //                 }
-            //             },
-            //             None => continue,
-            //         }
-            //     },
-            //     Some((_, _)) => continue,
-            //     None => (),
-            // }
-
-            // FIXME we really need to take splitted words into account
-            // those must be seen at the same level as the non-splitteds
-            // if automatons[m.query_index as usize].phrase_query.is_some() {
-            //     continue
-            // }
+        for m in document.raw_matches.iter() {
+            if postings_lists[m.postings_list].is_empty() { continue }
 
             let range = query_enhancer.replacement(m.query_index as u32);
             let new_len = cmp::max(range.end as usize, processed.len());
@@ -99,8 +70,6 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
             }
         }
 
-        // debug!("{:?}", processed);
-
         document.processed_distances = processed;
     }
 }
@@ -444,54 +413,11 @@ impl Criterion for StableDocId {
 }
 
 pub fn multiword_rewrite_matches(
-    simple_matches: &mut [SimpleMatch],
+    matches: &mut [SimpleMatch],
     query_enhancer: &QueryEnhancer,
     automatons: &[QueryWordAutomaton],
 ) -> SetBuf<SimpleMatch>
 {
-    let mut matches = Vec::with_capacity(simple_matches.len());
-
-    // let before_sort = Instant::now();
-    // we sort the matches by word index to make them rewritable
-    simple_matches.sort_unstable_by_key(|m| (m.attribute, m.query_index, m.word_index));
-    // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
-
-    for same_attribute in simple_matches.linear_group_by_key(|m| m.attribute) {
-        let iter = same_attribute.linear_group_by_key(|m| m.query_index);
-        let mut iter = iter.peekable();
-
-        while let Some(same_query_index) = iter.next() {
-            let query_index = same_query_index[0].query_index;
-
-            // TODO we need to support phrase query of longer length
-            if let Some((i, len)) = automatons[query_index as usize].phrase_query {
-                if i != 0 { continue }
-
-                // is the next query_index group the required one
-                if iter.peek().map_or(false, |g| g[0].query_index == query_index + 1) {
-                    if let Some(next) = iter.next() {
-                        for ma in same_query_index {
-                            for mb in next {
-                                if ma.word_index == mb.word_index + 1 {
-                                    matches.push(*ma);
-                                    matches.push(*mb);
-                                }
-                            }
-                        }
-                    }
-                }
-            } else {
-                matches.extend_from_slice(same_query_index);
-            }
-        }
-    }
-
-    // let is_phrase_query = automatons[match_.query_index as usize].phrase_query_len.is_some();
-    // let next_query_index = match_.query_index + 1;
-    // if is_phrase_query && iter.remainder().iter().find(|m| m.query_index == next_query_index).is_none() {
-    //     continue
-    // }
-
     matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
 
     let mut padded_matches = Vec::with_capacity(matches.len());