mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
Rewrite the phrase query postings lists
This simplified the multiword_rewrite_matches function a little bit.
This commit is contained in:
parent
dd03a6256a
commit
8d71112dcb
@ -15,8 +15,9 @@ use levenshtein_automata::DFA;
|
|||||||
use log::debug;
|
use log::debug;
|
||||||
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
||||||
use meilisearch_types::{DocIndex, Highlight};
|
use meilisearch_types::{DocIndex, Highlight};
|
||||||
use sdset::Set;
|
use sdset::{Set, SetBuf};
|
||||||
use slice_group_by::{GroupBy, GroupByMut};
|
use slice_group_by::{GroupBy, GroupByMut};
|
||||||
|
use itertools::EitherOrBoth;
|
||||||
|
|
||||||
use crate::automaton::NGRAMS;
|
use crate::automaton::NGRAMS;
|
||||||
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
|
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
|
||||||
@ -61,7 +62,7 @@ pub fn bucket_sort<'c>(
|
|||||||
let mut raw_documents = Vec::new();
|
let mut raw_documents = Vec::new();
|
||||||
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||||
prefiltered_documents += 1;
|
prefiltered_documents += 1;
|
||||||
if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) {
|
if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
|
||||||
raw_documents.push(raw_document);
|
raw_documents.push(raw_document);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -78,7 +79,7 @@ pub fn bucket_sort<'c>(
|
|||||||
|
|
||||||
let criteria = [
|
let criteria = [
|
||||||
Box::new(Typo) as Box<dyn Criterion>,
|
Box::new(Typo) as Box<dyn Criterion>,
|
||||||
Box::new(Words) as Box<dyn Criterion>,
|
Box::new(Words),
|
||||||
Box::new(Proximity),
|
Box::new(Proximity),
|
||||||
Box::new(Attribute),
|
Box::new(Attribute),
|
||||||
Box::new(WordsPosition),
|
Box::new(WordsPosition),
|
||||||
@ -154,13 +155,11 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
|
|||||||
fn new<'txn>(
|
fn new<'txn>(
|
||||||
raw_matches: &'a mut [BareMatch<'tag>],
|
raw_matches: &'a mut [BareMatch<'tag>],
|
||||||
automatons: &[QueryWordAutomaton],
|
automatons: &[QueryWordAutomaton],
|
||||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
) -> Option<RawDocument<'a, 'tag>>
|
) -> Option<RawDocument<'a, 'tag>>
|
||||||
{
|
{
|
||||||
raw_matches.sort_unstable_by_key(|m| m.query_index);
|
raw_matches.sort_unstable_by_key(|m| m.query_index);
|
||||||
|
|
||||||
// debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches);
|
|
||||||
|
|
||||||
let mut previous_word = None;
|
let mut previous_word = None;
|
||||||
for i in 0..raw_matches.len() {
|
for i in 0..raw_matches.len() {
|
||||||
let a = &raw_matches[i];
|
let a = &raw_matches[i];
|
||||||
@ -168,10 +167,17 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
|
|||||||
|
|
||||||
match auta.phrase_query {
|
match auta.phrase_query {
|
||||||
Some((0, _)) => {
|
Some((0, _)) => {
|
||||||
previous_word = Some(a.query_index);
|
let b = match raw_matches.get(i + 1) {
|
||||||
let b = raw_matches.get(i + 1)?;
|
Some(b) => b,
|
||||||
|
None => {
|
||||||
|
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
if a.query_index + 1 != b.query_index {
|
if a.query_index + 1 != b.query_index {
|
||||||
return None;
|
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
let pla = &postings_lists[a.postings_list];
|
let pla = &postings_lists[a.postings_list];
|
||||||
@ -181,11 +187,31 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
|
|||||||
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
|
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
|
||||||
});
|
});
|
||||||
|
|
||||||
if !iter.any(|eb| eb.is_both()) { return None }
|
let mut newa = Vec::new();
|
||||||
|
let mut newb = Vec::new();
|
||||||
|
|
||||||
|
for eb in iter {
|
||||||
|
if let EitherOrBoth::Both(a, b) = eb {
|
||||||
|
newa.push(*a);
|
||||||
|
newb.push(*b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if !newa.is_empty() {
|
||||||
|
previous_word = Some(a.query_index);
|
||||||
|
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
|
||||||
|
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// TODO use SetBuf::default when merged
|
||||||
|
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
|
||||||
|
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
|
||||||
|
}
|
||||||
},
|
},
|
||||||
Some((1, _)) => {
|
Some((1, _)) => {
|
||||||
if previous_word.take() != Some(a.query_index - 1) {
|
if previous_word.take() != Some(a.query_index - 1) {
|
||||||
return None;
|
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
Some((_, _)) => unreachable!(),
|
Some((_, _)) => unreachable!(),
|
||||||
@ -193,6 +219,10 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
|
||||||
|
return None
|
||||||
|
}
|
||||||
|
|
||||||
Some(RawDocument {
|
Some(RawDocument {
|
||||||
raw_matches,
|
raw_matches,
|
||||||
processed_matches: Vec::new(),
|
processed_matches: Vec::new(),
|
||||||
@ -231,50 +261,84 @@ pub struct SimpleMatch {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct PostingsListView<'txn> {
|
pub enum PostingsListView<'txn> {
|
||||||
|
Original {
|
||||||
input: Rc<[u8]>,
|
input: Rc<[u8]>,
|
||||||
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
|
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
|
||||||
offset: usize,
|
offset: usize,
|
||||||
len: usize,
|
len: usize,
|
||||||
|
},
|
||||||
|
Rewritten {
|
||||||
|
input: Rc<[u8]>,
|
||||||
|
postings_list: SetBuf<DocIndex>,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Debug for PostingsListView<'_> {
|
impl fmt::Debug for PostingsListView<'_> {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
f.debug_struct("PostingsListView")
|
f.debug_struct("PostingsListView")
|
||||||
.field("input", &std::str::from_utf8(&self.input).unwrap())
|
.field("input", &std::str::from_utf8(&self.input()).unwrap())
|
||||||
.field("postings_list", &self.as_ref())
|
.field("postings_list", &self.as_ref())
|
||||||
.finish()
|
.finish()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'txn> PostingsListView<'txn> {
|
impl<'txn> PostingsListView<'txn> {
|
||||||
pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
|
pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
|
||||||
let len = postings_list.len();
|
let len = postings_list.len();
|
||||||
PostingsListView { input, postings_list, offset: 0, len }
|
PostingsListView::Original { input, postings_list, offset: 0, len }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
|
||||||
|
PostingsListView::Rewritten { input, postings_list }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
|
||||||
|
*self = match self {
|
||||||
|
PostingsListView::Original { input, .. } => {
|
||||||
|
PostingsListView::Rewritten { input: input.clone(), postings_list }
|
||||||
|
},
|
||||||
|
PostingsListView::Rewritten { input, .. } => {
|
||||||
|
PostingsListView::Rewritten { input: input.clone(), postings_list }
|
||||||
|
},
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
pub fn len(&self) -> usize {
|
||||||
self.len
|
match self {
|
||||||
|
PostingsListView::Original { len, .. } => *len,
|
||||||
|
PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn input(&self) -> &[u8] {
|
pub fn input(&self) -> &[u8] {
|
||||||
&self.input
|
match self {
|
||||||
|
PostingsListView::Original { ref input, .. } => input,
|
||||||
|
PostingsListView::Rewritten { ref input, .. } => input,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> {
|
pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
|
||||||
assert!(offset + len <= self.len);
|
match self {
|
||||||
PostingsListView {
|
PostingsListView::Original { input, postings_list, offset, len } => {
|
||||||
input: self.input.clone(),
|
assert!(range_offset + range_len <= *len);
|
||||||
postings_list: self.postings_list.clone(),
|
PostingsListView::Original {
|
||||||
offset: self.offset + offset,
|
input: input.clone(),
|
||||||
len: len,
|
postings_list: postings_list.clone(),
|
||||||
|
offset: offset + range_offset,
|
||||||
|
len: range_len,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
PostingsListView::Rewritten { .. } => {
|
||||||
|
panic!("Cannot create a range on a rewritten postings list view");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
|
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
|
||||||
fn as_ref(&self) -> &Set<DocIndex> {
|
fn as_ref(&self) -> &Set<DocIndex> {
|
||||||
Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
|
self
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -282,7 +346,12 @@ impl Deref for PostingsListView<'_> {
|
|||||||
type Target = Set<DocIndex>;
|
type Target = Set<DocIndex>;
|
||||||
|
|
||||||
fn deref(&self) -> &Set<DocIndex> {
|
fn deref(&self) -> &Set<DocIndex> {
|
||||||
Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
|
match *self {
|
||||||
|
PostingsListView::Original { ref postings_list, offset, len, .. } => {
|
||||||
|
Set::new_unchecked(&postings_list[offset..offset + len])
|
||||||
|
},
|
||||||
|
PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -335,7 +404,7 @@ fn fetch_matches<'txn, 'tag>(
|
|||||||
|
|
||||||
let input = Rc::from(input);
|
let input = Rc::from(input);
|
||||||
let postings_list = Rc::new(postings_list);
|
let postings_list = Rc::new(postings_list);
|
||||||
let postings_list_view = PostingsListView::new(input, postings_list);
|
let postings_list_view = PostingsListView::original(input, postings_list);
|
||||||
|
|
||||||
let mut offset = 0;
|
let mut offset = 0;
|
||||||
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
|
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
|
||||||
|
@ -52,38 +52,9 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
|
|||||||
for document in documents {
|
for document in documents {
|
||||||
if !document.processed_distances.is_empty() { continue }
|
if !document.processed_distances.is_empty() { continue }
|
||||||
|
|
||||||
// debug!("{:?}", document.raw_matches[0].document_id);
|
|
||||||
|
|
||||||
let mut processed = Vec::new();
|
let mut processed = Vec::new();
|
||||||
let mut raw_matches = document.raw_matches.iter().peekable();
|
for m in document.raw_matches.iter() {
|
||||||
while let Some(m) = raw_matches.next() {
|
if postings_lists[m.postings_list].is_empty() { continue }
|
||||||
|
|
||||||
// let automaton = &automatons[m.query_index as usize];
|
|
||||||
|
|
||||||
// debug!("{:?} {:?}", m, automaton);
|
|
||||||
// debug!("{:?}", &postings_lists[m.postings_list]);
|
|
||||||
|
|
||||||
// match automaton.phrase_query {
|
|
||||||
// Some((0, len)) => {
|
|
||||||
// match raw_matches.peek() {
|
|
||||||
// Some(BareMatch { query_index, .. }) => {
|
|
||||||
// if *query_index != m.query_index + 1 {
|
|
||||||
// raw_matches.next();
|
|
||||||
// continue
|
|
||||||
// }
|
|
||||||
// },
|
|
||||||
// None => continue,
|
|
||||||
// }
|
|
||||||
// },
|
|
||||||
// Some((_, _)) => continue,
|
|
||||||
// None => (),
|
|
||||||
// }
|
|
||||||
|
|
||||||
// FIXME we really need to take splitted words into account
|
|
||||||
// those must be seen at the same level as the non-splitteds
|
|
||||||
// if automatons[m.query_index as usize].phrase_query.is_some() {
|
|
||||||
// continue
|
|
||||||
// }
|
|
||||||
|
|
||||||
let range = query_enhancer.replacement(m.query_index as u32);
|
let range = query_enhancer.replacement(m.query_index as u32);
|
||||||
let new_len = cmp::max(range.end as usize, processed.len());
|
let new_len = cmp::max(range.end as usize, processed.len());
|
||||||
@ -99,8 +70,6 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// debug!("{:?}", processed);
|
|
||||||
|
|
||||||
document.processed_distances = processed;
|
document.processed_distances = processed;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -444,54 +413,11 @@ impl Criterion for StableDocId {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn multiword_rewrite_matches(
|
pub fn multiword_rewrite_matches(
|
||||||
simple_matches: &mut [SimpleMatch],
|
matches: &mut [SimpleMatch],
|
||||||
query_enhancer: &QueryEnhancer,
|
query_enhancer: &QueryEnhancer,
|
||||||
automatons: &[QueryWordAutomaton],
|
automatons: &[QueryWordAutomaton],
|
||||||
) -> SetBuf<SimpleMatch>
|
) -> SetBuf<SimpleMatch>
|
||||||
{
|
{
|
||||||
let mut matches = Vec::with_capacity(simple_matches.len());
|
|
||||||
|
|
||||||
// let before_sort = Instant::now();
|
|
||||||
// we sort the matches by word index to make them rewritable
|
|
||||||
simple_matches.sort_unstable_by_key(|m| (m.attribute, m.query_index, m.word_index));
|
|
||||||
// debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
|
|
||||||
|
|
||||||
for same_attribute in simple_matches.linear_group_by_key(|m| m.attribute) {
|
|
||||||
let iter = same_attribute.linear_group_by_key(|m| m.query_index);
|
|
||||||
let mut iter = iter.peekable();
|
|
||||||
|
|
||||||
while let Some(same_query_index) = iter.next() {
|
|
||||||
let query_index = same_query_index[0].query_index;
|
|
||||||
|
|
||||||
// TODO we need to support phrase query of longer length
|
|
||||||
if let Some((i, len)) = automatons[query_index as usize].phrase_query {
|
|
||||||
if i != 0 { continue }
|
|
||||||
|
|
||||||
// is the next query_index group the required one
|
|
||||||
if iter.peek().map_or(false, |g| g[0].query_index == query_index + 1) {
|
|
||||||
if let Some(next) = iter.next() {
|
|
||||||
for ma in same_query_index {
|
|
||||||
for mb in next {
|
|
||||||
if ma.word_index == mb.word_index + 1 {
|
|
||||||
matches.push(*ma);
|
|
||||||
matches.push(*mb);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
matches.extend_from_slice(same_query_index);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// let is_phrase_query = automatons[match_.query_index as usize].phrase_query_len.is_some();
|
|
||||||
// let next_query_index = match_.query_index + 1;
|
|
||||||
// if is_phrase_query && iter.remainder().iter().find(|m| m.query_index == next_query_index).is_none() {
|
|
||||||
// continue
|
|
||||||
// }
|
|
||||||
|
|
||||||
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
||||||
|
|
||||||
let mut padded_matches = Vec::with_capacity(matches.len());
|
let mut padded_matches = Vec::with_capacity(matches.len());
|
||||||
|
Loading…
Reference in New Issue
Block a user