Rewrite the phrase query postings lists

This simplifies the `multiword_rewrite_matches` function a little bit.
2025-07-04 04:17:10 +02:00 · 2019-12-10 12:19:38 +01:00 · 2019-12-10 12:19:38 +01:00 · 8d71112dcb
commit 8d71112dcb
parent dd03a6256a
2 changed files with 103 additions and 108 deletions
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@ -15,8 +15,9 @@ use levenshtein_automata::DFA;
 use log::debug;
 use meilisearch_tokenizer::{is_cjk, split_query_string};
 use meilisearch_types::{DocIndex, Highlight};
-use sdset::Set;
+use sdset::{Set, SetBuf};
 use slice_group_by::{GroupBy, GroupByMut};
 use itertools::EitherOrBoth;
 use crate::automaton::NGRAMS;
 use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
@ -61,7 +62,7 @@ pub fn bucket_sort<'c>(
    let mut raw_documents = Vec::new();
    for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
        prefiltered_documents += 1;
-        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) {
+        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
            raw_documents.push(raw_document);
        }
    }
@ -78,7 +79,7 @@ pub fn bucket_sort<'c>(
    let criteria = [
        Box::new(Typo) as Box<dyn Criterion>,
-        Box::new(Words) as Box<dyn Criterion>,
+        Box::new(Words),
        Box::new(Proximity),
        Box::new(Attribute),
        Box::new(WordsPosition),
@ -154,13 +155,11 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
    fn new<'txn>(
        raw_matches: &'a mut [BareMatch<'tag>],
        automatons: &[QueryWordAutomaton],
-        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
+        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
    ) -> Option<RawDocument<'a, 'tag>>
    {
        raw_matches.sort_unstable_by_key(|m| m.query_index);
        // debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches);
        let mut previous_word = None;
        for i in 0..raw_matches.len() {
            let a = &raw_matches[i];
@ -168,10 +167,17 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
            match auta.phrase_query {
                Some((0, _)) => {
-                    previous_word = Some(a.query_index);
+                    let b = match raw_matches.get(i + 1) {
-                    let b = raw_matches.get(i + 1)?;
+                        Some(b) => b,
                        None => {
                            postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
                            continue;
                        }
                    };
                    if a.query_index + 1 != b.query_index {
-                        return None;
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
                        continue
                    }
                    let pla = &postings_lists[a.postings_list];
@ -181,11 +187,31 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
                        a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
                    });
-                    if !iter.any(|eb| eb.is_both()) { return None }
+                    let mut newa = Vec::new();
                    let mut newb = Vec::new();
                    for eb in iter {
                        if let EitherOrBoth::Both(a, b) = eb {
                            newa.push(*a);
                            newb.push(*b);
                        }
                    }
                    if !newa.is_empty() {
                        previous_word = Some(a.query_index);
                        postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
                        postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
                    } else {
                        // TODO use SetBuf::default when merged
                        postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
                        postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
                    }
                },
                Some((1, _)) => {
                    if previous_word.take() != Some(a.query_index - 1) {
-                        return None;
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new()));
                    }
                },
                Some((_, _)) => unreachable!(),
@ -193,6 +219,10 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
            }
        }
        if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
            return None
        }
        Some(RawDocument {
            raw_matches,
            processed_matches: Vec::new(),
@ -231,50 +261,84 @@ pub struct SimpleMatch {
 }
 #[derive(Clone)]
-pub struct PostingsListView<'txn> {
+pub enum PostingsListView<'txn> {
    Original {
        input: Rc<[u8]>,
        postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
        offset: usize,
        len: usize,
    },
    Rewritten {
        input: Rc<[u8]>,
        postings_list: SetBuf<DocIndex>,
    },
 }
 impl fmt::Debug for PostingsListView<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("PostingsListView")
-            .field("input", &std::str::from_utf8(&self.input).unwrap())
+            .field("input", &std::str::from_utf8(&self.input()).unwrap())
            .field("postings_list", &self.as_ref())
            .finish()
    }
 }
 impl<'txn> PostingsListView<'txn> {
-    pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
+    pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
        let len = postings_list.len();
-        PostingsListView { input, postings_list, offset: 0, len }
+        PostingsListView::Original { input, postings_list, offset: 0, len }
    }
    pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
        PostingsListView::Rewritten { input, postings_list }
    }
    pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
        *self = match self {
            PostingsListView::Original { input, .. } => {
                PostingsListView::Rewritten { input: input.clone(), postings_list }
            },
            PostingsListView::Rewritten { input, .. } => {
                PostingsListView::Rewritten { input: input.clone(), postings_list }
            },
        };
    }
    pub fn len(&self) -> usize {
-        self.len
+        match self {
            PostingsListView::Original { len, .. } => *len,
            PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
        }
    }
    pub fn input(&self) -> &[u8] {
-        &self.input
+        match self {
            PostingsListView::Original { ref input, .. } => input,
            PostingsListView::Rewritten { ref input, .. } => input,
        }
    }
-    pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> {
+    pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
-        assert!(offset + len <= self.len);
+        match self {
-        PostingsListView {
+            PostingsListView::Original { input, postings_list, offset, len } => {
-            input: self.input.clone(),
+                assert!(range_offset + range_len <= *len);
-            postings_list: self.postings_list.clone(),
+                PostingsListView::Original {
-            offset: self.offset + offset,
+                    input: input.clone(),
-            len: len,
+                    postings_list: postings_list.clone(),
                    offset: offset + range_offset,
                    len: range_len,
                }
            },
            PostingsListView::Rewritten { .. } => {
                panic!("Cannot create a range on a rewritten postings list view");
            }
        }
    }
 }
 impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
    fn as_ref(&self) -> &Set<DocIndex> {
-        Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
+        self
    }
 }
@ -282,7 +346,12 @@ impl Deref for PostingsListView<'_> {
    type Target = Set<DocIndex>;
    fn deref(&self) -> &Set<DocIndex> {
-        Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
+        match *self {
            PostingsListView::Original { ref postings_list, offset, len, .. } => {
                Set::new_unchecked(&postings_list[offset..offset + len])
            },
            PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
        }
    }
 }
@ -335,7 +404,7 @@ fn fetch_matches<'txn, 'tag>(
                let input = Rc::from(input);
                let postings_list = Rc::new(postings_list);
-                let postings_list_view = PostingsListView::new(input, postings_list);
+                let postings_list_view = PostingsListView::original(input, postings_list);
                let mut offset = 0;
                for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
--- a/meilisearch-core/src/criterion2.rs
+++ b/meilisearch-core/src/criterion2.rs
@ -52,38 +52,9 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
    for document in documents {
        if !document.processed_distances.is_empty() { continue }
        // debug!("{:?}", document.raw_matches[0].document_id);
        let mut processed = Vec::new();
-        let mut raw_matches = document.raw_matches.iter().peekable();
+        for m in document.raw_matches.iter() {
-        while let Some(m) = raw_matches.next() {
+            if postings_lists[m.postings_list].is_empty() { continue }
            // let automaton = &automatons[m.query_index as usize];
            // debug!("{:?} {:?}", m, automaton);
            // debug!("{:?}", &postings_lists[m.postings_list]);
            // match automaton.phrase_query {
            //     Some((0, len)) => {
            //         match raw_matches.peek() {
            //             Some(BareMatch { query_index, .. }) => {
            //                 if *query_index != m.query_index + 1 {
            //                     raw_matches.next();
            //                     continue
            //                 }
            //             },
            //             None => continue,
            //         }
            //     },
            //     Some((_, _)) => continue,
            //     None => (),
            // }
            // FIXME we really need to take splitted words into account
            //       those must be seen at the same level as the non-splitteds
            // if automatons[m.query_index as usize].phrase_query.is_some() {
            //     continue
            // }
            let range = query_enhancer.replacement(m.query_index as u32);
            let new_len = cmp::max(range.end as usize, processed.len());
@ -99,8 +70,6 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
            }
        }
        // debug!("{:?}", processed);
        document.processed_distances = processed;
    }
 }
@ -444,54 +413,11 @@ impl Criterion for StableDocId {
 }
 pub fn multiword_rewrite_matches(
-    simple_matches: &mut [SimpleMatch],
+    matches: &mut [SimpleMatch],
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
 ) -> SetBuf<SimpleMatch>
 {
    let mut matches = Vec::with_capacity(simple_matches.len());
    // let before_sort = Instant::now();
    // we sort the matches by word index to make them rewritable
    simple_matches.sort_unstable_by_key(|m| (m.attribute, m.query_index, m.word_index));
    // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
    for same_attribute in simple_matches.linear_group_by_key(|m| m.attribute) {
        let iter = same_attribute.linear_group_by_key(|m| m.query_index);
        let mut iter = iter.peekable();
        while let Some(same_query_index) = iter.next() {
            let query_index = same_query_index[0].query_index;
            // TODO we need to support phrase query of longer length
            if let Some((i, len)) = automatons[query_index as usize].phrase_query {
                if i != 0 { continue }
                // is the next query_index group the required one
                if iter.peek().map_or(false, |g| g[0].query_index == query_index + 1) {
                    if let Some(next) = iter.next() {
                        for ma in same_query_index {
                            for mb in next {
                                if ma.word_index == mb.word_index + 1 {
                                    matches.push(*ma);
                                    matches.push(*mb);
                                }
                            }
                        }
                    }
                }
            } else {
                matches.extend_from_slice(same_query_index);
            }
        }
    }
    // let is_phrase_query = automatons[match_.query_index as usize].phrase_query_len.is_some();
    // let next_query_index = match_.query_index + 1;
    // if is_phrase_query && iter.remainder().iter().find(|m| m.query_index == next_query_index).is_none() {
    //     continue
    // }
    matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
    let mut padded_matches = Vec::with_capacity(matches.len());