Merge pull request #66 from Kerollmops/revert-precompute-query-index-groups

Revert precompute query index groups
2025-07-04 04:17:10 +02:00 · 2019-01-06 22:38:44 +01:00 · 2019-01-06 22:38:44 +01:00 · a2f5e8aa25
commit a2f5e8aa25
parent 023f62b0ce f00b978801
9 changed files with 68 additions and 150 deletions
--- a/examples/query-database.rs
+++ b/examples/query-database.rs
@ -116,7 +116,7 @@ fn main() -> Result<(), Box<Error>> {
                        };

                        print!("{}: ", name);
-                        let areas = create_highlight_areas(&text, doc.matches.as_matches(), attr);
+                        let areas = create_highlight_areas(&text, &doc.matches, attr);
                        display_highlights(&text, &areas)?;
                        println!();
                    }
@ -125,7 +125,7 @@ fn main() -> Result<(), Box<Error>> {
            }

            let mut matching_attributes = HashSet::new();
-            for _match in doc.matches.as_matches() {
+            for _match in doc.matches {
                let attr = SchemaAttr::new(_match.attribute.attribute());
                let name = schema.attribute_name(attr);
                matching_attributes.insert(name);
--- a/src/rank/criterion/exact.rs
+++ b/src/rank/criterion/exact.rs
@ -2,8 +2,9 @@ use std::cmp::Ordering;
 use std::ops::Deref;

 use rocksdb::DB;
+use group_by::GroupBy;

-use crate::rank::{Document, Matches};
+use crate::rank::{match_query_index, Document};
 use crate::rank::criterion::Criterion;
 use crate::database::DatabaseView;
 use crate::Match;
@ -14,8 +15,8 @@ fn contains_exact(matches: &&[Match]) -> bool {
 }

 #[inline]
-fn number_exact_matches(matches: &Matches) -> usize {
-    matches.query_index_groups().filter(contains_exact).count()
+fn number_exact_matches(matches: &[Match]) -> usize {
+    GroupBy::new(matches, match_query_index).filter(contains_exact).count()
 }

 #[derive(Debug, Clone, Copy)]
--- a/src/rank/criterion/number_of_words.rs
+++ b/src/rank/criterion/number_of_words.rs
@ -2,14 +2,16 @@ use std::cmp::Ordering;
 use std::ops::Deref;

 use rocksdb::DB;
+use group_by::GroupBy;

-use crate::rank::{Document, Matches};
+use crate::rank::{match_query_index, Document};
 use crate::rank::criterion::Criterion;
 use crate::database::DatabaseView;
+use crate::Match;

 #[inline]
-fn number_of_query_words(matches: &Matches) -> usize {
-    matches.query_index_groups().count()
+fn number_of_query_words(matches: &[Match]) -> usize {
+    GroupBy::new(matches, match_query_index).count()
 }

 #[derive(Debug, Clone, Copy)]
--- a/src/rank/criterion/sum_of_typos.rs
+++ b/src/rank/criterion/sum_of_typos.rs
@ -3,19 +3,22 @@ use std::ops::Deref;

 use rocksdb::DB;

-use crate::rank::{Document, Matches};
+use group_by::GroupBy;
+
+use crate::rank::{match_query_index, Document};
 use crate::rank::criterion::Criterion;
 use crate::database::DatabaseView;
+use crate::Match;

 #[inline]
-fn sum_matches_typos(matches: &Matches) -> i8 {
+fn sum_matches_typos(matches: &[Match]) -> isize {
    let mut sum_typos = 0;
    let mut number_words = 0;

    // note that GroupBy will never return an empty group
    // so we can do this assumption safely
-    for group in matches.query_index_groups() {
-        sum_typos += unsafe { group.get_unchecked(0).distance } as i8;
+    for group in GroupBy::new(matches, match_query_index) {
+        sum_typos += unsafe { group.get_unchecked(0).distance } as isize;
        number_words += 1;
    }

@ -41,7 +44,7 @@ where D: Deref<Target=DB>
 mod tests {
    use super::*;

-    use crate::{Match, DocumentId, Attribute, WordArea};
+    use crate::{DocumentId, Attribute, WordArea};

    // typing: "Geox CEO"
    //
@ -66,7 +69,10 @@ mod tests {
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
-            Document::from_unsorted_matches(DocumentId(0), matches)
+            Document {
+                id: DocumentId(0),
+                matches: matches,
+            }
        };

        let doc1 = {
@ -86,7 +92,10 @@ mod tests {
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
-            Document::from_unsorted_matches(DocumentId(1), matches)
+            Document {
+                id: DocumentId(1),
+                matches: matches,
+            }
        };

        let lhs = sum_matches_typos(&doc0.matches);
@ -117,7 +126,10 @@ mod tests {
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
-            Document::from_unsorted_matches(DocumentId(0), matches)
+            Document {
+                id: DocumentId(0),
+                matches: matches,
+            }
        };

        let doc1 = {
@ -130,7 +142,10 @@ mod tests {
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
-            Document::from_unsorted_matches(DocumentId(1), matches)
+            Document {
+                id: DocumentId(1),
+                matches: matches,
+            }
        };

        let lhs = sum_matches_typos(&doc0.matches);
@ -161,7 +176,10 @@ mod tests {
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
-            Document::from_unsorted_matches(DocumentId(0), matches)
+            Document {
+                id: DocumentId(0),
+                matches: matches,
+            }
        };

        let doc1 = {
@ -174,7 +192,10 @@ mod tests {
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
-            Document::from_unsorted_matches(DocumentId(1), matches)
+            Document {
+                id: DocumentId(1),
+                matches: matches,
+            }
        };

        let lhs = sum_matches_typos(&doc0.matches);
--- a/src/rank/criterion/sum_of_words_attribute.rs
+++ b/src/rank/criterion/sum_of_words_attribute.rs
@ -2,16 +2,18 @@ use std::cmp::Ordering;
 use std::ops::Deref;

 use rocksdb::DB;
+use group_by::GroupBy;

-use crate::rank::{Document, Matches};
 use crate::database::DatabaseView;
+use crate::rank::{match_query_index, Document};
 use crate::rank::criterion::Criterion;
+use crate::Match;

 #[inline]
-fn sum_matches_attributes(matches: &Matches) -> usize {
+fn sum_matches_attributes(matches: &[Match]) -> usize {
    // note that GroupBy will never return an empty group
    // so we can do this assumption safely
-    matches.query_index_groups().map(|group| {
+    GroupBy::new(matches, match_query_index).map(|group| {
        unsafe { group.get_unchecked(0).attribute.attribute() as usize }
    }).sum()
 }
--- a/src/rank/criterion/sum_of_words_position.rs
+++ b/src/rank/criterion/sum_of_words_position.rs
@ -2,16 +2,18 @@ use std::cmp::Ordering;
 use std::ops::Deref;

 use rocksdb::DB;
+use group_by::GroupBy;

-use crate::rank::{Document, Matches};
-use crate::rank::criterion::Criterion;
 use crate::database::DatabaseView;
+use crate::rank::{match_query_index, Document};
+use crate::rank::criterion::Criterion;
+use crate::Match;

 #[inline]
-fn sum_matches_attribute_index(matches: &Matches) -> usize {
+fn sum_matches_attribute_index(matches: &[Match]) -> usize {
    // note that GroupBy will never return an empty group
    // so we can do this assumption safely
-    matches.query_index_groups().map(|group| {
+    GroupBy::new(matches, match_query_index).map(|group| {
        unsafe { group.get_unchecked(0).attribute.word_index() as usize }
    }).sum()
 }
--- a/src/rank/criterion/words_proximity.rs
+++ b/src/rank/criterion/words_proximity.rs
@ -2,8 +2,9 @@ use std::cmp::{self, Ordering};
 use std::ops::Deref;

 use rocksdb::DB;
+use group_by::GroupBy;

-use crate::rank::{Document, Matches};
+use crate::rank::{match_query_index, Document};
 use crate::rank::criterion::Criterion;
 use crate::database::DatabaseView;
 use crate::Match;
@ -33,9 +34,9 @@ fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
    min_prox
 }

-fn matches_proximity(matches: &Matches) -> u32 {
+fn matches_proximity(matches: &[Match]) -> u32 {
    let mut proximity = 0;
-    let mut iter = matches.query_index_groups();
+    let mut iter = GroupBy::new(matches, match_query_index);

    // iterate over groups by windows of size 2
    let mut last = iter.next();
@ -90,8 +91,7 @@ mod tests {
        //   soup -> of = 8
        // + of -> the  = 1
        // + the -> day = 8 (not 1)
-        let matches = Matches::from_unsorted(matches.to_vec());
-        assert_eq!(matches_proximity(&matches), 17);
+        assert_eq!(matches_proximity(matches), 17);
    }

    #[test]
@ -118,8 +118,7 @@ mod tests {
        //   soup -> of = 1
        // + of -> the  = 1
        // + the -> day = 1
-        let matches = Matches::from_unsorted(matches.to_vec());
-        assert_eq!(matches_proximity(&matches), 3);
+        assert_eq!(matches_proximity(matches), 3);
    }
 }

@ -153,8 +152,6 @@ mod bench {
            matches.push(match_);
        }

-        let matches = Matches::from_unsorted(matches.to_vec());
-
        bench.iter(|| {
            let proximity = matches_proximity(&matches);
            test::black_box(move || proximity)
--- a/src/rank/mod.rs
+++ b/src/rank/mod.rs
@ -2,13 +2,6 @@ pub mod criterion;
 mod query_builder;
 mod distinct_map;

-use std::iter::FusedIterator;
-use std::slice::Iter;
-use std::ops::Range;
-
-use sdset::SetBuf;
-use group_by::GroupBy;
-
 use crate::{Match, DocumentId};

 pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
@ -21,120 +14,20 @@ fn match_query_index(a: &Match, b: &Match) -> bool {
 #[derive(Debug, Clone)]
 pub struct Document {
    pub id: DocumentId,
-    pub matches: Matches,
+    pub matches: Vec<Match>,
 }

 impl Document {
-    pub fn new(id: DocumentId, match_: Match) -> Self {
-        let matches = SetBuf::new_unchecked(vec![match_]);
-        Self::from_matches(id, matches)
+    pub fn new(doc: DocumentId, match_: Match) -> Self {
+        unsafe { Self::from_sorted_matches(doc, vec![match_]) }
    }

-    pub fn from_matches(id: DocumentId, matches: SetBuf<Match>) -> Self {
-        let matches = Matches::new(matches);
-        Self { id, matches }
-    }
-
-    pub fn from_unsorted_matches(id: DocumentId, matches: Vec<Match>) -> Self {
-        let matches = Matches::from_unsorted(matches);
-        Self { id, matches }
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct Matches {
-    matches: SetBuf<Match>,
-    slices: Vec<Range<usize>>,
-}
-
-impl Matches {
-    pub fn new(matches: SetBuf<Match>) -> Matches {
-        let mut last_end = 0;
-        let mut slices = Vec::new();
-
-        for group in GroupBy::new(&matches, match_query_index) {
-            let start = last_end;
-            let end = last_end + group.len();
-            slices.push(Range { start, end });
-            last_end = end;
-        }
-
-        Matches { matches, slices }
-    }
-
-    pub fn from_unsorted(mut matches: Vec<Match>) -> Matches {
+    pub fn from_matches(doc: DocumentId, mut matches: Vec<Match>) -> Self {
        matches.sort_unstable();
-        let matches = SetBuf::new_unchecked(matches);
-        Matches::new(matches)
+        unsafe { Self::from_sorted_matches(doc, matches) }
    }

-    pub fn query_index_groups(&self) -> QueryIndexGroups {
-        QueryIndexGroups {
-            matches: &self.matches,
-            slices: self.slices.iter(),
-        }
-    }
-
-    pub fn as_matches(&self) -> &[Match] {
-        &self.matches
-    }
-}
-
-pub struct QueryIndexGroups<'a, 'b> {
-    matches: &'a [Match],
-    slices: Iter<'b, Range<usize>>,
-}
-
-impl<'a> Iterator for QueryIndexGroups<'a, '_> {
-    type Item = &'a [Match];
-
-    #[inline]
-    fn next(&mut self) -> Option<Self::Item> {
-        self.slices.next().cloned().map(|range| {
-            unsafe { self.matches.get_unchecked(range) }
-        })
-    }
-
-    #[inline]
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.slices.size_hint()
-    }
-
-    #[inline]
-    fn count(self) -> usize {
-        self.len()
-    }
-
-    #[inline]
-    fn nth(&mut self, n: usize) -> Option<Self::Item> {
-        self.slices.nth(n).cloned().map(|range| {
-            unsafe { self.matches.get_unchecked(range) }
-        })
-    }
-
-    #[inline]
-    fn last(self) -> Option<Self::Item> {
-        let (matches, slices) = (self.matches, self.slices);
-        slices.last().cloned().map(|range| {
-            unsafe { matches.get_unchecked(range) }
-        })
-    }
-}
-
-impl ExactSizeIterator for QueryIndexGroups<'_, '_> {
-    #[inline]
-    fn len(&self) -> usize {
-        self.slices.len()
-    }
-}
-
-impl FusedIterator for QueryIndexGroups<'_, '_> { }
-
-impl DoubleEndedIterator for QueryIndexGroups<'_, '_> {
-    #[inline]
-    fn next_back(&mut self) -> Option<Self::Item> {
-        self.slices.next_back().cloned().map(|range| {
-            unsafe { self.matches.get_unchecked(range) }
-        })
+    pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec<Match>) -> Self {
+        Self { id, matches }
    }
 }
--- a/src/rank/query_builder.rs
+++ b/src/rank/query_builder.rs
@ -119,7 +119,7 @@ where D: Deref<Target=DB>,

        info!("{} documents to classify", matches.len());

-        matches.into_iter().map(|(i, m)| Document::from_unsorted_matches(i, m)).collect()
+        matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect()
    }
 }