feat: Introduce the Highlight type to simplify the data oriented design

2025-06-30 02:18:31 +02:00 · 2019-06-25 12:27:15 +02:00 · 2019-06-25 12:27:15 +02:00 · 6b6db2f8e6
commit 6b6db2f8e6
parent b7ed22bc59
2 changed files with 52 additions and 123 deletions
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@ -60,97 +60,43 @@ pub struct DocIndex {
 ///
 /// The order of the field is important because it defines
 /// the way these structures are ordered between themselves.
-///
-/// The word in itself is not important.
-// TODO do data oriented programming ? very arrays ?
 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Match {
-    /// The word index in the query sentence.
-    /// Same as the `attribute_index` but for the query words.
-    ///
-    /// Used to retrieve the automaton that match this word.
-    pub query_index: u32,
-
-    /// The distance the word has with the query word
-    /// (i.e. the Levenshtein distance).
-    pub distance: u8,
-
+pub struct Highlight {
    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
-    pub word_index: u16,

-    /// Whether the word that match is an exact match or a prefix.
-    pub is_exact: bool,
-
-    /// The position in bytes where the word was found
-    /// along with the length of it.
+    /// The position in bytes where the word was found.
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub char_index: u16,
+
+    /// The length in bytes of the found word.
+    ///
+    /// Together with `char_index`, it gives the area of the original word
+    /// in the indexed text without needing to run the tokenizer again.
    pub char_length: u16,
 }

-impl Match {
-    pub fn zero() -> Self {
-        Match {
-            query_index: 0,
-            distance: 0,
-            attribute: 0,
-            word_index: 0,
-            is_exact: false,
-            char_index: 0,
-            char_length: 0,
-        }
-    }
-
-    pub fn max() -> Self {
-        Match {
-            query_index: u32::max_value(),
-            distance: u8::max_value(),
-            attribute: u16::max_value(),
-            word_index: u16::max_value(),
-            is_exact: true,
-            char_index: u16::max_value(),
-            char_length: u16::max_value(),
-        }
-    }
+#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
+struct TmpMatch {
+    pub query_index: u32,
+    pub distance: u8,
+    pub attribute: u16,
+    pub word_index: u16,
+    pub is_exact: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Document {
    pub id: DocumentId,
-    pub matches: Vec<Match>,
+    pub highlights: Vec<Highlight>,
 }

 impl Document {
-    fn from_raw(raw: &RawDocument) -> Document {
-        let len = raw.matches.range.len();
-        let mut matches = Vec::with_capacity(len);
-
-        let query_index = raw.query_index();
-        let distance = raw.distance();
-        let attribute = raw.attribute();
-        let word_index = raw.word_index();
-        let is_exact = raw.is_exact();
-        let char_index = raw.char_index();
-        let char_length = raw.char_length();
-
-        for i in 0..len {
-            let match_ = Match {
-                query_index: query_index[i],
-                distance: distance[i],
-                attribute: attribute[i],
-                word_index: word_index[i],
-                is_exact: is_exact[i],
-                char_index: char_index[i],
-                char_length: char_length[i],
-            };
-            matches.push(match_);
-        }
-
-        Document { id: raw.id, matches }
+    fn from_raw(raw: RawDocument) -> Document {
+        Document { id: raw.id, highlights: raw.highlights }
    }
 }

@ -158,11 +104,12 @@ impl Document {
 pub struct RawDocument {
    pub id: DocumentId,
    pub matches: SharedMatches,
+    pub highlights: Vec<Highlight>,
 }

 impl RawDocument {
-    fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
-        RawDocument { id, matches: SharedMatches { range, matches } }
+    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
+        RawDocument { id, matches, highlights }
    }

    pub fn query_index(&self) -> &[u32] {
@ -199,20 +146,6 @@ impl RawDocument {
        // can only be done in this module
        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
    }
-
-    pub fn char_index(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn char_length(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
-    }
 }

 impl fmt::Debug for RawDocument {
@ -224,27 +157,30 @@ impl fmt::Debug for RawDocument {
            .field("attribute", &self.attribute())
            .field("word_index", &self.word_index())
            .field("is_exact", &self.is_exact())
-            .field("char_index", &self.char_index())
-            .field("char_length", &self.char_length())
            .finish()
    }
 }

-pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, Match)>) -> Vec<RawDocument> {
-    let mut docs_ranges = Vec::<(_, Range)>::new();
+fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec<RawDocument> {
+    let mut docs_ranges = Vec::<(DocumentId, Range, Vec<Highlight>)>::new();
    let mut matches2 = Matches::with_capacity(matches.len());

-    for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
-        let id = group[0].0;
-        let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
+    for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) {
+        let document_id = group[0].0;
+        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
        let end = start + group.len();
-        docs_ranges.push((id, Range { start, end }));
+
+        let highlights = group.iter().map(|(_, _, h)| *h).collect();
+        docs_ranges.push((document_id, Range { start, end }, highlights));

        matches2.extend_from_slice(group);
    }

    let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
+    docs_ranges.into_iter().map(|(i, range, highlights)| {
+        let matches = SharedMatches { range, matches: matches.clone() };
+        RawDocument::new(i, matches, highlights)
+    }).collect()
 }

 #[derive(Debug, Copy, Clone)]
@ -253,12 +189,6 @@ struct Range {
    end: usize,
 }

-impl Range {
-    fn len(self) -> usize {
-        self.end - self.start
-    }
-}
-
 #[derive(Clone)]
 pub struct SharedMatches {
    range: Range,
@ -272,8 +202,6 @@ struct Matches {
    attribute: Vec<u16>,
    word_index: Vec<u16>,
    is_exact: Vec<bool>,
-    char_index: Vec<u16>,
-    char_length: Vec<u16>,
 }

 impl Matches {
@ -284,25 +212,20 @@ impl Matches {
            attribute: Vec::with_capacity(cap),
            word_index: Vec::with_capacity(cap),
            is_exact: Vec::with_capacity(cap),
-            char_index: Vec::with_capacity(cap),
-            char_length: Vec::with_capacity(cap),
        }
    }

-    fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
-        for (_, match_) in matches {
+    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) {
+        for (_, match_, _) in matches {
            self.query_index.push(match_.query_index);
            self.distance.push(match_.distance);
            self.attribute.push(match_.attribute);
            self.word_index.push(match_.word_index);
            self.is_exact.push(match_.is_exact);
-            self.char_index.push(match_.char_index);
-            self.char_length.push(match_.char_length);
        }
    }
 }

-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@ -5,7 +5,7 @@ use std::time::Instant;
 use std::{cmp, mem};

 use fst::{Streamer, IntoStreamer};
-use hashbrown::{HashMap, HashSet};
+use hashbrown::HashMap;
 use log::info;
 use meilidb_tokenizer::{is_cjk, split_query_string};
 use rayon::slice::ParallelSliceMut;
@ -18,7 +18,7 @@ use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::criterion::Criteria;
 use crate::raw_documents_from_matches;
 use crate::reordered_attrs::ReorderedAttrs;
-use crate::{Match, DocumentId, Store, RawDocument, Document};
+use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document};

 const NGRAMS: usize = 3;

@ -178,12 +178,12 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
    Ok(automatons)
 }

-fn rewrite_matched_positions(matches: &mut [(DocumentId, Match)]) {
-    for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) {
+fn rewrite_matched_positions(matches: &mut [(DocumentId, TmpMatch, Highlight)]) {
+    for document_matches in matches.linear_group_by_mut(|(a, _, _), (b, _, _)| a == b) {
        let mut offset = 0;
-        for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) {
+        for query_indexes in document_matches.linear_group_by_mut(|(_, a, _), (_, b, _)| a.query_index == b.query_index) {
            let word_index = query_indexes[0].1.word_index - offset as u16;
-            for (_, match_) in query_indexes.iter_mut() {
+            for (_, match_, _) in query_indexes.iter_mut() {
                match_.word_index = word_index;
            }
            offset += query_indexes.len() - 1;
@ -268,17 +268,19 @@ where S: Store,
                for di in doc_indexes.as_slice() {
                    let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
                    if let Some(attribute) = attribute {
-                        let match_ = Match {
+                        let match_ = TmpMatch {
                            query_index: query_index as u32,
                            distance,
                            attribute,
                            word_index: di.word_index,
                            is_exact,
+                        };
+                        let highlight = Highlight {
+                            attribute: di.attribute,
                            char_index: di.char_index,
                            char_length: di.char_length,
                        };
-                        matches.push((di.document_id, match_));
-
+                        matches.push((di.document_id, match_, highlight));
                    }
                }
            }
@ -289,7 +291,11 @@ where S: Store,
        rewrite_matched_positions(&mut matches);

        let total_matches = matches.len();
-        let padded_matches = SetBuf::from_dirty(matches);
+        let padded_matches = {
+            matches.par_sort_unstable();
+            matches.dedup();
+            SetBuf::new_unchecked(matches)
+        };
        let raw_documents = raw_documents_from_matches(padded_matches);

        info!("{} total documents to classify", raw_documents.len());
@ -349,7 +355,7 @@ where S: Store,

        let offset = cmp::min(documents.len(), range.start);
        let iter = documents.into_iter().skip(offset).take(range.len());
-        Ok(iter.map(|d| Document::from_raw(&d)).collect())
+        Ok(iter.map(|d| Document::from_raw(d)).collect())
    }
 }

@ -476,7 +482,7 @@ where S: Store,
                };

                if distinct_accepted && seen.len() > range.start {
-                    out_documents.push(Document::from_raw(&document));
+                    out_documents.push(Document::from_raw(document));
                    if out_documents.len() == range.len() { break }
                }
            }