Mirror of https://github.com/meilisearch/MeiliSearch

feat: Introduce the Highlight type to simplify the data oriented design

parent b7ed22bc59
commit 6b6db2f8e6
@@ -60,97 +60,43 @@ pub struct DocIndex {
 ///
 /// The order of the field is important because it defines
 /// the way these structures are ordered between themselves.
-///
-/// The word in itself is not important.
-// TODO do data oriented programming ? very arrays ?
 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Match {
-    /// The word index in the query sentence.
-    /// Same as the `attribute_index` but for the query words.
-    ///
-    /// Used to retrieve the automaton that match this word.
-    pub query_index: u32,
-
-    /// The distance the word has with the query word
-    /// (i.e. the Levenshtein distance).
-    pub distance: u8,
-
+pub struct Highlight {
     /// The attribute in the document where the word was found
     /// along with the index in it.
     pub attribute: u16,
-    pub word_index: u16,
 
-    /// Whether the word that match is an exact match or a prefix.
-    pub is_exact: bool,
-
-    /// The position in bytes where the word was found
-    /// along with the length of it.
+    /// The position in bytes where the word was found.
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
     pub char_index: u16,
+
+    /// The length in bytes of the found word.
+    ///
+    /// It informs on the original word area in the text indexed
+    /// without needing to run the tokenizer again.
     pub char_length: u16,
 }
 
-impl Match {
-    pub fn zero() -> Self {
-        Match {
-            query_index: 0,
-            distance: 0,
-            attribute: 0,
-            word_index: 0,
-            is_exact: false,
-            char_index: 0,
-            char_length: 0,
-        }
-    }
-
-    pub fn max() -> Self {
-        Match {
-            query_index: u32::max_value(),
-            distance: u8::max_value(),
-            attribute: u16::max_value(),
-            word_index: u16::max_value(),
-            is_exact: true,
-            char_index: u16::max_value(),
-            char_length: u16::max_value(),
-        }
-    }
+#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
+struct TmpMatch {
+    pub query_index: u32,
+    pub distance: u8,
+    pub attribute: u16,
+    pub word_index: u16,
+    pub is_exact: bool,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Document {
     pub id: DocumentId,
-    pub matches: Vec<Match>,
+    pub highlights: Vec<Highlight>,
 }
 
 impl Document {
-    fn from_raw(raw: &RawDocument) -> Document {
-        let len = raw.matches.range.len();
-        let mut matches = Vec::with_capacity(len);
-
-        let query_index = raw.query_index();
-        let distance = raw.distance();
-        let attribute = raw.attribute();
-        let word_index = raw.word_index();
-        let is_exact = raw.is_exact();
-        let char_index = raw.char_index();
-        let char_length = raw.char_length();
-
-        for i in 0..len {
-            let match_ = Match {
-                query_index: query_index[i],
-                distance: distance[i],
-                attribute: attribute[i],
-                word_index: word_index[i],
-                is_exact: is_exact[i],
-                char_index: char_index[i],
-                char_length: char_length[i],
-            };
-            matches.push(match_);
-        }
-
-        Document { id: raw.id, matches }
+    fn from_raw(raw: RawDocument) -> Document {
+        Document { id: raw.id, highlights: raw.highlights }
     }
 }
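This first hunk is the heart of the commit: the all-purpose Match struct is split into TmpMatch, which holds only the fields the ranking criteria compare, and Highlight, which holds only what a client needs to highlight the original text. Document then carries the highlights directly instead of rebuilding full matches from seven parallel slices. Below is a minimal sketch of why the split pays off; the struct fields mirror the diff, everything else (values, the sort call) is illustrative. Deriving Ord on TmpMatch gives exactly the field-order comparison the doc comment describes, and the Highlight payload just rides along.

#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Highlight {
    pub attribute: u16,
    pub char_index: u16,
    pub char_length: u16,
}

// Field order defines comparison priority for the derived Ord.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
struct TmpMatch {
    query_index: u32,
    distance: u8,
    attribute: u16,
    word_index: u16,
    is_exact: bool,
}

fn main() {
    let mut matches = vec![
        (TmpMatch { query_index: 1, distance: 1, attribute: 0, word_index: 2, is_exact: false },
         Highlight { attribute: 0, char_index: 14, char_length: 5 }),
        (TmpMatch { query_index: 0, distance: 0, attribute: 0, word_index: 0, is_exact: true },
         Highlight { attribute: 0, char_index: 0, char_length: 7 }),
    ];

    // Sorting on the ranking-only half drags the display payload along.
    matches.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
    assert_eq!(matches[0].1.char_index, 0);
}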
@@ -158,11 +104,12 @@ impl Document {
 pub struct RawDocument {
     pub id: DocumentId,
     pub matches: SharedMatches,
+    pub highlights: Vec<Highlight>,
 }
 
 impl RawDocument {
-    fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
-        RawDocument { id, matches: SharedMatches { range, matches } }
+    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
+        RawDocument { id, matches, highlights }
     }
 
     pub fn query_index(&self) -> &[u32] {
@@ -199,20 +146,6 @@ impl RawDocument {
         // can only be done in this module
         unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
     }
-
-    pub fn char_index(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn char_length(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
-    }
 }
 
 impl fmt::Debug for RawDocument {
@@ -224,27 +157,30 @@ impl fmt::Debug for RawDocument {
         .field("attribute", &self.attribute())
         .field("word_index", &self.word_index())
         .field("is_exact", &self.is_exact())
-        .field("char_index", &self.char_index())
-        .field("char_length", &self.char_length())
         .finish()
     }
 }
 
-pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, Match)>) -> Vec<RawDocument> {
-    let mut docs_ranges = Vec::<(_, Range)>::new();
+fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec<RawDocument> {
+    let mut docs_ranges = Vec::<(DocumentId, Range, Vec<Highlight>)>::new();
     let mut matches2 = Matches::with_capacity(matches.len());
 
-    for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
-        let id = group[0].0;
-        let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
+    for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) {
+        let document_id = group[0].0;
+        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
         let end = start + group.len();
-        docs_ranges.push((id, Range { start, end }));
+
+        let highlights = group.iter().map(|(_, _, h)| *h).collect();
+        docs_ranges.push((document_id, Range { start, end }, highlights));
 
         matches2.extend_from_slice(group);
    }
 
     let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
+    docs_ranges.into_iter().map(|(i, range, highlights)| {
+        let matches = SharedMatches { range, matches: matches.clone() };
+        RawDocument::new(i, matches, highlights)
+    }).collect()
 }
 
 #[derive(Debug, Copy, Clone)]
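raw_documents_from_matches turns a flat, document-sorted list of (DocumentId, TmpMatch, Highlight) triples into one RawDocument per document: every document receives a Range into a single shared Matches buffer, plus its own Vec<Highlight> extracted from its group. A hand-rolled sketch of just the grouping step follows, with simplified types; the diff itself uses linear_group_by from the slice-group-by crate rather than this explicit loop.

#[derive(Debug, Copy, Clone)]
struct Range {
    start: usize,
    end: usize,
}

type DocumentId = u64;

// Split a document-sorted slice into per-document ranges over one flat buffer.
fn ranges_per_document(matches: &[(DocumentId, u16)]) -> Vec<(DocumentId, Range)> {
    let mut ranges = Vec::new();
    let mut start = 0;
    while start < matches.len() {
        let id = matches[start].0;
        let mut end = start;
        while end < matches.len() && matches[end].0 == id {
            end += 1;
        }
        ranges.push((id, Range { start, end }));
        start = end;
    }
    ranges
}

fn main() {
    let matches = vec![(1, 10), (1, 11), (2, 7), (3, 1), (3, 2)];
    for (id, r) in ranges_per_document(&matches) {
        println!("document {} owns matches[{}..{}]", id, r.start, r.end);
    }
}

Note that the removed Range::len helper was only needed by the old Document::from_raw; the ranges themselves still tie each RawDocument to its slice of the shared buffer.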
@@ -253,12 +189,6 @@ struct Range {
     end: usize,
 }
 
-impl Range {
-    fn len(self) -> usize {
-        self.end - self.start
-    }
-}
-
 #[derive(Clone)]
 pub struct SharedMatches {
     range: Range,
@@ -272,8 +202,6 @@ struct Matches {
     attribute: Vec<u16>,
     word_index: Vec<u16>,
     is_exact: Vec<bool>,
-    char_index: Vec<u16>,
-    char_length: Vec<u16>,
 }
 
 impl Matches {
@@ -284,25 +212,20 @@ impl Matches {
             attribute: Vec::with_capacity(cap),
             word_index: Vec::with_capacity(cap),
             is_exact: Vec::with_capacity(cap),
-            char_index: Vec::with_capacity(cap),
-            char_length: Vec::with_capacity(cap),
         }
     }
 
-    fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
-        for (_, match_) in matches {
+    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) {
+        for (_, match_, _) in matches {
             self.query_index.push(match_.query_index);
             self.distance.push(match_.distance);
             self.attribute.push(match_.attribute);
             self.word_index.push(match_.word_index);
             self.is_exact.push(match_.is_exact);
-            self.char_index.push(match_.char_index);
-            self.char_length.push(match_.char_length);
         }
     }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
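Matches is the struct-of-arrays behind the "data oriented design" in the commit title: one Vec per field, all pushed in lockstep, so index i across the columns describes one match and a ranking criterion can scan one dense column. The hunk above simply drops the two columns that moved into Highlight. A reduced sketch of the pattern, assuming three columns instead of five:

// Struct of arrays: each field of the logical match record is its own column.
struct Matches {
    query_index: Vec<u32>,
    distance: Vec<u8>,
    is_exact: Vec<bool>,
}

impl Matches {
    fn with_capacity(cap: usize) -> Matches {
        Matches {
            query_index: Vec::with_capacity(cap),
            distance: Vec::with_capacity(cap),
            is_exact: Vec::with_capacity(cap),
        }
    }

    // Push to every column so index i always describes one match.
    fn push(&mut self, query_index: u32, distance: u8, is_exact: bool) {
        self.query_index.push(query_index);
        self.distance.push(distance);
        self.is_exact.push(is_exact);
    }
}

fn main() {
    let mut matches = Matches::with_capacity(2);
    matches.push(0, 1, false);
    matches.push(1, 0, true);

    // A criterion that only cares about distances touches one contiguous
    // allocation instead of striding over whole structs.
    let sum: u32 = matches.distance.iter().map(|&d| d as u32).sum();
    assert_eq!(sum, 1);
}

The remaining hunks are in the search side of the crate, where the (DocumentId, TmpMatch, Highlight) triples are produced and consumed.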
@@ -5,7 +5,7 @@ use std::time::Instant;
 use std::{cmp, mem};
 
 use fst::{Streamer, IntoStreamer};
-use hashbrown::{HashMap, HashSet};
+use hashbrown::HashMap;
 use log::info;
 use meilidb_tokenizer::{is_cjk, split_query_string};
 use rayon::slice::ParallelSliceMut;
@@ -18,7 +18,7 @@ use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::criterion::Criteria;
 use crate::raw_documents_from_matches;
 use crate::reordered_attrs::ReorderedAttrs;
-use crate::{Match, DocumentId, Store, RawDocument, Document};
+use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document};
 
 const NGRAMS: usize = 3;
@@ -178,12 +178,12 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
     Ok(automatons)
 }
 
-fn rewrite_matched_positions(matches: &mut [(DocumentId, Match)]) {
-    for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) {
+fn rewrite_matched_positions(matches: &mut [(DocumentId, TmpMatch, Highlight)]) {
+    for document_matches in matches.linear_group_by_mut(|(a, _, _), (b, _, _)| a == b) {
         let mut offset = 0;
-        for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) {
+        for query_indexes in document_matches.linear_group_by_mut(|(_, a, _), (_, b, _)| a.query_index == b.query_index) {
             let word_index = query_indexes[0].1.word_index - offset as u16;
-            for (_, match_) in query_indexes.iter_mut() {
+            for (_, match_, _) in query_indexes.iter_mut() {
                 match_.word_index = word_index;
             }
             offset += query_indexes.len() - 1;
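Only the tuple shape changes in this hunk, but the function deserves a gloss: within one document, matches that share a query_index (as I read it, produced by query padding upstream) are collapsed onto a single word_index, and every later group is shifted back by the amount collapsed so far. A standalone reading of it with simplified tuples; std's chunk_by_mut (stable since Rust 1.77) stands in for the slice-group-by crate used in the diff.

// (document_id, query_index, word_index) — a flattened stand-in for the
// (DocumentId, TmpMatch, Highlight) triples in the real code.
type Entry = (u64, u32, u16);

fn rewrite_matched_positions(matches: &mut [Entry]) {
    for document_matches in matches.chunk_by_mut(|a, b| a.0 == b.0) {
        let mut offset = 0;
        for group in document_matches.chunk_by_mut(|a, b| a.1 == b.1) {
            // Collapse the whole group onto one position, compensating for
            // the positions already collapsed in this document.
            let word_index = group[0].2 - offset as u16;
            for entry in group.iter_mut() {
                entry.2 = word_index;
            }
            offset += group.len() - 1;
        }
    }
}

fn main() {
    let mut matches = vec![(1, 0, 0), (1, 0, 1), (1, 1, 2)];
    rewrite_matched_positions(&mut matches);
    assert_eq!(matches, vec![(1, 0, 0), (1, 0, 0), (1, 1, 1)]);
}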
@@ -268,17 +268,19 @@ where S: Store,
             for di in doc_indexes.as_slice() {
                 let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
                 if let Some(attribute) = attribute {
-                    let match_ = Match {
+                    let match_ = TmpMatch {
                         query_index: query_index as u32,
                         distance,
                         attribute,
                         word_index: di.word_index,
                         is_exact,
+                    };
+
+                    let highlight = Highlight {
+                        attribute: di.attribute,
                         char_index: di.char_index,
                         char_length: di.char_length,
                     };
-                    matches.push((di.document_id, match_));
+
+                    matches.push((di.document_id, match_, highlight));
                 }
             }
         }
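Each index hit now fans out into two values pushed as one triple: the ranking tuple and the display tuple. One subtlety worth noting in the hunk above: TmpMatch stores the attribute after it has gone through the searchable-attributes reordering (the shadowed attribute from the if let), while Highlight keeps the document's original di.attribute, since highlighting must point back at the real field. A hedged sketch of that fan-out with stand-in types:

// Stand-in for the index's DocIndex, with only the fields the loop reads.
struct DocIndex {
    document_id: u64,
    attribute: u16,
    word_index: u16,
    char_index: u16,
    char_length: u16,
}

#[derive(Debug)]
struct TmpMatch {
    query_index: u32,
    distance: u8,
    attribute: u16,
    word_index: u16,
    is_exact: bool,
}

#[derive(Debug)]
struct Highlight {
    attribute: u16,
    char_index: u16,
    char_length: u16,
}

fn fan_out(
    di: &DocIndex,
    query_index: u32,
    distance: u8,
    is_exact: bool,
    ranking_attribute: u16, // possibly reordered for the criteria
) -> (u64, TmpMatch, Highlight) {
    let match_ = TmpMatch {
        query_index,
        distance,
        attribute: ranking_attribute,
        word_index: di.word_index,
        is_exact,
    };
    // The highlight keeps the original attribute so clients can find the text.
    let highlight = Highlight {
        attribute: di.attribute,
        char_index: di.char_index,
        char_length: di.char_length,
    };
    (di.document_id, match_, highlight)
}

fn main() {
    let di = DocIndex { document_id: 42, attribute: 1, word_index: 3, char_index: 27, char_length: 5 };
    let (id, match_, highlight) = fan_out(&di, 0, 0, true, 0);
    println!("{} {:?} {:?}", id, match_, highlight);
}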
@@ -289,7 +291,11 @@ where S: Store,
     rewrite_matched_positions(&mut matches);
 
     let total_matches = matches.len();
-    let padded_matches = SetBuf::from_dirty(matches);
+    let padded_matches = {
+        matches.par_sort_unstable();
+        matches.dedup();
+        SetBuf::new_unchecked(matches)
+    };
     let raw_documents = raw_documents_from_matches(padded_matches);
 
     info!("{} total documents to classify", raw_documents.len());
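SetBuf::from_dirty is replaced by an explicit parallel sort and dedup before SetBuf::new_unchecked, which trusts the caller that the sdset invariant (sorted, no duplicates) holds. A small demonstration of that invariant under the same rayon call; is_set is an illustrative stand-in for the precondition that new_unchecked does not verify.

use rayon::slice::ParallelSliceMut;

// The invariant a sdset set relies on: strictly increasing elements.
fn is_set<T: Ord>(slice: &[T]) -> bool {
    slice.windows(2).all(|w| w[0] < w[1])
}

fn main() {
    let mut matches = vec![(3u64, 2u16), (1, 7), (3, 2), (1, 1)];
    matches.par_sort_unstable(); // the parallel sort used in the diff
    matches.dedup();             // duplicates are adjacent once sorted
    assert!(is_set(&matches));
    println!("{:?}", matches);   // [(1, 1), (1, 7), (3, 2)]
}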
@@ -349,7 +355,7 @@ where S: Store,
 
     let offset = cmp::min(documents.len(), range.start);
     let iter = documents.into_iter().skip(offset).take(range.len());
-    Ok(iter.map(|d| Document::from_raw(&d)).collect())
+    Ok(iter.map(|d| Document::from_raw(d)).collect())
 }
 }
@@ -476,7 +482,7 @@ where S: Store,
         };
 
         if distinct_accepted && seen.len() > range.start {
-            out_documents.push(Document::from_raw(&document));
+            out_documents.push(Document::from_raw(document));
             if out_documents.len() == range.len() { break }
         }
     }