mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 20:07:09 +02:00
chore: Move index related things to the meilidb-core workspace member
This commit is contained in:
parent
3056b351fa
commit
14790eeae3
44 changed files with 1343 additions and 252 deletions
21
meilidb-core/Cargo.toml
Normal file
21
meilidb-core/Cargo.toml
Normal file
|
@ -0,0 +1,21 @@
|
|||
[package]
|
||||
name = "meilidb-core"
|
||||
version = "0.1.0"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.3.1"
|
||||
fst = "0.3.3"
|
||||
hashbrown = "0.1.8"
|
||||
lazy_static = "1.2.0"
|
||||
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
||||
log = "0.4.6"
|
||||
rayon = "1.0.3"
|
||||
sdset = "0.3.1"
|
||||
serde = "1.0.88"
|
||||
serde_derive = "1.0.88"
|
||||
slice-group-by = "0.2.4"
|
||||
|
||||
[features]
|
||||
i128 = ["byteorder/i128"]
|
91
meilidb-core/src/automaton.rs
Normal file
91
meilidb-core/src/automaton.rs
Normal file
|
@ -0,0 +1,91 @@
|
|||
use fst::Automaton;
|
||||
use lazy_static::lazy_static;
|
||||
use levenshtein_automata::{
|
||||
LevenshteinAutomatonBuilder as LevBuilder,
|
||||
DFA, Distance,
|
||||
};
|
||||
|
||||
lazy_static! {
    // Shared Levenshtein automaton builders, one per supported edit distance.
    // They are created lazily on first use and reused for every query.
    // NOTE(review): the `false` argument is the builder's transposition flag;
    // see the levenshtein_automata documentation for its exact meaning.

    /// Builder used for exact matching (distance 0).
    static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);

    /// Builder used for matching with a distance of at most 1.
    static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);

    /// Builder used for matching with a distance of at most 2.
    static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
|
||||
|
||||
pub struct DfaExt {
|
||||
query_len: usize,
|
||||
automaton: DFA,
|
||||
}
|
||||
|
||||
impl Automaton for DfaExt {
|
||||
type State = <DFA as Automaton>::State;
|
||||
|
||||
fn start(&self) -> Self::State {
|
||||
self.automaton.start()
|
||||
}
|
||||
|
||||
fn is_match(&self, state: &Self::State) -> bool {
|
||||
self.automaton.is_match(state)
|
||||
}
|
||||
|
||||
fn can_match(&self, state: &Self::State) -> bool {
|
||||
self.automaton.can_match(state)
|
||||
}
|
||||
|
||||
fn will_always_match(&self, state: &Self::State) -> bool {
|
||||
self.automaton.will_always_match(state)
|
||||
}
|
||||
|
||||
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
|
||||
self.automaton.accept(state, byte)
|
||||
}
|
||||
}
|
||||
|
||||
impl AutomatonExt for DfaExt {
|
||||
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
|
||||
self.automaton.eval(s)
|
||||
}
|
||||
|
||||
fn query_len(&self) -> usize {
|
||||
self.query_len
|
||||
}
|
||||
}
|
||||
|
||||
/// Selects which kind of automaton `build_dfa_with_setting` creates.
#[derive(Copy, Clone)]
enum PrefixSetting {
    /// Build the automaton with `build_prefix_dfa`.
    Prefix,
    /// Build the automaton with `build_dfa`.
    NoPrefix,
}
|
||||
|
||||
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt {
|
||||
use self::PrefixSetting::{Prefix, NoPrefix};
|
||||
|
||||
let dfa = match query.len() {
|
||||
0 ..= 4 => match setting {
|
||||
Prefix => LEVDIST0.build_prefix_dfa(query),
|
||||
NoPrefix => LEVDIST0.build_dfa(query),
|
||||
},
|
||||
5 ..= 8 => match setting {
|
||||
Prefix => LEVDIST1.build_prefix_dfa(query),
|
||||
NoPrefix => LEVDIST1.build_dfa(query),
|
||||
},
|
||||
_ => match setting {
|
||||
Prefix => LEVDIST2.build_prefix_dfa(query),
|
||||
NoPrefix => LEVDIST2.build_dfa(query),
|
||||
},
|
||||
};
|
||||
|
||||
DfaExt { query_len: query.len(), automaton: dfa }
|
||||
}
|
||||
|
||||
pub fn build_prefix_dfa(query: &str) -> DfaExt {
|
||||
build_dfa_with_setting(query, PrefixSetting::Prefix)
|
||||
}
|
||||
|
||||
pub fn build_dfa(query: &str) -> DfaExt {
|
||||
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
|
||||
}
|
||||
|
||||
pub trait AutomatonExt: Automaton {
|
||||
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
|
||||
fn query_len(&self) -> usize;
|
||||
}
|
12
meilidb-core/src/criterion/document_id.rs
Normal file
12
meilidb-core/src/criterion/document_id.rs
Normal file
|
@ -0,0 +1,12 @@
|
|||
use std::cmp::Ordering;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct DocumentId;
|
||||
|
||||
impl Criterion for DocumentId {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
lhs.id.cmp(&rhs.id)
|
||||
}
|
||||
}
|
39
meilidb-core/src/criterion/exact.rs
Normal file
39
meilidb-core/src/criterion/exact.rs
Normal file
|
@ -0,0 +1,39 @@
|
|||
use std::cmp::Ordering;
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
#[inline]
|
||||
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
|
||||
let mut count = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
let len = group.len();
|
||||
count += is_exact[index..index + len].contains(&true) as usize;
|
||||
index += len;
|
||||
}
|
||||
|
||||
count
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Exact;
|
||||
|
||||
impl Criterion for Exact {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let is_exact = lhs.is_exact();
|
||||
number_exact_matches(query_index, is_exact)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let is_exact = rhs.is_exact();
|
||||
number_exact_matches(query_index, is_exact)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
}
|
112
meilidb-core/src/criterion/mod.rs
Normal file
112
meilidb-core/src/criterion/mod.rs
Normal file
|
@ -0,0 +1,112 @@
|
|||
mod sum_of_typos;
|
||||
mod number_of_words;
|
||||
mod words_proximity;
|
||||
mod sum_of_words_attribute;
|
||||
mod sum_of_words_position;
|
||||
mod exact;
|
||||
// mod sort_by_attr;
|
||||
mod document_id;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use crate::RawDocument;
|
||||
|
||||
pub use self::{
|
||||
sum_of_typos::SumOfTypos,
|
||||
number_of_words::NumberOfWords,
|
||||
words_proximity::WordsProximity,
|
||||
sum_of_words_attribute::SumOfWordsAttribute,
|
||||
sum_of_words_position::SumOfWordsPosition,
|
||||
exact::Exact,
|
||||
// sort_by_attr::SortByAttr,
|
||||
document_id::DocumentId,
|
||||
};
|
||||
|
||||
pub trait Criterion: Send + Sync {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
|
||||
|
||||
#[inline]
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
self.evaluate(lhs, rhs) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
(**self).eq(lhs, rhs)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Criterion + ?Sized> Criterion for Box<T> {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
(**self).eq(lhs, rhs)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct CriteriaBuilder<'a> {
|
||||
inner: Vec<Box<dyn Criterion + 'a>>
|
||||
}
|
||||
|
||||
impl<'a> CriteriaBuilder<'a>
|
||||
{
|
||||
pub fn new() -> CriteriaBuilder<'a> {
|
||||
CriteriaBuilder { inner: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
|
||||
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
|
||||
}
|
||||
|
||||
pub fn reserve(&mut self, additional: usize) {
|
||||
self.inner.reserve(additional)
|
||||
}
|
||||
|
||||
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
|
||||
where C: Criterion,
|
||||
{
|
||||
self.push(criterion);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push<C: 'a>(&mut self, criterion: C)
|
||||
where C: Criterion,
|
||||
{
|
||||
self.inner.push(Box::new(criterion));
|
||||
}
|
||||
|
||||
pub fn build(self) -> Criteria<'a> {
|
||||
Criteria { inner: self.inner }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Criteria<'a> {
|
||||
inner: Vec<Box<dyn Criterion + 'a>>,
|
||||
}
|
||||
|
||||
impl<'a> Default for Criteria<'a> {
|
||||
fn default() -> Self {
|
||||
CriteriaBuilder::with_capacity(7)
|
||||
.add(SumOfTypos)
|
||||
.add(NumberOfWords)
|
||||
.add(WordsProximity)
|
||||
.add(SumOfWordsAttribute)
|
||||
.add(SumOfWordsPosition)
|
||||
.add(Exact)
|
||||
.add(DocumentId)
|
||||
.build()
|
||||
}
|
||||
}
|
||||
|
||||
/// Exposes the criteria as a slice, in the order they were added.
// The trait object is spelled `dyn Criterion` here, like in the method
// signature and in the struct field; the bare `Box<Criterion + 'a>` form
// names the same type but is deprecated.
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
    fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
        &self.inner
    }
}
|
27
meilidb-core/src/criterion/number_of_words.rs
Normal file
27
meilidb-core/src/criterion/number_of_words.rs
Normal file
|
@ -0,0 +1,27 @@
|
|||
use std::cmp::Ordering;
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
#[inline]
|
||||
fn number_of_query_words(query_index: &[u32]) -> usize {
|
||||
query_index.linear_group().count()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct NumberOfWords;
|
||||
|
||||
impl Criterion for NumberOfWords {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
number_of_query_words(query_index)
|
||||
};
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
number_of_query_words(query_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
}
|
122
meilidb-core/src/criterion/sort_by_attr.rs
Normal file
122
meilidb-core/src/criterion/sort_by_attr.rs
Normal file
|
@ -0,0 +1,122 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use crate::database::schema::{Schema, SchemaAttr};
|
||||
use crate::criterion::Criterion;
|
||||
use crate::database::RankedMap;
|
||||
use crate::RawDocument;
|
||||
|
||||
/// An helper struct that permit to sort documents by
|
||||
/// some of their stored attributes.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// If a document cannot be deserialized it will be considered [`None`][].
|
||||
///
|
||||
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
|
||||
/// so you must check the [`Ord`] of `Option` implementation.
|
||||
///
|
||||
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
|
||||
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```ignore
|
||||
/// use serde_derive::Deserialize;
|
||||
/// use meilidb::rank::criterion::*;
|
||||
///
|
||||
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
|
||||
///
|
||||
/// let builder = CriteriaBuilder::with_capacity(8)
|
||||
/// .add(SumOfTypos)
|
||||
/// .add(NumberOfWords)
|
||||
/// .add(WordsProximity)
|
||||
/// .add(SumOfWordsAttribute)
|
||||
/// .add(SumOfWordsPosition)
|
||||
/// .add(Exact)
|
||||
/// .add(custom_ranking)
|
||||
/// .add(DocumentId);
|
||||
///
|
||||
/// let criterion = builder.build();
|
||||
///
|
||||
/// ```
|
||||
pub struct SortByAttr<'a> {
|
||||
ranked_map: &'a RankedMap,
|
||||
attr: SchemaAttr,
|
||||
reversed: bool,
|
||||
}
|
||||
|
||||
impl<'a> SortByAttr<'a> {
|
||||
pub fn lower_is_better(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
SortByAttr::new(ranked_map, schema, attr_name, false)
|
||||
}
|
||||
|
||||
pub fn higher_is_better(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
SortByAttr::new(ranked_map, schema, attr_name, true)
|
||||
}
|
||||
|
||||
fn new(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
reversed: bool,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
let attr = match schema.attribute(attr_name) {
|
||||
Some(attr) => attr,
|
||||
None => return Err(SortByAttrError::AttributeNotFound),
|
||||
};
|
||||
|
||||
if !schema.props(attr).is_ranked() {
|
||||
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
|
||||
}
|
||||
|
||||
Ok(SortByAttr { ranked_map, attr, reversed })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Criterion for SortByAttr<'a> {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = self.ranked_map.get(&(lhs.id, self.attr));
|
||||
let rhs = self.ranked_map.get(&(rhs.id, self.attr));
|
||||
|
||||
match (lhs, rhs) {
|
||||
(Some(lhs), Some(rhs)) => {
|
||||
let order = lhs.cmp(&rhs);
|
||||
if self.reversed { order.reverse() } else { order }
|
||||
},
|
||||
(None, Some(_)) => Ordering::Greater,
|
||||
(Some(_), None) => Ordering::Less,
|
||||
(None, None) => Ordering::Equal,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reasons why a `SortByAttr` criterion cannot be created.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
    /// The schema has no attribute with the requested name.
    AttributeNotFound,
    /// The attribute exists but is not marked as ranked in the schema.
    AttributeNotRegisteredForRanking,
}

impl fmt::Display for SortByAttrError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = match *self {
            SortByAttrError::AttributeNotFound => "attribute not found in the schema",
            SortByAttrError::AttributeNotRegisteredForRanking => {
                "attribute not registered for ranking"
            },
        };

        f.write_str(message)
    }
}

impl Error for SortByAttrError {}
|
112
meilidb-core/src/criterion/sum_of_typos.rs
Normal file
112
meilidb-core/src/criterion/sum_of_typos.rs
Normal file
|
@ -0,0 +1,112 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
// This is not a real base-10 logarithm: it maps a number of typos `n`
// to a precomputed approximation of `log10(n + 1)`.
// Panicking on inputs greater than 3 is acceptable because
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
    // index `n` holds the approximation of log(n + 1)
    const APPROX: [f32; 4] = [0.0, 0.30102, 0.47712, 0.60205];

    match APPROX.get(n as usize) {
        Some(&value) => value,
        None => panic!("invalid number"),
    }
}
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
|
||||
let mut number_words = 0;
|
||||
let mut sum_typos = 0.0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_typos += custom_log10(distance[index]);
|
||||
number_words += 1;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfTypos;
|
||||
|
||||
impl Criterion for SumOfTypos {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let distance = lhs.distance();
|
||||
sum_matches_typos(query_index, distance)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let distance = rhs.distance();
|
||||
sum_matches_typos(query_index, distance)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Orders two documents, given as raw `(query_index, distance)` slices,
    /// the same way `SumOfTypos::evaluate` does.
    fn rank(
        lhs_query_index: &[u32],
        lhs_distance: &[u8],
        rhs_query_index: &[u32],
        rhs_distance: &[u8],
    ) -> Ordering {
        let lhs = sum_matches_typos(lhs_query_index, lhs_distance);
        let rhs = sum_matches_typos(rhs_query_index, rhs_distance);
        lhs.cmp(&rhs).reverse()
    }

    // typing: "Geox CEO"
    //
    // doc0: "Geox SpA: CEO and Executive"
    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
    #[test]
    fn one_typo_reference() {
        let order = rank(&[0, 1], &[0, 0], &[0, 1], &[1, 0]);
        assert_eq!(order, Ordering::Less);
    }

    // typing: "bouton manchette"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn no_typo() {
        let order = rank(&[0, 1], &[0, 0], &[0], &[0]);
        assert_eq!(order, Ordering::Less);
    }

    // typing: "bouton manchztte"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn one_typo() {
        let order = rank(&[0, 1], &[0, 1], &[0], &[0]);
        assert_eq!(order, Ordering::Less);
    }
}
|
38
meilidb-core/src/criterion/sum_of_words_attribute.rs
Normal file
38
meilidb-core/src/criterion/sum_of_words_attribute.rs
Normal file
|
@ -0,0 +1,38 @@
|
|||
use std::cmp::Ordering;
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
||||
let mut sum_attributes = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_attributes += attribute[index] as usize;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
sum_attributes
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfWordsAttribute;
|
||||
|
||||
impl Criterion for SumOfWordsAttribute {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let attribute = lhs.attribute();
|
||||
sum_matches_attributes(query_index, attribute)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let attribute = rhs.attribute();
|
||||
sum_matches_attributes(query_index, attribute)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
}
|
38
meilidb-core/src/criterion/sum_of_words_position.rs
Normal file
38
meilidb-core/src/criterion/sum_of_words_position.rs
Normal file
|
@ -0,0 +1,38 @@
|
|||
use std::cmp::Ordering;
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
||||
let mut sum_word_index = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_word_index += word_index[index] as usize;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
sum_word_index
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfWordsPosition;
|
||||
|
||||
impl Criterion for SumOfWordsPosition {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let word_index = lhs.word_index();
|
||||
sum_matches_attribute_index(query_index, word_index)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let word_index = rhs.word_index();
|
||||
sum_matches_attribute_index(query_index, word_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
}
|
151
meilidb-core/src/criterion/words_proximity.rs
Normal file
151
meilidb-core/src/criterion/words_proximity.rs
Normal file
|
@ -0,0 +1,151 @@
|
|||
use std::cmp::{self, Ordering};
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
/// Upper bound applied to the distance between two word positions, and the
/// proximity given to two words found in different attributes.
const MAX_DISTANCE: u16 = 8;
|
||||
|
||||
/// Turns a pair of references into a pair of owned clones.
#[inline]
fn clone_tuple<T: Clone, U: Clone>(pair: (&T, &U)) -> (T, U) {
    let (first, second) = pair;
    (T::clone(first), U::clone(second))
}
|
||||
|
||||
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
|
||||
if lhs < rhs {
|
||||
cmp::min(rhs - lhs, MAX_DISTANCE)
|
||||
} else {
|
||||
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
|
||||
}
|
||||
}
|
||||
|
||||
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
|
||||
if lattr != rattr { return MAX_DISTANCE }
|
||||
index_proximity(lwi, rwi)
|
||||
}
|
||||
|
||||
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
|
||||
let mut min_prox = u16::max_value();
|
||||
|
||||
for a in lattr.iter().zip(lwi) {
|
||||
for b in rattr.iter().zip(rwi) {
|
||||
let a = clone_tuple(a);
|
||||
let b = clone_tuple(b);
|
||||
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
|
||||
}
|
||||
}
|
||||
|
||||
min_prox
|
||||
}
|
||||
|
||||
fn matches_proximity(
|
||||
query_index: &[u32],
|
||||
distance: &[u8],
|
||||
attribute: &[u16],
|
||||
word_index: &[u16],
|
||||
) -> u16
|
||||
{
|
||||
let mut query_index_groups = query_index.linear_group();
|
||||
let mut proximity = 0;
|
||||
let mut index = 0;
|
||||
|
||||
let get_attr_wi = |index: usize, group_len: usize| {
|
||||
// retrieve the first distance group (with the lowest values)
|
||||
let len = distance[index..index + group_len].linear_group().next().unwrap().len();
|
||||
|
||||
let rattr = &attribute[index..index + len];
|
||||
let rwi = &word_index[index..index + len];
|
||||
|
||||
(rattr, rwi)
|
||||
};
|
||||
|
||||
let mut last = query_index_groups.next().map(|group| {
|
||||
let attr_wi = get_attr_wi(index, group.len());
|
||||
index += group.len();
|
||||
attr_wi
|
||||
});
|
||||
|
||||
// iter by windows of size 2
|
||||
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
|
||||
let attr_wi = get_attr_wi(index, rhs.len());
|
||||
proximity += min_proximity(lhs, attr_wi);
|
||||
last = Some(attr_wi);
|
||||
index += rhs.len();
|
||||
}
|
||||
|
||||
proximity
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct WordsProximity;
|
||||
|
||||
impl Criterion for WordsProximity {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let distance = lhs.distance();
|
||||
let attribute = lhs.attribute();
|
||||
let word_index = lhs.word_index();
|
||||
matches_proximity(query_index, distance, attribute, word_index)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let distance = rhs.distance();
|
||||
let attribute = rhs.attribute();
|
||||
let word_index = rhs.word_index();
|
||||
matches_proximity(query_index, distance, attribute, word_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn three_different_attributes() {
        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }
        let query_index = &[0, 1, 2, 2, 3];
        let distance = &[0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 2, 3];
        let word_index = &[0, 0, 1, 0, 1];

        //   soup -> of = 8
        // + of -> the  = 1
        // + the -> day = 8 (not 1)
        let proximity = matches_proximity(query_index, distance, attribute, word_index);
        assert_eq!(proximity, 17);
    }

    #[test]
    fn two_different_attributes() {
        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }
        let query_index = &[0, 0, 1, 2, 3, 3];
        let distance = &[0, 0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 1, 0, 1];
        let word_index = &[0, 0, 1, 2, 1, 3];

        //   soup -> of = 1
        // + of -> the  = 1
        // + the -> day = 1
        let proximity = matches_proximity(query_index, distance, attribute, word_index);
        assert_eq!(proximity, 3);
    }
}
|
61
meilidb-core/src/data/doc_ids.rs
Normal file
61
meilidb-core/src/data/doc_ids.rs
Normal file
|
@ -0,0 +1,61 @@
|
|||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
use std::error::Error;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
||||
use crate::write_to_bytes::WriteToBytes;
|
||||
use crate::data::SharedData;
|
||||
use crate::DocumentId;
|
||||
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
pub struct DocIds(SharedData);
|
||||
|
||||
impl DocIds {
|
||||
pub fn new(ids: &Set<DocumentId>) -> DocIds {
|
||||
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
|
||||
let data = SharedData::from_bytes(bytes.to_vec());
|
||||
DocIds(data)
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<Set<DocumentId>> for DocIds {
|
||||
fn as_ref(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.0;
|
||||
let ptr = slice.as_ptr() as *const DocumentId;
|
||||
let len = slice.len() / size_of::<DocumentId>();
|
||||
let slice = unsafe { from_raw_parts(ptr, len) };
|
||||
Set::new_unchecked(slice)
|
||||
}
|
||||
}
|
||||
|
||||
impl FromSharedDataCursor for DocIds {
|
||||
type Error = Box<Error>;
|
||||
|
||||
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIds, Self::Error> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let data = cursor.extract(len);
|
||||
|
||||
Ok(DocIds(data))
|
||||
}
|
||||
}
|
||||
|
||||
impl WriteToBytes for DocIds {
|
||||
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let len = self.0.len() as u64;
|
||||
bytes.write_u64::<LittleEndian>(len).unwrap();
|
||||
bytes.extend_from_slice(&self.0);
|
||||
}
|
||||
}
|
231
meilidb-core/src/data/doc_indexes.rs
Normal file
231
meilidb-core/src/data/doc_indexes.rs
Normal file
|
@ -0,0 +1,231 @@
|
|||
use std::io::{self, Write};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
use std::ops::Index;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
||||
use crate::write_to_bytes::WriteToBytes;
|
||||
use crate::data::SharedData;
|
||||
use crate::DocIndex;
|
||||
|
||||
use super::into_u8_slice;
|
||||
|
||||
/// A `start..end` span, in number of `DocIndex`es, into the indexes of a
/// `DocIndexes`.
///
/// `#[repr(C)]` fixes the field layout because values of this type are
/// read from and written to raw bytes.
#[derive(Debug)]
#[repr(C)]
struct Range {
    /// Position of the first element of the span.
    start: u64,
    /// Position one past the last element of the span.
    end: u64,
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct DocIndexes {
|
||||
ranges: SharedData,
|
||||
indexes: SharedData,
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
|
||||
self.ranges().get(index).map(|Range { start, end }| {
|
||||
let start = *start as usize;
|
||||
let end = *end as usize;
|
||||
let slice = &self.indexes()[start..end];
|
||||
Set::new_unchecked(slice)
|
||||
})
|
||||
}
|
||||
|
||||
fn ranges(&self) -> &[Range] {
|
||||
let slice = &self.ranges;
|
||||
let ptr = slice.as_ptr() as *const Range;
|
||||
let len = slice.len() / size_of::<Range>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
|
||||
fn indexes(&self) -> &[DocIndex] {
|
||||
let slice = &self.indexes;
|
||||
let ptr = slice.as_ptr() as *const DocIndex;
|
||||
let len = slice.len() / size_of::<DocIndex>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Index<usize> for DocIndexes {
|
||||
type Output = [DocIndex];
|
||||
|
||||
fn index(&self, index: usize) -> &Self::Output {
|
||||
match self.get(index) {
|
||||
Some(indexes) => indexes,
|
||||
None => panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromSharedDataCursor for DocIndexes {
|
||||
type Error = io::Error;
|
||||
|
||||
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIndexes, Self::Error> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let ranges = cursor.extract(len);
|
||||
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let indexes = cursor.extract(len);
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
}
|
||||
|
||||
impl WriteToBytes for DocIndexes {
|
||||
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let ranges_len = self.ranges.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
|
||||
bytes.extend_from_slice(&self.ranges);
|
||||
|
||||
let indexes_len = self.indexes.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
ranges: Vec<Range>,
|
||||
indexes: Vec<DocIndex>,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl DocIndexesBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write> DocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
|
||||
let len = indexes.len() as u64;
|
||||
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
|
||||
let range = Range { start, end: start + len };
|
||||
self.ranges.push(range);
|
||||
|
||||
self.indexes.extend_from_slice(indexes);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
let ranges = unsafe { into_u8_slice(&self.ranges) };
|
||||
let len = ranges.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
let indexes = unsafe { into_u8_slice(&self.indexes) };
|
||||
let len = indexes.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(indexes)?;
|
||||
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::error::Error;
    use crate::DocumentId;
    use super::*;

    /// Returns the three sample indexes shared by the tests below.
    fn sample_indexes() -> (DocIndex, DocIndex, DocIndex) {
        let a = DocIndex {
            document_id: DocumentId(0),
            attribute: 3,
            word_index: 11,
            char_index: 30,
            char_length: 4,
        };
        let b = DocIndex {
            document_id: DocumentId(1),
            attribute: 4,
            word_index: 21,
            char_index: 35,
            char_length: 6,
        };
        let c = DocIndex {
            document_id: DocumentId(2),
            attribute: 8,
            word_index: 2,
            char_index: 89,
            char_length: 6,
        };

        (a, b, c)
    }

    // What was built must be readable back, set by set.
    #[test]
    fn builder_serialize_deserialize() -> Result<(), Box<dyn Error>> {
        let (a, b, c) = sample_indexes();

        let mut builder = DocIndexesBuilder::memory();

        builder.insert(Set::new(&[a])?);
        builder.insert(Set::new(&[a, b, c])?);
        builder.insert(Set::new(&[a, c])?);

        let bytes = builder.into_inner()?;
        let docs = DocIndexes::from_bytes(bytes)?;

        assert_eq!(docs.get(0), Some(Set::new(&[a])?));
        assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
        assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
        assert_eq!(docs.get(3), None);

        Ok(())
    }

    // Writing back what was read must give the builder's bytes again.
    #[test]
    fn serialize_deserialize() -> Result<(), Box<dyn Error>> {
        let (a, b, c) = sample_indexes();

        let mut builder = DocIndexesBuilder::memory();

        builder.insert(Set::new(&[a])?);
        builder.insert(Set::new(&[a, b, c])?);
        builder.insert(Set::new(&[a, c])?);

        let builder_bytes = builder.into_inner()?;
        let docs = DocIndexes::from_bytes(builder_bytes.clone())?;

        let mut bytes = Vec::new();
        docs.write_to_bytes(&mut bytes);

        assert_eq!(builder_bytes, bytes);

        Ok(())
    }
}
|
16
meilidb-core/src/data/mod.rs
Normal file
16
meilidb-core/src/data/mod.rs
Normal file
|
@ -0,0 +1,16 @@
|
|||
mod doc_ids;
|
||||
mod doc_indexes;
|
||||
mod shared_data;
|
||||
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
|
||||
pub use self::doc_ids::DocIds;
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
pub use self::shared_data::SharedData;
|
||||
|
||||
/// Reinterprets a slice of `T` as the bytes that back it.
///
/// The returned slice starts at the same address as `slice` and spans
/// `slice.len() * size_of::<T>()` bytes.
///
/// # Safety
///
/// Every byte of every element is exposed as a `u8`; the caller must make
/// sure that reading `T` this way is valid (for instance that no
/// uninitialized padding bytes end up being read).
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
    let byte_len = size_of::<T>() * slice.len();
    let first_byte = slice.as_ptr() as *const u8;
    from_raw_parts(first_byte, byte_len)
}
|
48
meilidb-core/src/data/shared_data.rs
Normal file
48
meilidb-core/src/data/shared_data.rs
Normal file
|
@ -0,0 +1,48 @@
|
|||
use std::sync::Arc;
|
||||
use std::ops::Deref;
|
||||
|
||||
/// A window (`offset`, `len`) over a reference-counted byte buffer.
///
/// Cloning or narrowing a `SharedData` only bumps the reference count of
/// `bytes`; the underlying buffer is never copied.
#[derive(Default, Clone)]
pub struct SharedData {
    /// The whole shared buffer.
    pub bytes: Arc<Vec<u8>>,
    /// Index in `bytes` of the first byte of the window.
    pub offset: usize,
    /// Number of bytes in the window.
    pub len: usize,
}

impl SharedData {
    /// Takes ownership of `vec` and exposes it entirely.
    pub fn from_bytes(vec: Vec<u8>) -> SharedData {
        let len = vec.len();
        SharedData::new(Arc::new(vec), 0, len)
    }

    /// Builds a window of `len` bytes starting at `offset` in `bytes`.
    ///
    /// The bounds are not checked here; an invalid window makes
    /// `as_slice` panic.
    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
        SharedData { bytes, offset, len }
    }

    /// Returns the bytes covered by this window.
    pub fn as_slice(&self) -> &[u8] {
        &self.bytes[self.offset..self.offset + self.len]
    }

    /// Returns a sub-window of `len` bytes starting `offset` bytes after the
    /// start of this window.
    ///
    /// # Panics
    ///
    /// Panics if the requested sub-window does not fit inside this window,
    /// including when `offset + len` overflows `usize`.
    pub fn range(&self, offset: usize, len: usize) -> SharedData {
        // A plain `offset + len` wraps around in release builds, which would
        // let a bogus window through the bound check.
        let in_bounds = offset
            .checked_add(len)
            .map_or(false, |end| end <= self.len);
        assert!(in_bounds, "requested range is out of the bounds of the shared data");

        SharedData {
            bytes: self.bytes.clone(),
            offset: self.offset + offset,
            len,
        }
    }
}
|
||||
|
||||
impl Deref for SharedData {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for SharedData {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
104
meilidb-core/src/distinct_map.rs
Normal file
104
meilidb-core/src/distinct_map.rs
Normal file
|
@ -0,0 +1,104 @@
|
|||
use std::hash::Hash;
|
||||
|
||||
use hashbrown::HashMap;
|
||||
|
||||
/// Remembers how many times each key has been accepted, along with the
/// total number of accepted registrations.
pub struct DistinctMap<K> {
    inner: HashMap<K, usize>,
    limit: usize,
    len: usize,
}

impl<K: Hash + Eq> DistinctMap<K> {
    /// Creates an empty map accepting each key at most `limit` times.
    pub fn new(limit: usize) -> Self {
        DistinctMap { inner: HashMap::new(), limit, len: 0 }
    }

    /// Number of registrations accepted so far.
    pub fn len(&self) -> usize {
        self.len
    }
}

/// Staging area on top of a `DistinctMap`: registrations are counted
/// against the underlying map but only written to it by
/// `transfert_to_internal`.
pub struct BufferedDistinctMap<'a, K> {
    internal: &'a mut DistinctMap<K>,
    inner: HashMap<K, usize>,
    len: usize,
}

impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> {
    /// Starts an empty buffer over `internal`.
    pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> {
        BufferedDistinctMap { internal, inner: HashMap::new(), len: 0 }
    }

    /// Tries to register `key` once more.
    ///
    /// Returns `true` and counts the registration when the key has been
    /// seen fewer than `limit` times (committed plus buffered),
    /// `false` otherwise.
    pub fn register(&mut self, key: K) -> bool {
        let committed = self.internal.inner.get(&key).map_or(0, |count| *count);
        let pending = self.inner.entry(key).or_insert(0);

        if committed + *pending >= self.internal.limit {
            return false;
        }

        *pending += 1;
        self.len += 1;
        true
    }

    /// Counts a registration that is not subject to any limit.
    pub fn register_without_key(&mut self) -> bool {
        self.len += 1;
        true
    }

    /// Moves every buffered count into the underlying map and
    /// empties the buffer.
    pub fn transfert_to_internal(&mut self) {
        for (key, count) in self.inner.drain() {
            *self.internal.inner.entry(key).or_insert(0) += count;
        }

        self.internal.len += self.len;
        self.len = 0;
    }

    /// Committed plus buffered number of accepted registrations.
    pub fn len(&self) -> usize {
        self.internal.len() + self.len
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn easy_distinct_map() {
        // With a limit of 2, only the first two occurrences of a key count.
        let mut map = DistinctMap::new(2);
        let mut buffered = BufferedDistinctMap::new(&mut map);

        let keys = [1, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6];
        for key in &keys {
            buffered.register(key);
        }
        buffered.transfert_to_internal();
        assert_eq!(map.len(), 8);

        let mut map = DistinctMap::new(2);
        let mut buffered = BufferedDistinctMap::new(&mut map);

        // (key, whether the registration must be accepted)
        let expectations = [
            (1, true),
            (1, true),
            (1, false),
            (1, false),
            (2, true),
            (3, true),
            (2, true),
            (2, false),
        ];
        for &(key, accepted) in &expectations {
            assert_eq!(buffered.register(key), accepted);
        }

        buffered.transfert_to_internal();
        assert_eq!(map.len(), 5);
    }
}
|
175
meilidb-core/src/index.rs
Normal file
175
meilidb-core/src/index.rs
Normal file
|
@ -0,0 +1,175 @@
|
|||
use std::error::Error;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::{map, Map, IntoStreamer, Streamer};
|
||||
use fst::raw::Fst;
|
||||
use sdset::duo::{Union, DifferenceByKey};
|
||||
use sdset::{Set, SetOperation};
|
||||
|
||||
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
||||
use crate::write_to_bytes::WriteToBytes;
|
||||
use crate::data::{DocIndexes, DocIndexesBuilder};
|
||||
use crate::{DocumentId, DocIndex};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Index {
|
||||
pub map: Map,
|
||||
pub indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Index {
|
||||
let mut buffer = Vec::new();
|
||||
let mut builder = IndexBuilder::new();
|
||||
let mut stream = self.into_stream();
|
||||
|
||||
while let Some((key, indexes)) = stream.next() {
|
||||
buffer.clear();
|
||||
|
||||
let op = DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x);
|
||||
op.extend_vec(&mut buffer);
|
||||
|
||||
if !buffer.is_empty() {
|
||||
let indexes = Set::new_unchecked(&buffer);
|
||||
builder.insert(key, indexes).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn union(&self, other: &Index) -> Index {
|
||||
let mut builder = IndexBuilder::new();
|
||||
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
while let Some((key, ivalues)) = stream.next() {
|
||||
buffer.clear();
|
||||
match ivalues {
|
||||
[a, b] => {
|
||||
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = &indexes[a.value as usize];
|
||||
let a = Set::new_unchecked(indexes);
|
||||
|
||||
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = &indexes[b.value as usize];
|
||||
let b = Set::new_unchecked(indexes);
|
||||
|
||||
let op = Union::new(a, b);
|
||||
op.extend_vec(&mut buffer);
|
||||
},
|
||||
[x] => {
|
||||
let indexes = if x.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = &indexes[x.value as usize];
|
||||
buffer.extend_from_slice(indexes)
|
||||
},
|
||||
_ => continue,
|
||||
}
|
||||
|
||||
if !buffer.is_empty() {
|
||||
let indexes = Set::new_unchecked(&buffer);
|
||||
builder.insert(key, indexes).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
builder.build()
|
||||
}
|
||||
}
|
||||
|
||||
impl FromSharedDataCursor for Index {
|
||||
type Error = Box<Error>;
|
||||
|
||||
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Index, Self::Error> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let data = cursor.extract(len);
|
||||
|
||||
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
|
||||
let map = Map::from(fst);
|
||||
|
||||
let indexes = DocIndexes::from_shared_data_cursor(cursor)?;
|
||||
|
||||
Ok(Index { map, indexes})
|
||||
}
|
||||
}
|
||||
|
||||
impl WriteToBytes for Index {
|
||||
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let slice = self.map.as_fst().as_bytes();
|
||||
let len = slice.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(len);
|
||||
bytes.extend_from_slice(slice);
|
||||
|
||||
self.indexes.write_to_bytes(bytes);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> IntoStreamer<'a> for &'m Index {
|
||||
type Item = (&'a [u8], &'a Set<DocIndex>);
|
||||
type Into = Stream<'m>;
|
||||
|
||||
fn into_stream(self) -> Self::Into {
|
||||
Stream {
|
||||
map_stream: self.map.into_stream(),
|
||||
indexes: &self.indexes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Stream<'m> {
|
||||
map_stream: map::Stream<'m>,
|
||||
indexes: &'m DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'a> Streamer<'a> for Stream<'m> {
|
||||
type Item = (&'a [u8], &'a Set<DocIndex>);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.map_stream.next() {
|
||||
Some((input, index)) => {
|
||||
let indexes = &self.indexes[index as usize];
|
||||
let indexes = Set::new_unchecked(indexes);
|
||||
Some((input, indexes))
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IndexBuilder {
|
||||
map: fst::MapBuilder<Vec<u8>>,
|
||||
indexes: DocIndexesBuilder<Vec<u8>>,
|
||||
value: u64,
|
||||
}
|
||||
|
||||
impl IndexBuilder {
|
||||
pub fn new() -> Self {
|
||||
IndexBuilder {
|
||||
map: fst::MapBuilder::memory(),
|
||||
indexes: DocIndexesBuilder::memory(),
|
||||
value: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// If a key is inserted that is less than or equal to any previous key added,
|
||||
/// then an error is returned. Similarly, if there was a problem writing
|
||||
/// to the underlying writer, an error is returned.
|
||||
// FIXME what if one write doesn't work but the other do ?
|
||||
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> fst::Result<()>
|
||||
where K: AsRef<[u8]>,
|
||||
{
|
||||
self.map.insert(key, self.value)?;
|
||||
self.indexes.insert(indexes);
|
||||
self.value += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn build(self) -> Index {
|
||||
let map = self.map.into_inner().unwrap();
|
||||
let indexes = self.indexes.into_inner().unwrap();
|
||||
|
||||
let map = Map::from_bytes(map).unwrap();
|
||||
let indexes = DocIndexes::from_bytes(indexes).unwrap();
|
||||
|
||||
Index { map, indexes }
|
||||
}
|
||||
}
|
297
meilidb-core/src/lib.rs
Normal file
297
meilidb-core/src/lib.rs
Normal file
|
@ -0,0 +1,297 @@
|
|||
pub mod criterion;
|
||||
pub mod data;
|
||||
mod index;
|
||||
mod automaton;
|
||||
mod query_builder;
|
||||
mod distinct_map;
|
||||
|
||||
pub mod shared_data_cursor;
|
||||
pub mod write_to_bytes;
|
||||
|
||||
use std::sync::Arc;
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
|
||||
use slice_group_by::GroupBy;
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
|
||||
pub use self::index::{Index, IndexBuilder};
|
||||
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
|
||||
|
||||
/// Represent an internally generated document unique identifier.
|
||||
///
|
||||
/// It is used to inform the database the document you want to deserialize.
|
||||
/// Helpful for custom ranking.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
pub struct DocumentId(pub u64);
|
||||
|
||||
/// This structure represent the position of a word
|
||||
/// in a document and its attributes.
|
||||
///
|
||||
/// This is stored in the map, generated at index time,
|
||||
/// extracted and interpreted at search time.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
#[repr(C)]
|
||||
pub struct DocIndex {
|
||||
/// The document identifier where the word was found.
|
||||
pub document_id: DocumentId,
|
||||
|
||||
/// The attribute in the document where the word was found
|
||||
/// along with the index in it.
|
||||
pub attribute: u16,
|
||||
pub word_index: u16,
|
||||
|
||||
/// The position in bytes where the word was found
|
||||
/// along with the length of it.
|
||||
///
|
||||
/// It informs on the original word area in the text indexed
|
||||
/// without needing to run the tokenizer again.
|
||||
pub char_index: u16,
|
||||
pub char_length: u16,
|
||||
}
|
||||
|
||||
/// A matching word, with information on where the word
/// is located in the document.
///
/// The order of the fields matters: it defines how these
/// structures are ordered between themselves.
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
    /// Index of the word in the query sentence.
    /// Same as the `attribute_index` but for the query words.
    ///
    /// Used to retrieve the automaton that matches this word.
    pub query_index: u32,

    /// Distance between the word and the query word
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// Attribute of the document in which the word was found,
    /// followed by the index of the word inside that attribute.
    pub attribute: u16,
    pub word_index: u16,

    /// Whether the matching word is an exact match or a prefix.
    pub is_exact: bool,

    /// Position (in bytes) at which the word was found,
    /// followed by its length.
    ///
    /// This locates the original word in the indexed text
    /// without having to run the tokenizer again.
    pub char_index: u16,
    pub char_length: u16,
}

impl Match {
    /// The match whose every field holds its smallest value.
    pub fn zero() -> Self {
        Self {
            query_index: 0,
            distance: 0,
            attribute: 0,
            word_index: 0,
            is_exact: false,
            char_index: 0,
            char_length: 0,
        }
    }

    /// The match whose every field holds its largest value.
    pub fn max() -> Self {
        Self {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: u16::max_value(),
            word_index: u16::max_value(),
            is_exact: true,
            char_index: u16::max_value(),
            char_length: u16::max_value(),
        }
    }
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct Document {
|
||||
pub id: DocumentId,
|
||||
pub matches: Vec<Match>,
|
||||
}
|
||||
|
||||
impl Document {
|
||||
fn from_raw(raw: &RawDocument) -> Document {
|
||||
let len = raw.matches.range.len();
|
||||
let mut matches = Vec::with_capacity(len);
|
||||
|
||||
let query_index = raw.query_index();
|
||||
let distance = raw.distance();
|
||||
let attribute = raw.attribute();
|
||||
let word_index = raw.word_index();
|
||||
let is_exact = raw.is_exact();
|
||||
let char_index = raw.char_index();
|
||||
let char_length = raw.char_length();
|
||||
|
||||
for i in 0..len {
|
||||
let match_ = Match {
|
||||
query_index: query_index[i],
|
||||
distance: distance[i],
|
||||
attribute: attribute[i],
|
||||
word_index: word_index[i],
|
||||
is_exact: is_exact[i],
|
||||
char_index: char_index[i],
|
||||
char_length: char_length[i],
|
||||
};
|
||||
matches.push(match_);
|
||||
}
|
||||
|
||||
Document { id: raw.id, matches }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RawDocument {
|
||||
pub id: DocumentId,
|
||||
pub matches: SharedMatches,
|
||||
}
|
||||
|
||||
impl RawDocument {
|
||||
fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
|
||||
RawDocument { id, matches: SharedMatches { range, matches } }
|
||||
}
|
||||
|
||||
pub fn query_index(&self) -> &[u32] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn distance(&self) -> &[u8] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn attribute(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn word_index(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn is_exact(&self) -> &[bool] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn char_index(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn char_length(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
|
||||
let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
|
||||
let mut matches2 = Matches::with_capacity(matches.len());
|
||||
|
||||
matches.par_sort_unstable();
|
||||
|
||||
for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
|
||||
let id = group[0].0;
|
||||
let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
|
||||
let end = start + group.len();
|
||||
docs_ranges.push((id, Range { start, end }));
|
||||
|
||||
matches2.extend_from_slice(group);
|
||||
}
|
||||
|
||||
let matches = Arc::new(matches2);
|
||||
docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
|
||||
}
|
||||
|
||||
/// Half-open span of positions, from `start` (included) to `end` (excluded).
#[derive(Debug, Copy, Clone)]
struct Range {
    start: usize,
    end: usize,
}

impl Range {
    /// Number of positions covered by the span.
    fn len(self) -> usize {
        let Range { start, end } = self;
        end - start
    }
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SharedMatches {
|
||||
range: Range,
|
||||
matches: Arc<Matches>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Matches {
|
||||
query_index: Vec<u32>,
|
||||
distance: Vec<u8>,
|
||||
attribute: Vec<u16>,
|
||||
word_index: Vec<u16>,
|
||||
is_exact: Vec<bool>,
|
||||
char_index: Vec<u16>,
|
||||
char_length: Vec<u16>,
|
||||
}
|
||||
|
||||
impl Matches {
|
||||
fn with_capacity(cap: usize) -> Matches {
|
||||
Matches {
|
||||
query_index: Vec::with_capacity(cap),
|
||||
distance: Vec::with_capacity(cap),
|
||||
attribute: Vec::with_capacity(cap),
|
||||
word_index: Vec::with_capacity(cap),
|
||||
is_exact: Vec::with_capacity(cap),
|
||||
char_index: Vec::with_capacity(cap),
|
||||
char_length: Vec::with_capacity(cap),
|
||||
}
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
|
||||
for (_, match_) in matches {
|
||||
self.query_index.push(match_.query_index);
|
||||
self.distance.push(match_.distance);
|
||||
self.attribute.push(match_.attribute);
|
||||
self.word_index.push(match_.word_index);
|
||||
self.is_exact.push(match_.is_exact);
|
||||
self.char_index.push(match_.char_index);
|
||||
self.char_length.push(match_.char_length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    /// Guards the in-memory size of `DocIndex`.
    #[test]
    fn docindex_mem_size() {
        // `DocIndex` is `#[repr(C)]`: one 8-byte document id followed by
        // four `u16` fields, which packs into 16 bytes without padding.
        assert_eq!(mem::size_of::<DocIndex>(), 16);
    }
}
|
357
meilidb-core/src/query_builder.rs
Normal file
357
meilidb-core/src/query_builder.rs
Normal file
|
@ -0,0 +1,357 @@
|
|||
use std::{cmp, mem};
|
||||
use std::ops::Range;
|
||||
use std::time::Instant;
|
||||
use std::hash::Hash;
|
||||
use std::rc::Rc;
|
||||
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
use slice_group_by::{GroupByMut, LinearStrGroupBy};
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
use fst::Streamer;
|
||||
use log::info;
|
||||
|
||||
use crate::automaton::{self, DfaExt, AutomatonExt};
|
||||
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
|
||||
use crate::criterion::Criteria;
|
||||
use crate::{raw_documents_from_matches, RawDocument, Document};
|
||||
use crate::{Index, Match, DocumentId};
|
||||
|
||||
// query splitting must move out of this crate
/// Tells whether `c` belongs to one of the CJK ranges handled by
/// the query splitting.
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}'     // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}'   // Kangxi Radicals
        | '\u{3040}'..='\u{309f}'   // Hiragana
        | '\u{30a0}'..='\u{30ff}'   // Katakana
        | '\u{3100}'..='\u{312f}'   // Bopomofo
        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
        | '\u{f900}'..='\u{faff}'   // CJK Compatibility Ideographs
        => true,
        _ => false,
    }
}
|
||||
|
||||
/// Category of a character, as seen by the query splitting.
#[derive(Debug, PartialEq, Eq)]
enum CharCategory {
    /// A whitespace character.
    Space,
    /// A character accepted by `is_cjk`.
    Cjk,
    /// Any other character.
    Other,
}
|
||||
|
||||
fn classify_char(c: char) -> CharCategory {
|
||||
if c.is_whitespace() { CharCategory::Space }
|
||||
else if is_cjk(c) { CharCategory::Cjk }
|
||||
else { CharCategory::Other }
|
||||
}
|
||||
|
||||
/// A group is a word when none of its characters is a whitespace.
fn is_word(s: &&str) -> bool {
    s.chars().all(|c| !c.is_whitespace())
}
|
||||
|
||||
fn same_group_category(a: char, b: char) -> bool {
|
||||
let ca = classify_char(a);
|
||||
let cb = classify_char(b);
|
||||
if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
|
||||
}
|
||||
|
||||
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let mut groups = LinearStrGroupBy::new(query, same_group_category)
|
||||
.filter(is_word)
|
||||
.map(str::to_lowercase)
|
||||
.peekable();
|
||||
|
||||
let mut automatons = Vec::new();
|
||||
while let Some(word) = groups.next() {
|
||||
let has_following_word = groups.peek().is_some();
|
||||
let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
|
||||
automaton::build_dfa(&word)
|
||||
} else {
|
||||
automaton::build_prefix_dfa(&word)
|
||||
};
|
||||
automatons.push(lev);
|
||||
}
|
||||
|
||||
automatons
|
||||
}
|
||||
|
||||
pub type FilterFunc = fn(DocumentId) -> bool;
|
||||
|
||||
pub struct QueryBuilder<'i, 'c, FI> {
|
||||
index: &'i Index,
|
||||
criteria: Criteria<'c>,
|
||||
searchable_attrs: Option<HashSet<u16>>,
|
||||
filter: Option<FI>,
|
||||
}
|
||||
|
||||
impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> {
|
||||
pub fn new(index: &'i Index) -> Self {
|
||||
QueryBuilder::with_criteria(index, Criteria::default())
|
||||
}
|
||||
|
||||
pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self {
|
||||
QueryBuilder { index, criteria, searchable_attrs: None, filter: None }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
|
||||
{
|
||||
pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F>
|
||||
where F: Fn(DocumentId) -> bool,
|
||||
{
|
||||
QueryBuilder {
|
||||
index: self.index,
|
||||
criteria: self.criteria,
|
||||
searchable_attrs: self.searchable_attrs,
|
||||
filter: Some(function)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F>
|
||||
where F: Fn(DocumentId) -> Option<K>,
|
||||
K: Hash + Eq,
|
||||
{
|
||||
DistinctQueryBuilder {
|
||||
inner: self,
|
||||
function: function,
|
||||
size: size
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_searchable_attribute(&mut self, attribute: u16) {
|
||||
let attributes = self.searchable_attrs.get_or_insert_with(HashSet::new);
|
||||
attributes.insert(attribute);
|
||||
}
|
||||
|
||||
fn query_all(&self, query: &str) -> Vec<RawDocument> {
|
||||
let automatons = split_whitespace_automatons(query);
|
||||
|
||||
let mut stream = {
|
||||
let mut op_builder = fst::map::OpBuilder::new();
|
||||
for automaton in &automatons {
|
||||
let stream = self.index.map.search(automaton);
|
||||
op_builder.push(stream);
|
||||
}
|
||||
op_builder.union()
|
||||
};
|
||||
|
||||
let mut matches = Vec::new();
|
||||
|
||||
while let Some((input, indexed_values)) = stream.next() {
|
||||
for iv in indexed_values {
|
||||
let automaton = &automatons[iv.index];
|
||||
let distance = automaton.eval(input).to_u8();
|
||||
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
||||
|
||||
let doc_indexes = &self.index.indexes;
|
||||
let doc_indexes = &doc_indexes[iv.value as usize];
|
||||
|
||||
for di in doc_indexes {
|
||||
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
|
||||
let match_ = Match {
|
||||
query_index: iv.index as u32,
|
||||
distance: distance,
|
||||
attribute: di.attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact: is_exact,
|
||||
char_index: di.char_index,
|
||||
char_length: di.char_length,
|
||||
};
|
||||
matches.push((di.document_id, match_));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let total_matches = matches.len();
|
||||
let raw_documents = raw_documents_from_matches(matches);
|
||||
|
||||
info!("{} total documents to classify", raw_documents.len());
|
||||
info!("{} total matches to classify", total_matches);
|
||||
|
||||
raw_documents
|
||||
}
|
||||
}
|
||||
|
||||
impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
|
||||
where FI: Fn(DocumentId) -> bool,
|
||||
{
|
||||
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
// We delegate the filter work to the distinct query builder,
|
||||
// specifying a distinct rule that has no effect.
|
||||
if self.filter.is_some() {
|
||||
let builder = self.with_distinct(|_| None as Option<()>, 1);
|
||||
return builder.query(query, range);
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
let mut documents = self.query_all(query);
|
||||
info!("query_all took {:.2?}", start.elapsed());
|
||||
|
||||
let mut groups = vec![documents.as_mut_slice()];
|
||||
|
||||
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
|
||||
let tmp_groups = mem::replace(&mut groups, Vec::new());
|
||||
let mut documents_seen = 0;
|
||||
|
||||
for group in tmp_groups {
|
||||
info!("criterion {}, documents group of size {}", ci, group.len());
|
||||
|
||||
// if this group does not overlap with the requested range,
|
||||
// push it without sorting and splitting it
|
||||
if documents_seen + group.len() < range.start {
|
||||
documents_seen += group.len();
|
||||
groups.push(group);
|
||||
continue;
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
|
||||
info!("criterion {} sort took {:.2?}", ci, start.elapsed());
|
||||
|
||||
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
|
||||
documents_seen += group.len();
|
||||
groups.push(group);
|
||||
|
||||
// we have sort enough documents if the last document sorted is after
|
||||
// the end of the requested range, we can continue to the next criterion
|
||||
if documents_seen >= range.end { continue 'criteria }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let offset = cmp::min(documents.len(), range.start);
|
||||
let iter = documents.into_iter().skip(offset).take(range.len());
|
||||
iter.map(|d| Document::from_raw(&d)).collect()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DistinctQueryBuilder<'i, 'c, FI, FD> {
|
||||
inner: QueryBuilder<'i, 'c, FI>,
|
||||
function: FD,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
|
||||
{
|
||||
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD>
|
||||
where F: Fn(DocumentId) -> bool,
|
||||
{
|
||||
DistinctQueryBuilder {
|
||||
inner: self.inner.with_filter(function),
|
||||
function: self.function,
|
||||
size: self.size
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_searchable_attribute(&mut self, attribute: u16) {
|
||||
self.inner.add_searchable_attribute(attribute);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD>
|
||||
where FI: Fn(DocumentId) -> bool,
|
||||
FD: Fn(DocumentId) -> Option<K>,
|
||||
K: Hash + Eq,
|
||||
{
|
||||
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
let start = Instant::now();
|
||||
let mut documents = self.inner.query_all(query);
|
||||
info!("query_all took {:.2?}", start.elapsed());
|
||||
|
||||
let mut groups = vec![documents.as_mut_slice()];
|
||||
let mut key_cache = HashMap::new();
|
||||
|
||||
let mut filter_map = HashMap::new();
|
||||
// these two variables informs on the current distinct map and
|
||||
// on the raw offset of the start of the group where the
|
||||
// range.start bound is located according to the distinct function
|
||||
let mut distinct_map = DistinctMap::new(self.size);
|
||||
let mut distinct_raw_offset = 0;
|
||||
|
||||
'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
|
||||
let tmp_groups = mem::replace(&mut groups, Vec::new());
|
||||
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
|
||||
let mut documents_seen = 0;
|
||||
|
||||
for group in tmp_groups {
|
||||
info!("criterion {}, documents group of size {}", ci, group.len());
|
||||
|
||||
// if this group does not overlap with the requested range,
|
||||
// push it without sorting and splitting it
|
||||
if documents_seen + group.len() < distinct_raw_offset {
|
||||
documents_seen += group.len();
|
||||
groups.push(group);
|
||||
continue;
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
|
||||
info!("criterion {} sort took {:.2?}", ci, start.elapsed());
|
||||
|
||||
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
|
||||
// we must compute the real distinguished len of this sub-group
|
||||
for document in group.iter() {
|
||||
let filter_accepted = match &self.inner.filter {
|
||||
Some(filter) => {
|
||||
let entry = filter_map.entry(document.id);
|
||||
*entry.or_insert_with(|| (filter)(document.id))
|
||||
},
|
||||
None => true,
|
||||
};
|
||||
|
||||
if filter_accepted {
|
||||
let entry = key_cache.entry(document.id);
|
||||
let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new));
|
||||
|
||||
match key.clone() {
|
||||
Some(key) => buf_distinct.register(key),
|
||||
None => buf_distinct.register_without_key(),
|
||||
};
|
||||
}
|
||||
|
||||
// the requested range end is reached: stop computing distinct
|
||||
if buf_distinct.len() >= range.end { break }
|
||||
}
|
||||
|
||||
documents_seen += group.len();
|
||||
groups.push(group);
|
||||
|
||||
// if this sub-group does not overlap with the requested range
|
||||
// we must update the distinct map and its start index
|
||||
if buf_distinct.len() < range.start {
|
||||
buf_distinct.transfert_to_internal();
|
||||
distinct_raw_offset = documents_seen;
|
||||
}
|
||||
|
||||
// we have sorted enough documents if the last document sorted is after
|
||||
// the end of the requested range, we can continue to the next criterion
|
||||
if buf_distinct.len() >= range.end { continue 'criteria }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut out_documents = Vec::with_capacity(range.len());
|
||||
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
|
||||
|
||||
for document in documents.into_iter().skip(distinct_raw_offset) {
|
||||
let filter_accepted = match &self.inner.filter {
|
||||
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
|
||||
None => true,
|
||||
};
|
||||
|
||||
if filter_accepted {
|
||||
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
|
||||
let distinct_accepted = match key {
|
||||
Some(key) => seen.register(key),
|
||||
None => seen.register_without_key(),
|
||||
};
|
||||
|
||||
if distinct_accepted && seen.len() > range.start {
|
||||
out_documents.push(Document::from_raw(&document));
|
||||
if out_documents.len() == range.len() { break }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out_documents
|
||||
}
|
||||
}
|
56
meilidb-core/src/shared_data_cursor.rs
Normal file
56
meilidb-core/src/shared_data_cursor.rs
Normal file
|
@ -0,0 +1,56 @@
|
|||
use std::io::{self, Read, Cursor, BufRead};
|
||||
use std::sync::Arc;
|
||||
use crate::data::SharedData;
|
||||
|
||||
/// A cursor over a `SharedData` buffer: a newtype around
/// `std::io::Cursor<SharedData>`.
pub struct SharedDataCursor(Cursor<SharedData>);
|
||||
|
||||
impl SharedDataCursor {
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::new(bytes);
|
||||
|
||||
SharedDataCursor::from_shared_bytes(bytes, 0, len)
|
||||
}
|
||||
|
||||
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor {
|
||||
let data = SharedData::new(bytes, offset, len);
|
||||
let cursor = Cursor::new(data);
|
||||
|
||||
SharedDataCursor(cursor)
|
||||
}
|
||||
|
||||
pub fn extract(&mut self, amt: usize) -> SharedData {
|
||||
let offset = self.0.position() as usize;
|
||||
let extracted = self.0.get_ref().range(offset, amt);
|
||||
self.0.consume(amt);
|
||||
|
||||
extracted
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for SharedDataCursor {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
self.0.read(buf)
|
||||
}
|
||||
}
|
||||
|
||||
impl BufRead for SharedDataCursor {
|
||||
fn fill_buf(&mut self) -> io::Result<&[u8]> {
|
||||
self.0.fill_buf()
|
||||
}
|
||||
|
||||
fn consume(&mut self, amt: usize) {
|
||||
self.0.consume(amt)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait FromSharedDataCursor: Sized {
|
||||
type Error;
|
||||
|
||||
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error>;
|
||||
|
||||
fn from_bytes(bytes: Vec<u8>) -> Result<Self, Self::Error> {
|
||||
let mut cursor = SharedDataCursor::from_bytes(bytes);
|
||||
Self::from_shared_data_cursor(&mut cursor)
|
||||
}
|
||||
}
|
9
meilidb-core/src/write_to_bytes.rs
Normal file
9
meilidb-core/src/write_to_bytes.rs
Normal file
|
@ -0,0 +1,9 @@
|
|||
/// Types that can serialize themselves into a byte buffer.
pub trait WriteToBytes {
    /// Writes the byte representation of `self` into `bytes`.
    fn write_to_bytes(&self, bytes: &mut Vec<u8>);

    /// Returns a new vector holding whatever
    /// [`WriteToBytes::write_to_bytes`] writes into an initially empty buffer.
    fn into_bytes(&self) -> Vec<u8> {
        let mut buffer = Vec::<u8>::new();
        Self::write_to_bytes(self, &mut buffer);
        buffer
    }
}
|
Loading…
Add table
Add a link
Reference in a new issue