chore: Move index related things to the meilidb-core workspace member

This commit is contained in:
Clément Renault 2019-02-24 19:44:24 +01:00
parent 3056b351fa
commit 14790eeae3
No known key found for this signature in database
GPG key ID: 0151CDAB43460DAE
44 changed files with 1343 additions and 252 deletions

21
meilidb-core/Cargo.toml Normal file
View file

@ -0,0 +1,21 @@
[package]
name = "meilidb-core"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
byteorder = "1.3.1"
fst = "0.3.3"
hashbrown = "0.1.8"
lazy_static = "1.2.0"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.6"
rayon = "1.0.3"
sdset = "0.3.1"
serde = "1.0.88"
serde_derive = "1.0.88"
slice-group-by = "0.2.4"
[features]
i128 = ["byteorder/i128"]

View file

@ -0,0 +1,91 @@
use fst::Automaton;
use lazy_static::lazy_static;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA, Distance,
};
lazy_static! {
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
pub struct DfaExt {
query_len: usize,
automaton: DFA,
}
impl Automaton for DfaExt {
type State = <DFA as Automaton>::State;
fn start(&self) -> Self::State {
self.automaton.start()
}
fn is_match(&self, state: &Self::State) -> bool {
self.automaton.is_match(state)
}
fn can_match(&self, state: &Self::State) -> bool {
self.automaton.can_match(state)
}
fn will_always_match(&self, state: &Self::State) -> bool {
self.automaton.will_always_match(state)
}
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
self.automaton.accept(state, byte)
}
}
impl AutomatonExt for DfaExt {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
self.automaton.eval(s)
}
fn query_len(&self) -> usize {
self.query_len
}
}
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,
}
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt {
use self::PrefixSetting::{Prefix, NoPrefix};
let dfa = match query.len() {
0 ..= 4 => match setting {
Prefix => LEVDIST0.build_prefix_dfa(query),
NoPrefix => LEVDIST0.build_dfa(query),
},
5 ..= 8 => match setting {
Prefix => LEVDIST1.build_prefix_dfa(query),
NoPrefix => LEVDIST1.build_dfa(query),
},
_ => match setting {
Prefix => LEVDIST2.build_prefix_dfa(query),
NoPrefix => LEVDIST2.build_dfa(query),
},
};
DfaExt { query_len: query.len(), automaton: dfa }
}
pub fn build_prefix_dfa(query: &str) -> DfaExt {
build_dfa_with_setting(query, PrefixSetting::Prefix)
}
pub fn build_dfa(query: &str) -> DfaExt {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
pub trait AutomatonExt: Automaton {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
fn query_len(&self) -> usize;
}

View file

@ -0,0 +1,12 @@
use std::cmp::Ordering;
use crate::criterion::Criterion;
use crate::RawDocument;
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;
impl Criterion for DocumentId {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
lhs.id.cmp(&rhs.id)
}
}

View file

@ -0,0 +1,39 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
let mut count = 0;
let mut index = 0;
for group in query_index.linear_group() {
let len = group.len();
count += is_exact[index..index + len].contains(&true) as usize;
index += len;
}
count
}
#[derive(Debug, Clone, Copy)]
pub struct Exact;
impl Criterion for Exact {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let is_exact = lhs.is_exact();
number_exact_matches(query_index, is_exact)
};
let rhs = {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
number_exact_matches(query_index, is_exact)
};
lhs.cmp(&rhs).reverse()
}
}

View file

@ -0,0 +1,112 @@
mod sum_of_typos;
mod number_of_words;
mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
// mod sort_by_attr;
mod document_id;
use std::cmp::Ordering;
use crate::RawDocument;
pub use self::{
sum_of_typos::SumOfTypos,
number_of_words::NumberOfWords,
words_proximity::WordsProximity,
sum_of_words_attribute::SumOfWordsAttribute,
sum_of_words_position::SumOfWordsPosition,
exact::Exact,
// sort_by_attr::SortByAttr,
document_id::DocumentId,
};
pub trait Criterion: Send + Sync {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
#[inline]
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs) == Ordering::Equal
}
}
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
impl<T: Criterion + ?Sized> Criterion for Box<T> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
#[derive(Default)]
pub struct CriteriaBuilder<'a> {
inner: Vec<Box<dyn Criterion + 'a>>
}
impl<'a> CriteriaBuilder<'a>
{
pub fn new() -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::new() }
}
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
}
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
}
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
where C: Criterion,
{
self.push(criterion);
self
}
pub fn push<C: 'a>(&mut self, criterion: C)
where C: Criterion,
{
self.inner.push(Box::new(criterion));
}
pub fn build(self) -> Criteria<'a> {
Criteria { inner: self.inner }
}
}
pub struct Criteria<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> Default for Criteria<'a> {
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(SumOfTypos)
.add(NumberOfWords)
.add(WordsProximity)
.add(SumOfWordsAttribute)
.add(SumOfWordsPosition)
.add(Exact)
.add(DocumentId)
.build()
}
}
impl<'a> AsRef<[Box<Criterion + 'a>]> for Criteria<'a> {
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
&self.inner
}
}

View file

@ -0,0 +1,27 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
query_index.linear_group().count()
}
#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;
impl Criterion for NumberOfWords {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
number_of_query_words(query_index)
};
let rhs = {
let query_index = rhs.query_index();
number_of_query_words(query_index)
};
lhs.cmp(&rhs).reverse()
}
}

View file

@ -0,0 +1,122 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use crate::database::schema::{Schema, SchemaAttr};
use crate::criterion::Criterion;
use crate::database::RankedMap;
use crate::RawDocument;
/// An helper struct that permit to sort documents by
/// some of their stored attributes.
///
/// # Note
///
/// If a document cannot be deserialized it will be considered [`None`][].
///
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
/// so you must check the [`Ord`] of `Option` implementation.
///
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
///
/// # Example
///
/// ```ignore
/// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*;
///
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
/// .add(SumOfTypos)
/// .add(NumberOfWords)
/// .add(WordsProximity)
/// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition)
/// .add(Exact)
/// .add(custom_ranking)
/// .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortByAttr<'a> {
ranked_map: &'a RankedMap,
attr: SchemaAttr,
reversed: bool,
}
impl<'a> SortByAttr<'a> {
pub fn lower_is_better(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
SortByAttr::new(ranked_map, schema, attr_name, false)
}
pub fn higher_is_better(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
SortByAttr::new(ranked_map, schema, attr_name, true)
}
fn new(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
reversed: bool,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
let attr = match schema.attribute(attr_name) {
Some(attr) => attr,
None => return Err(SortByAttrError::AttributeNotFound),
};
if !schema.props(attr).is_ranked() {
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
}
Ok(SortByAttr { ranked_map, attr, reversed })
}
}
impl<'a> Criterion for SortByAttr<'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(&(lhs.id, self.attr));
let rhs = self.ranked_map.get(&(rhs.id, self.attr));
match (lhs, rhs) {
(Some(lhs), Some(rhs)) => {
let order = lhs.cmp(&rhs);
if self.reversed { order.reverse() } else { order }
},
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
AttributeNotFound,
AttributeNotRegisteredForRanking,
}
impl fmt::Display for SortByAttrError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use SortByAttrError::*;
match self {
AttributeNotFound => f.write_str("attribute not found in the schema"),
AttributeNotRegisteredForRanking => f.write_str("attribute not registered for ranking"),
}
}
}
impl Error for SortByAttrError { }

View file

@ -0,0 +1,112 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
let mut number_words = 0;
let mut sum_typos = 0.0;
let mut index = 0;
for group in query_index.linear_group() {
sum_typos += custom_log10(distance[index]);
number_words += 1;
index += group.len();
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;
impl Criterion for SumOfTypos {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
sum_matches_typos(query_index, distance)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
sum_matches_typos(query_index, distance)
};
lhs.cmp(&rhs).reverse()
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test]
fn one_typo_reference() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0, 1];
let distance1 = &[1, 0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchette"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn no_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchztte"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn one_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 1];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View file

@ -0,0 +1,38 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
let mut sum_attributes = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_attributes += attribute[index] as usize;
index += group.len();
}
sum_attributes
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;
impl Criterion for SumOfWordsAttribute {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let attribute = lhs.attribute();
sum_matches_attributes(query_index, attribute)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
sum_matches_attributes(query_index, attribute)
};
lhs.cmp(&rhs)
}
}

View file

@ -0,0 +1,38 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
let mut sum_word_index = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_word_index += word_index[index] as usize;
index += group.len();
}
sum_word_index
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;
impl Criterion for SumOfWordsPosition {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let word_index = lhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let word_index = rhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
lhs.cmp(&rhs)
}
}

View file

@ -0,0 +1,151 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
const MAX_DISTANCE: u16 = 8;
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
(a.clone(), b.clone())
}
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
if lattr != rattr { return MAX_DISTANCE }
index_proximity(lwi, rwi)
}
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
let mut min_prox = u16::max_value();
for a in lattr.iter().zip(lwi) {
for b in rattr.iter().zip(rwi) {
let a = clone_tuple(a);
let b = clone_tuple(b);
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
}
}
min_prox
}
fn matches_proximity(
query_index: &[u32],
distance: &[u8],
attribute: &[u16],
word_index: &[u16],
) -> u16
{
let mut query_index_groups = query_index.linear_group();
let mut proximity = 0;
let mut index = 0;
let get_attr_wi = |index: usize, group_len: usize| {
// retrieve the first distance group (with the lowest values)
let len = distance[index..index + group_len].linear_group().next().unwrap().len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
(rattr, rwi)
};
let mut last = query_index_groups.next().map(|group| {
let attr_wi = get_attr_wi(index, group.len());
index += group.len();
attr_wi
});
// iter by windows of size 2
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
let attr_wi = get_attr_wi(index, rhs.len());
proximity += min_proximity(lhs, attr_wi);
last = Some(attr_wi);
index += rhs.len();
}
proximity
}
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;
impl Criterion for WordsProximity {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
let attribute = lhs.attribute();
let word_index = lhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
let attribute = rhs.attribute();
let word_index = rhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
lhs.cmp(&rhs)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn three_different_attributes() {
// "soup" "of the" "the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 0 }
// { id: 2, attr: 1, attr_index: 1 }
// { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 }
let query_index = &[0, 1, 2, 2, 3];
let distance = &[0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 2, 3];
let word_index = &[0, 0, 1, 0, 1];
// soup -> of = 8
// + of -> the = 1
// + the -> day = 8 (not 1)
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
}
#[test]
fn two_different_attributes() {
// "soup day" "soup of the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 0, attr: 1, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 1 }
// { id: 2, attr: 1, attr_index: 2 }
// { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 }
let query_index = &[0, 0, 1, 2, 3, 3];
let distance = &[0, 0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 1, 0, 1];
let word_index = &[0, 0, 1, 2, 1, 3];
// soup -> of = 1
// + of -> the = 1
// + the -> day = 1
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
}
}

View file

@ -0,0 +1,61 @@
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::error::Error;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::data::SharedData;
use crate::DocumentId;
use super::into_u8_slice;
#[derive(Default, Clone)]
pub struct DocIds(SharedData);
impl DocIds {
pub fn new(ids: &Set<DocumentId>) -> DocIds {
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
let data = SharedData::from_bytes(bytes.to_vec());
DocIds(data)
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl AsRef<Set<DocumentId>> for DocIds {
fn as_ref(&self) -> &Set<DocumentId> {
let slice = &self.0;
let ptr = slice.as_ptr() as *const DocumentId;
let len = slice.len() / size_of::<DocumentId>();
let slice = unsafe { from_raw_parts(ptr, len) };
Set::new_unchecked(slice)
}
}
impl FromSharedDataCursor for DocIds {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIds, Self::Error> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let data = cursor.extract(len);
Ok(DocIds(data))
}
}
impl WriteToBytes for DocIds {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let len = self.0.len() as u64;
bytes.write_u64::<LittleEndian>(len).unwrap();
bytes.extend_from_slice(&self.0);
}
}

View file

@ -0,0 +1,231 @@
use std::io::{self, Write};
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::ops::Index;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::data::SharedData;
use crate::DocIndex;
use super::into_u8_slice;
#[derive(Debug)]
#[repr(C)]
struct Range {
start: u64,
end: u64,
}
#[derive(Clone, Default)]
pub struct DocIndexes {
ranges: SharedData,
indexes: SharedData,
}
impl DocIndexes {
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
self.ranges().get(index).map(|Range { start, end }| {
let start = *start as usize;
let end = *end as usize;
let slice = &self.indexes()[start..end];
Set::new_unchecked(slice)
})
}
fn ranges(&self) -> &[Range] {
let slice = &self.ranges;
let ptr = slice.as_ptr() as *const Range;
let len = slice.len() / size_of::<Range>();
unsafe { from_raw_parts(ptr, len) }
}
fn indexes(&self) -> &[DocIndex] {
let slice = &self.indexes;
let ptr = slice.as_ptr() as *const DocIndex;
let len = slice.len() / size_of::<DocIndex>();
unsafe { from_raw_parts(ptr, len) }
}
}
impl Index<usize> for DocIndexes {
type Output = [DocIndex];
fn index(&self, index: usize) -> &Self::Output {
match self.get(index) {
Some(indexes) => indexes,
None => panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()),
}
}
}
impl FromSharedDataCursor for DocIndexes {
type Error = io::Error;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIndexes, Self::Error> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let ranges = cursor.extract(len);
let len = cursor.read_u64::<LittleEndian>()? as usize;
let indexes = cursor.extract(len);
Ok(DocIndexes { ranges, indexes })
}
}
impl WriteToBytes for DocIndexes {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let ranges_len = self.ranges.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
bytes.extend_from_slice(&self.ranges);
let indexes_len = self.indexes.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
bytes.extend_from_slice(&self.indexes);
}
}
pub struct DocIndexesBuilder<W> {
ranges: Vec<Range>,
indexes: Vec<DocIndex>,
wtr: W,
}
impl DocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: Vec::new(),
}
}
}
impl<W: Write> DocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
self.indexes.extend_from_slice(indexes);
}
pub fn finish(self) -> io::Result<()> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> io::Result<W> {
let ranges = unsafe { into_u8_slice(&self.ranges) };
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(ranges)?;
let indexes = unsafe { into_u8_slice(&self.indexes) };
let len = indexes.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(indexes)?;
Ok(self.wtr)
}
}
#[cfg(test)]
mod tests {
use std::error::Error;
use crate::DocumentId;
use super::*;
#[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex {
document_id: DocumentId(0),
attribute: 3,
word_index: 11,
char_index: 30,
char_length: 4,
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: 4,
word_index: 21,
char_index: 35,
char_length: 6,
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: 8,
word_index: 2,
char_index: 89,
char_length: 6,
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(bytes)?;
assert_eq!(docs.get(0), Some(Set::new(&[a])?));
assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
assert_eq!(docs.get(3), None);
Ok(())
}
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex {
document_id: DocumentId(0),
attribute: 3,
word_index: 11,
char_index: 30,
char_length: 4,
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: 4,
word_index: 21,
char_index: 35,
char_length: 6,
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: 8,
word_index: 2,
char_index: 89,
char_length: 6,
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let builder_bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
let mut bytes = Vec::new();
docs.write_to_bytes(&mut bytes);
assert_eq!(builder_bytes, bytes);
Ok(())
}
}

View file

@ -0,0 +1,16 @@
mod doc_ids;
mod doc_indexes;
mod shared_data;
use std::slice::from_raw_parts;
use std::mem::size_of;
pub use self::doc_ids::DocIds;
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
pub use self::shared_data::SharedData;
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}

View file

@ -0,0 +1,48 @@
use std::sync::Arc;
use std::ops::Deref;
#[derive(Default, Clone)]
pub struct SharedData {
pub bytes: Arc<Vec<u8>>,
pub offset: usize,
pub len: usize,
}
impl SharedData {
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
let len = vec.len();
let bytes = Arc::from(vec);
SharedData::new(bytes, 0, len)
}
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
SharedData { bytes, offset, len }
}
pub fn as_slice(&self) -> &[u8] {
&self.bytes[self.offset..self.offset + self.len]
}
pub fn range(&self, offset: usize, len: usize) -> SharedData {
assert!(offset + len <= self.len);
SharedData {
bytes: self.bytes.clone(),
offset: self.offset + offset,
len: len,
}
}
}
impl Deref for SharedData {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl AsRef<[u8]> for SharedData {
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}

View file

@ -0,0 +1,104 @@
use std::hash::Hash;
use hashbrown::HashMap;
pub struct DistinctMap<K> {
inner: HashMap<K, usize>,
limit: usize,
len: usize,
}
impl<K: Hash + Eq> DistinctMap<K> {
pub fn new(limit: usize) -> Self {
DistinctMap {
inner: HashMap::new(),
limit: limit,
len: 0,
}
}
pub fn len(&self) -> usize {
self.len
}
}
pub struct BufferedDistinctMap<'a, K> {
internal: &'a mut DistinctMap<K>,
inner: HashMap<K, usize>,
len: usize,
}
impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> {
pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> {
BufferedDistinctMap {
internal: internal,
inner: HashMap::new(),
len: 0,
}
}
pub fn register(&mut self, key: K) -> bool {
let internal_seen = self.internal.inner.get(&key).unwrap_or(&0);
let inner_seen = self.inner.entry(key).or_insert(0);
let seen = *internal_seen + *inner_seen;
if seen < self.internal.limit {
*inner_seen += 1;
self.len += 1;
true
} else {
false
}
}
pub fn register_without_key(&mut self) -> bool {
self.len += 1;
true
}
pub fn transfert_to_internal(&mut self) {
for (k, v) in self.inner.drain() {
let value = self.internal.inner.entry(k).or_insert(0);
*value += v;
}
self.internal.len += self.len;
self.len = 0;
}
pub fn len(&self) -> usize {
self.internal.len() + self.len
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy_distinct_map() {
let mut map = DistinctMap::new(2);
let mut buffered = BufferedDistinctMap::new(&mut map);
for x in &[1, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6] {
buffered.register(x);
}
buffered.transfert_to_internal();
assert_eq!(map.len(), 8);
let mut map = DistinctMap::new(2);
let mut buffered = BufferedDistinctMap::new(&mut map);
assert_eq!(buffered.register(1), true);
assert_eq!(buffered.register(1), true);
assert_eq!(buffered.register(1), false);
assert_eq!(buffered.register(1), false);
assert_eq!(buffered.register(2), true);
assert_eq!(buffered.register(3), true);
assert_eq!(buffered.register(2), true);
assert_eq!(buffered.register(2), false);
buffered.transfert_to_internal();
assert_eq!(map.len(), 5);
}
}

175
meilidb-core/src/index.rs Normal file
View file

@ -0,0 +1,175 @@
use std::error::Error;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{map, Map, IntoStreamer, Streamer};
use fst::raw::Fst;
use sdset::duo::{Union, DifferenceByKey};
use sdset::{Set, SetOperation};
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::data::{DocIndexes, DocIndexesBuilder};
use crate::{DocumentId, DocIndex};
#[derive(Default)]
pub struct Index {
pub map: Map,
pub indexes: DocIndexes,
}
impl Index {
pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Index {
let mut buffer = Vec::new();
let mut builder = IndexBuilder::new();
let mut stream = self.into_stream();
while let Some((key, indexes)) = stream.next() {
buffer.clear();
let op = DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x);
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes).unwrap();
}
}
builder.build()
}
pub fn union(&self, other: &Index) -> Index {
let mut builder = IndexBuilder::new();
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
let mut buffer = Vec::new();
while let Some((key, ivalues)) = stream.next() {
buffer.clear();
match ivalues {
[a, b] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = &indexes[a.value as usize];
let a = Set::new_unchecked(indexes);
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
let indexes = &indexes[b.value as usize];
let b = Set::new_unchecked(indexes);
let op = Union::new(a, b);
op.extend_vec(&mut buffer);
},
[x] => {
let indexes = if x.index == 0 { &self.indexes } else { &other.indexes };
let indexes = &indexes[x.value as usize];
buffer.extend_from_slice(indexes)
},
_ => continue,
}
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes).unwrap();
}
}
builder.build()
}
}
impl FromSharedDataCursor for Index {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Index, Self::Error> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let data = cursor.extract(len);
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
let map = Map::from(fst);
let indexes = DocIndexes::from_shared_data_cursor(cursor)?;
Ok(Index { map, indexes})
}
}
impl WriteToBytes for Index {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.map.as_fst().as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
self.indexes.write_to_bytes(bytes);
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m Index {
type Item = (&'a [u8], &'a Set<DocIndex>);
type Into = Stream<'m>;
fn into_stream(self) -> Self::Into {
Stream {
map_stream: self.map.into_stream(),
indexes: &self.indexes,
}
}
}
pub struct Stream<'m> {
map_stream: map::Stream<'m>,
indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for Stream<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let indexes = &self.indexes[index as usize];
let indexes = Set::new_unchecked(indexes);
Some((input, indexes))
},
None => None,
}
}
}
pub struct IndexBuilder {
map: fst::MapBuilder<Vec<u8>>,
indexes: DocIndexesBuilder<Vec<u8>>,
value: u64,
}
impl IndexBuilder {
pub fn new() -> Self {
IndexBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other do ?
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> fst::Result<()>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(indexes);
self.value += 1;
Ok(())
}
pub fn build(self) -> Index {
let map = self.map.into_inner().unwrap();
let indexes = self.indexes.into_inner().unwrap();
let map = Map::from_bytes(map).unwrap();
let indexes = DocIndexes::from_bytes(indexes).unwrap();
Index { map, indexes }
}
}

297
meilidb-core/src/lib.rs Normal file
View file

@ -0,0 +1,297 @@
pub mod criterion;
pub mod data;
mod index;
mod automaton;
mod query_builder;
mod distinct_map;
pub mod shared_data_cursor;
pub mod write_to_bytes;
use std::sync::Arc;
use serde_derive::{Serialize, Deserialize};
use slice_group_by::GroupBy;
use rayon::slice::ParallelSliceMut;
pub use self::index::{Index, IndexBuilder};
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(pub u64);
/// This structure represent the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
/// This structure represent a matching word with informations
/// on the location of the word in the document.
///
/// The order of the field is important because it defines
/// the way these structures are ordered between themselves.
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
///
/// Used to retrieve the automaton that match this word.
pub query_index: u32,
/// The distance the word has with the query word
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
/// Whether the word that match is an exact match or a prefix.
pub is_exact: bool,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
impl Match {
pub fn zero() -> Self {
Match {
query_index: 0,
distance: 0,
attribute: 0,
word_index: 0,
is_exact: false,
char_index: 0,
char_length: 0,
}
}
pub fn max() -> Self {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: u16::max_value(),
word_index: u16::max_value(),
is_exact: true,
char_index: u16::max_value(),
char_length: u16::max_value(),
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
pub id: DocumentId,
pub matches: Vec<Match>,
}
impl Document {
fn from_raw(raw: &RawDocument) -> Document {
let len = raw.matches.range.len();
let mut matches = Vec::with_capacity(len);
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
let char_index = raw.char_index();
let char_length = raw.char_length();
for i in 0..len {
let match_ = Match {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
char_index: char_index[i],
char_length: char_length[i],
};
matches.push(match_);
}
Document { id: raw.id, matches }
}
}
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
}
impl RawDocument {
fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
RawDocument { id, matches: SharedMatches { range, matches } }
}
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
pub fn char_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
}
pub fn char_length(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
}
}
pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
let mut matches2 = Matches::with_capacity(matches.len());
matches.par_sort_unstable();
for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
let id = group[0].0;
let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
let end = start + group.len();
docs_ranges.push((id, Range { start, end }));
matches2.extend_from_slice(group);
}
let matches = Arc::new(matches2);
docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
impl Range {
fn len(self) -> usize {
self.end - self.start
}
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u16>,
is_exact: Vec<bool>,
char_index: Vec<u16>,
char_length: Vec<u16>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
char_index: Vec::with_capacity(cap),
char_length: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
self.char_index.push(match_.char_index);
self.char_length.push(match_.char_length);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 24);
}
}

View file

@ -0,0 +1,357 @@
use std::{cmp, mem};
use std::ops::Range;
use std::time::Instant;
use std::hash::Hash;
use std::rc::Rc;
use rayon::slice::ParallelSliceMut;
use slice_group_by::{GroupByMut, LinearStrGroupBy};
use hashbrown::{HashMap, HashSet};
use fst::Streamer;
use log::info;
use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::criterion::Criteria;
use crate::{raw_documents_from_matches, RawDocument, Document};
use crate::{Index, Match, DocumentId};
// query splitting must move out of this crate
pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
}
#[derive(Debug, PartialEq, Eq)]
enum CharCategory {
Space,
Cjk,
Other,
}
fn classify_char(c: char) -> CharCategory {
if c.is_whitespace() { CharCategory::Space }
else if is_cjk(c) { CharCategory::Cjk }
else { CharCategory::Other }
}
fn is_word(s: &&str) -> bool {
!s.chars().any(char::is_whitespace)
}
fn same_group_category(a: char, b: char) -> bool {
let ca = classify_char(a);
let cb = classify_char(b);
if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
}
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let mut groups = LinearStrGroupBy::new(query, same_group_category)
.filter(is_word)
.map(str::to_lowercase)
.peekable();
let mut automatons = Vec::new();
while let Some(word) = groups.next() {
let has_following_word = groups.peek().is_some();
let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
automaton::build_dfa(&word)
} else {
automaton::build_prefix_dfa(&word)
};
automatons.push(lev);
}
automatons
}
pub type FilterFunc = fn(DocumentId) -> bool;
pub struct QueryBuilder<'i, 'c, FI> {
index: &'i Index,
criteria: Criteria<'c>,
searchable_attrs: Option<HashSet<u16>>,
filter: Option<FI>,
}
impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> {
pub fn new(index: &'i Index) -> Self {
QueryBuilder::with_criteria(index, Criteria::default())
}
pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self {
QueryBuilder { index, criteria, searchable_attrs: None, filter: None }
}
}
impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
{
pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F>
where F: Fn(DocumentId) -> bool,
{
QueryBuilder {
index: self.index,
criteria: self.criteria,
searchable_attrs: self.searchable_attrs,
filter: Some(function)
}
}
pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F>
where F: Fn(DocumentId) -> Option<K>,
K: Hash + Eq,
{
DistinctQueryBuilder {
inner: self,
function: function,
size: size
}
}
pub fn add_searchable_attribute(&mut self, attribute: u16) {
let attributes = self.searchable_attrs.get_or_insert_with(HashSet::new);
attributes.insert(attribute);
}
fn query_all(&self, query: &str) -> Vec<RawDocument> {
let automatons = split_whitespace_automatons(query);
let mut stream = {
let mut op_builder = fst::map::OpBuilder::new();
for automaton in &automatons {
let stream = self.index.map.search(automaton);
op_builder.push(stream);
}
op_builder.union()
};
let mut matches = Vec::new();
while let Some((input, indexed_values)) = stream.next() {
for iv in indexed_values {
let automaton = &automatons[iv.index];
let distance = automaton.eval(input).to_u8();
let is_exact = distance == 0 && input.len() == automaton.query_len();
let doc_indexes = &self.index.indexes;
let doc_indexes = &doc_indexes[iv.value as usize];
for di in doc_indexes {
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
let match_ = Match {
query_index: iv.index as u32,
distance: distance,
attribute: di.attribute,
word_index: di.word_index,
is_exact: is_exact,
char_index: di.char_index,
char_length: di.char_length,
};
matches.push((di.document_id, match_));
}
}
}
}
let total_matches = matches.len();
let raw_documents = raw_documents_from_matches(matches);
info!("{} total documents to classify", raw_documents.len());
info!("{} total matches to classify", total_matches);
raw_documents
}
}
impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
where FI: Fn(DocumentId) -> bool,
{
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
if self.filter.is_some() {
let builder = self.with_distinct(|_| None as Option<()>, 1);
return builder.query(query, range);
}
let start = Instant::now();
let mut documents = self.query_all(query);
info!("query_all took {:.2?}", start.elapsed());
let mut groups = vec![documents.as_mut_slice()];
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < range.start {
documents_seen += group.len();
groups.push(group);
continue;
}
let start = Instant::now();
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
info!("criterion {} sort took {:.2?}", ci, start.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
documents_seen += group.len();
groups.push(group);
// we have sort enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if documents_seen >= range.end { continue 'criteria }
}
}
}
let offset = cmp::min(documents.len(), range.start);
let iter = documents.into_iter().skip(offset).take(range.len());
iter.map(|d| Document::from_raw(&d)).collect()
}
}
pub struct DistinctQueryBuilder<'i, 'c, FI, FD> {
inner: QueryBuilder<'i, 'c, FI>,
function: FD,
size: usize,
}
impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
{
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD>
where F: Fn(DocumentId) -> bool,
{
DistinctQueryBuilder {
inner: self.inner.with_filter(function),
function: self.function,
size: self.size
}
}
pub fn add_searchable_attribute(&mut self, attribute: u16) {
self.inner.add_searchable_attribute(attribute);
}
}
impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD>
where FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<K>,
K: Hash + Eq,
{
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let start = Instant::now();
let mut documents = self.inner.query_all(query);
info!("query_all took {:.2?}", start.elapsed());
let mut groups = vec![documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let mut filter_map = HashMap::new();
// these two variables informs on the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(self.size);
let mut distinct_raw_offset = 0;
'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
let start = Instant::now();
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
info!("criterion {} sort took {:.2?}", ci, start.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let filter_accepted = match &self.inner.filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
},
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end { break }
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sort enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if buf_distinct.len() >= range.end { continue 'criteria }
}
}
}
let mut out_documents = Vec::with_capacity(range.len());
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
for document in documents.into_iter().skip(distinct_raw_offset) {
let filter_accepted = match &self.inner.filter {
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
out_documents.push(Document::from_raw(&document));
if out_documents.len() == range.len() { break }
}
}
}
out_documents
}
}

View file

@ -0,0 +1,56 @@
use std::io::{self, Read, Cursor, BufRead};
use std::sync::Arc;
use crate::data::SharedData;
pub struct SharedDataCursor(Cursor<SharedData>);
impl SharedDataCursor {
pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
let len = bytes.len();
let bytes = Arc::new(bytes);
SharedDataCursor::from_shared_bytes(bytes, 0, len)
}
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor {
let data = SharedData::new(bytes, offset, len);
let cursor = Cursor::new(data);
SharedDataCursor(cursor)
}
pub fn extract(&mut self, amt: usize) -> SharedData {
let offset = self.0.position() as usize;
let extracted = self.0.get_ref().range(offset, amt);
self.0.consume(amt);
extracted
}
}
impl Read for SharedDataCursor {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.0.read(buf)
}
}
impl BufRead for SharedDataCursor {
fn fill_buf(&mut self) -> io::Result<&[u8]> {
self.0.fill_buf()
}
fn consume(&mut self, amt: usize) {
self.0.consume(amt)
}
}
pub trait FromSharedDataCursor: Sized {
type Error;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error>;
fn from_bytes(bytes: Vec<u8>) -> Result<Self, Self::Error> {
let mut cursor = SharedDataCursor::from_bytes(bytes);
Self::from_shared_data_cursor(&mut cursor)
}
}

View file

@ -0,0 +1,9 @@
pub trait WriteToBytes {
fn write_to_bytes(&self, bytes: &mut Vec<u8>);
fn into_bytes(&self) -> Vec<u8> {
let mut bytes = Vec::new();
self.write_to_bytes(&mut bytes);
bytes
}
}