mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-02-08 19:43:28 +01:00
feat: Make the search algorithm become fully data oriented
This commit is contained in:
parent
a3a28c56fa
commit
455cbf3bf4
@ -21,6 +21,7 @@ serde_derive = "1.0"
|
|||||||
serde_json = { version = "1.0", features = ["preserve_order"] }
|
serde_json = { version = "1.0", features = ["preserve_order"] }
|
||||||
slice-group-by = "0.2"
|
slice-group-by = "0.2"
|
||||||
unidecode = "0.3"
|
unidecode = "0.3"
|
||||||
|
rayon = "1.0"
|
||||||
|
|
||||||
[dependencies.toml]
|
[dependencies.toml]
|
||||||
git = "https://github.com/Kerollmops/toml-rs.git"
|
git = "https://github.com/Kerollmops/toml-rs.git"
|
||||||
|
@ -1,33 +1,40 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use rocksdb::DB;
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::rank::{match_query_index, Document};
|
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::rank::criterion::Criterion;
|
||||||
use crate::database::DatabaseView;
|
use crate::rank::RawDocument;
|
||||||
use crate::Match;
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn contains_exact(matches: &&[Match]) -> bool {
|
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
|
||||||
matches.iter().any(|m| m.is_exact)
|
let mut count = 0;
|
||||||
}
|
let mut index = 0;
|
||||||
|
|
||||||
#[inline]
|
for group in query_index.linear_group_by(PartialEq::eq) {
|
||||||
fn number_exact_matches(matches: &[Match]) -> usize {
|
let len = group.len();
|
||||||
matches.linear_group_by(match_query_index).filter(contains_exact).count()
|
count += is_exact[index..index + len].contains(&true) as usize;
|
||||||
|
index += len;
|
||||||
|
}
|
||||||
|
|
||||||
|
count
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct Exact;
|
pub struct Exact;
|
||||||
|
|
||||||
impl<D> Criterion<D> for Exact
|
impl Criterion for Exact {
|
||||||
where D: Deref<Target=DB>
|
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||||
{
|
let lhs = {
|
||||||
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
|
let query_index = lhs.query_index();
|
||||||
let lhs = number_exact_matches(&lhs.matches);
|
let is_exact = lhs.is_exact();
|
||||||
let rhs = number_exact_matches(&rhs.matches);
|
number_exact_matches(query_index, is_exact)
|
||||||
|
};
|
||||||
|
|
||||||
|
let rhs = {
|
||||||
|
let query_index = rhs.query_index();
|
||||||
|
let is_exact = rhs.is_exact();
|
||||||
|
number_exact_matches(query_index, is_exact)
|
||||||
|
};
|
||||||
|
|
||||||
lhs.cmp(&rhs).reverse()
|
lhs.cmp(&rhs).reverse()
|
||||||
}
|
}
|
||||||
|
@ -4,16 +4,13 @@ mod words_proximity;
|
|||||||
mod sum_of_words_attribute;
|
mod sum_of_words_attribute;
|
||||||
mod sum_of_words_position;
|
mod sum_of_words_position;
|
||||||
mod exact;
|
mod exact;
|
||||||
mod sort_by;
|
// mod sort_by;
|
||||||
mod document_id;
|
mod document_id;
|
||||||
|
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use rocksdb::DB;
|
|
||||||
|
|
||||||
use crate::database::DatabaseView;
|
use crate::database::DatabaseView;
|
||||||
use crate::rank::Document;
|
use crate::rank::RawDocument;
|
||||||
|
|
||||||
pub use self::{
|
pub use self::{
|
||||||
sum_of_typos::SumOfTypos,
|
sum_of_typos::SumOfTypos,
|
||||||
|
@ -1,28 +1,28 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use rocksdb::DB;
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::rank::{match_query_index, Document};
|
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::rank::criterion::Criterion;
|
||||||
use crate::database::DatabaseView;
|
use crate::rank::RawDocument;
|
||||||
use crate::Match;
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn number_of_query_words(matches: &[Match]) -> usize {
|
fn number_of_query_words(query_index: &[u32]) -> usize {
|
||||||
matches.linear_group_by(match_query_index).count()
|
query_index.linear_group_by(PartialEq::eq).count()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct NumberOfWords;
|
pub struct NumberOfWords;
|
||||||
|
|
||||||
impl<D> Criterion<D> for NumberOfWords
|
impl Criterion for NumberOfWords {
|
||||||
where D: Deref<Target=DB>
|
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||||
{
|
let lhs = {
|
||||||
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
|
let query_index = lhs.query_index();
|
||||||
let lhs = number_of_query_words(&lhs.matches);
|
number_of_query_words(query_index)
|
||||||
let rhs = number_of_query_words(&rhs.matches);
|
};
|
||||||
|
let rhs = {
|
||||||
|
let query_index = rhs.query_index();
|
||||||
|
number_of_query_words(query_index)
|
||||||
|
};
|
||||||
|
|
||||||
lhs.cmp(&rhs).reverse()
|
lhs.cmp(&rhs).reverse()
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,7 @@ use serde::de::DeserializeOwned;
|
|||||||
|
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::rank::criterion::Criterion;
|
||||||
use crate::database::DatabaseView;
|
use crate::database::DatabaseView;
|
||||||
use crate::rank::Document;
|
use crate::rank::RawDocument;
|
||||||
|
|
||||||
/// A helper struct that permits sorting documents by
|
/// A helper struct that permits sorting documents by
|
||||||
/// some of their stored attributes.
|
/// some of their stored attributes.
|
||||||
|
@ -1,24 +1,20 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use rocksdb::DB;
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::rank::{match_query_index, Document};
|
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::rank::criterion::Criterion;
|
||||||
use crate::database::DatabaseView;
|
use crate::rank::RawDocument;
|
||||||
use crate::Match;
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_typos(matches: &[Match]) -> isize {
|
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> isize {
|
||||||
let mut sum_typos = 0;
|
let mut sum_typos = 0;
|
||||||
let mut number_words = 0;
|
let mut number_words = 0;
|
||||||
|
let mut index = 0;
|
||||||
|
|
||||||
// note that GroupBy will never return an empty group
|
for group in query_index.linear_group_by(PartialEq::eq) {
|
||||||
// so we can do this assumption safely
|
sum_typos += distance[index] as isize;
|
||||||
for group in matches.linear_group_by(match_query_index) {
|
|
||||||
sum_typos += unsafe { group.get_unchecked(0).distance as isize };
|
|
||||||
number_words += 1;
|
number_words += 1;
|
||||||
|
index += group.len();
|
||||||
}
|
}
|
||||||
|
|
||||||
sum_typos - number_words
|
sum_typos - number_words
|
||||||
@ -27,78 +23,42 @@ fn sum_matches_typos(matches: &[Match]) -> isize {
|
|||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct SumOfTypos;
|
pub struct SumOfTypos;
|
||||||
|
|
||||||
impl<D> Criterion<D> for SumOfTypos
|
impl Criterion for SumOfTypos {
|
||||||
where D: Deref<Target=DB>
|
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||||
{
|
let lhs = {
|
||||||
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
|
let query_index = lhs.query_index();
|
||||||
let lhs = sum_matches_typos(&lhs.matches);
|
let distance = lhs.distance();
|
||||||
let rhs = sum_matches_typos(&rhs.matches);
|
sum_matches_typos(query_index, distance)
|
||||||
|
};
|
||||||
|
|
||||||
|
let rhs = {
|
||||||
|
let query_index = rhs.query_index();
|
||||||
|
let distance = rhs.distance();
|
||||||
|
sum_matches_typos(query_index, distance)
|
||||||
|
};
|
||||||
|
|
||||||
lhs.cmp(&rhs)
|
lhs.cmp(&rhs)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
use crate::{DocumentId, Attribute, WordArea};
|
|
||||||
|
|
||||||
// typing: "Geox CEO"
|
// typing: "Geox CEO"
|
||||||
//
|
//
|
||||||
// doc0: "Geox SpA: CEO and Executive"
|
// doc0: "Geox SpA: CEO and Executive"
|
||||||
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
|
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
|
||||||
#[test]
|
#[test]
|
||||||
fn one_typo_reference() {
|
fn one_typo_reference() {
|
||||||
let doc0 = {
|
let query_index0 = &[0, 1];
|
||||||
let matches = vec![
|
let distance0 = &[0, 0];
|
||||||
Match {
|
|
||||||
query_index: 0,
|
|
||||||
distance: 0,
|
|
||||||
attribute: Attribute::new_faillible(0, 0),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
Match {
|
|
||||||
query_index: 1,
|
|
||||||
distance: 0,
|
|
||||||
attribute: Attribute::new_faillible(0, 2),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
];
|
|
||||||
Document {
|
|
||||||
id: DocumentId(0),
|
|
||||||
matches: matches,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let doc1 = {
|
let query_index1 = &[0, 1];
|
||||||
let matches = vec![
|
let distance1 = &[1, 0];
|
||||||
Match {
|
|
||||||
query_index: 0,
|
|
||||||
distance: 1,
|
|
||||||
attribute: Attribute::new_faillible(0, 0),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
Match {
|
|
||||||
query_index: 1,
|
|
||||||
distance: 0,
|
|
||||||
attribute: Attribute::new_faillible(0, 2),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
];
|
|
||||||
Document {
|
|
||||||
id: DocumentId(1),
|
|
||||||
matches: matches,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let lhs = sum_matches_typos(&doc0.matches);
|
let lhs = sum_matches_typos(query_index0, distance0);
|
||||||
let rhs = sum_matches_typos(&doc1.matches);
|
let rhs = sum_matches_typos(query_index1, distance1);
|
||||||
assert_eq!(lhs.cmp(&rhs), Ordering::Less);
|
assert_eq!(lhs.cmp(&rhs), Ordering::Less);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -108,47 +68,14 @@ mod tests {
|
|||||||
// doc1: "bouton"
|
// doc1: "bouton"
|
||||||
#[test]
|
#[test]
|
||||||
fn no_typo() {
|
fn no_typo() {
|
||||||
let doc0 = {
|
let query_index0 = &[0, 1];
|
||||||
let matches = vec![
|
let distance0 = &[0, 0];
|
||||||
Match {
|
|
||||||
query_index: 0,
|
|
||||||
distance: 0,
|
|
||||||
attribute: Attribute::new_faillible(0, 0),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
Match {
|
|
||||||
query_index: 1,
|
|
||||||
distance: 0,
|
|
||||||
attribute: Attribute::new_faillible(0, 1),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
];
|
|
||||||
Document {
|
|
||||||
id: DocumentId(0),
|
|
||||||
matches: matches,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let doc1 = {
|
let query_index1 = &[0];
|
||||||
let matches = vec![
|
let distance1 = &[0];
|
||||||
Match {
|
|
||||||
query_index: 0,
|
|
||||||
distance: 0,
|
|
||||||
attribute: Attribute::new_faillible(0, 0),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
];
|
|
||||||
Document {
|
|
||||||
id: DocumentId(1),
|
|
||||||
matches: matches,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let lhs = sum_matches_typos(&doc0.matches);
|
let lhs = sum_matches_typos(query_index0, distance0);
|
||||||
let rhs = sum_matches_typos(&doc1.matches);
|
let rhs = sum_matches_typos(query_index1, distance1);
|
||||||
assert_eq!(lhs.cmp(&rhs), Ordering::Less);
|
assert_eq!(lhs.cmp(&rhs), Ordering::Less);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -158,47 +85,14 @@ mod tests {
|
|||||||
// doc1: "bouton"
|
// doc1: "bouton"
|
||||||
#[test]
|
#[test]
|
||||||
fn one_typo() {
|
fn one_typo() {
|
||||||
let doc0 = {
|
let query_index0 = &[0, 1];
|
||||||
let matches = vec![
|
let distance0 = &[0, 1];
|
||||||
Match {
|
|
||||||
query_index: 0,
|
|
||||||
distance: 0,
|
|
||||||
attribute: Attribute::new_faillible(0, 0),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
Match {
|
|
||||||
query_index: 1,
|
|
||||||
distance: 1,
|
|
||||||
attribute: Attribute::new_faillible(0, 1),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
];
|
|
||||||
Document {
|
|
||||||
id: DocumentId(0),
|
|
||||||
matches: matches,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let doc1 = {
|
let query_index1 = &[0];
|
||||||
let matches = vec![
|
let distance1 = &[0];
|
||||||
Match {
|
|
||||||
query_index: 0,
|
|
||||||
distance: 0,
|
|
||||||
attribute: Attribute::new_faillible(0, 0),
|
|
||||||
is_exact: false,
|
|
||||||
word_area: WordArea::new_faillible(0, 6)
|
|
||||||
},
|
|
||||||
];
|
|
||||||
Document {
|
|
||||||
id: DocumentId(1),
|
|
||||||
matches: matches,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let lhs = sum_matches_typos(&doc0.matches);
|
let lhs = sum_matches_typos(query_index0, distance0);
|
||||||
let rhs = sum_matches_typos(&doc1.matches);
|
let rhs = sum_matches_typos(query_index1, distance1);
|
||||||
assert_eq!(lhs.cmp(&rhs), Ordering::Equal);
|
assert_eq!(lhs.cmp(&rhs), Ordering::Equal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,32 +1,39 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use rocksdb::DB;
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::database::DatabaseView;
|
|
||||||
use crate::rank::{match_query_index, Document};
|
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::rank::criterion::Criterion;
|
||||||
use crate::Match;
|
use crate::rank::RawDocument;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attributes(matches: &[Match]) -> usize {
|
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
||||||
// note that GroupBy will never return an empty group
|
let mut sum_attributes = 0;
|
||||||
// so we can do this assumption safely
|
let mut index = 0;
|
||||||
matches.linear_group_by(match_query_index).map(|group| {
|
|
||||||
unsafe { group.get_unchecked(0).attribute.attribute() as usize }
|
for group in query_index.linear_group_by(PartialEq::eq) {
|
||||||
}).sum()
|
sum_attributes += attribute[index] as usize;
|
||||||
|
index += group.len();
|
||||||
|
}
|
||||||
|
|
||||||
|
sum_attributes
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct SumOfWordsAttribute;
|
pub struct SumOfWordsAttribute;
|
||||||
|
|
||||||
impl<D> Criterion<D> for SumOfWordsAttribute
|
impl Criterion for SumOfWordsAttribute {
|
||||||
where D: Deref<Target=DB>
|
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||||
{
|
let lhs = {
|
||||||
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
|
let query_index = lhs.query_index();
|
||||||
let lhs = sum_matches_attributes(&lhs.matches);
|
let attribute = lhs.attribute();
|
||||||
let rhs = sum_matches_attributes(&rhs.matches);
|
sum_matches_attributes(query_index, attribute)
|
||||||
|
};
|
||||||
|
|
||||||
|
let rhs = {
|
||||||
|
let query_index = rhs.query_index();
|
||||||
|
let attribute = rhs.attribute();
|
||||||
|
sum_matches_attributes(query_index, attribute)
|
||||||
|
};
|
||||||
|
|
||||||
lhs.cmp(&rhs)
|
lhs.cmp(&rhs)
|
||||||
}
|
}
|
||||||
|
@ -1,32 +1,39 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use rocksdb::DB;
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::database::DatabaseView;
|
|
||||||
use crate::rank::{match_query_index, Document};
|
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::rank::criterion::Criterion;
|
||||||
use crate::Match;
|
use crate::rank::RawDocument;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attribute_index(matches: &[Match]) -> usize {
|
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
|
||||||
// note that GroupBy will never return an empty group
|
let mut sum_word_index = 0;
|
||||||
// so we can do this assumption safely
|
let mut index = 0;
|
||||||
matches.linear_group_by(match_query_index).map(|group| {
|
|
||||||
unsafe { group.get_unchecked(0).attribute.word_index() as usize }
|
for group in query_index.linear_group_by(PartialEq::eq) {
|
||||||
}).sum()
|
sum_word_index += word_index[index] as usize;
|
||||||
|
index += group.len();
|
||||||
|
}
|
||||||
|
|
||||||
|
sum_word_index
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct SumOfWordsPosition;
|
pub struct SumOfWordsPosition;
|
||||||
|
|
||||||
impl<D> Criterion<D> for SumOfWordsPosition
|
impl Criterion for SumOfWordsPosition {
|
||||||
where D: Deref<Target=DB>
|
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||||
{
|
let lhs = {
|
||||||
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
|
let query_index = lhs.query_index();
|
||||||
let lhs = sum_matches_attribute_index(&lhs.matches);
|
let word_index = lhs.word_index();
|
||||||
let rhs = sum_matches_attribute_index(&rhs.matches);
|
sum_matches_attribute_index(query_index, word_index)
|
||||||
|
};
|
||||||
|
|
||||||
|
let rhs = {
|
||||||
|
let query_index = rhs.query_index();
|
||||||
|
let word_index = rhs.word_index();
|
||||||
|
sum_matches_attribute_index(query_index, word_index)
|
||||||
|
};
|
||||||
|
|
||||||
lhs.cmp(&rhs)
|
lhs.cmp(&rhs)
|
||||||
}
|
}
|
||||||
|
@ -1,16 +1,17 @@
|
|||||||
use std::cmp::{self, Ordering};
|
use std::cmp::{self, Ordering};
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use rocksdb::DB;
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::rank::{match_query_index, Document};
|
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::rank::criterion::Criterion;
|
||||||
use crate::database::DatabaseView;
|
use crate::rank::RawDocument;
|
||||||
use crate::Match;
|
|
||||||
|
|
||||||
const MAX_DISTANCE: u32 = 8;
|
const MAX_DISTANCE: u32 = 8;
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
|
||||||
|
(a.clone(), b.clone())
|
||||||
|
}
|
||||||
|
|
||||||
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||||
if lhs < rhs {
|
if lhs < rhs {
|
||||||
cmp::min(rhs - lhs, MAX_DISTANCE)
|
cmp::min(rhs - lhs, MAX_DISTANCE)
|
||||||
@ -19,30 +20,48 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
|
fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
|
||||||
if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE }
|
if lattr != rattr { return MAX_DISTANCE }
|
||||||
index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index())
|
index_proximity(lwi, rwi)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
|
fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
|
||||||
let mut min_prox = u32::max_value();
|
let mut min_prox = u32::max_value();
|
||||||
for a in lhs {
|
for a in lattr.iter().zip(lwi) {
|
||||||
for b in rhs {
|
for b in rattr.iter().zip(rwi) {
|
||||||
|
let a = clone_tuple(a);
|
||||||
|
let b = clone_tuple(b);
|
||||||
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
|
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
min_prox
|
min_prox
|
||||||
}
|
}
|
||||||
|
|
||||||
fn matches_proximity(matches: &[Match]) -> u32 {
|
fn matches_proximity(query_index: &[u32], attribute: &[u16], word_index: &[u32]) -> u32 {
|
||||||
let mut proximity = 0;
|
let mut proximity = 0;
|
||||||
let mut iter = matches.linear_group_by(match_query_index);
|
|
||||||
|
|
||||||
// iterate over groups by windows of size 2
|
let mut index = 0;
|
||||||
let mut last = iter.next();
|
let mut iter = query_index.linear_group_by(PartialEq::eq);
|
||||||
|
let mut last = iter.next().map(|group| {
|
||||||
|
let len = group.len();
|
||||||
|
|
||||||
|
let rattr = &attribute[index..index + len];
|
||||||
|
let rwi = &word_index[index..index + len];
|
||||||
|
index += len;
|
||||||
|
|
||||||
|
(rattr, rwi)
|
||||||
|
});
|
||||||
|
|
||||||
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
|
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
|
||||||
|
let len = rhs.len();
|
||||||
|
|
||||||
|
let rattr = &attribute[index..index + len];
|
||||||
|
let rwi = &word_index[index..index + len];
|
||||||
|
let rhs = (rattr, rwi);
|
||||||
|
|
||||||
proximity += min_proximity(lhs, rhs);
|
proximity += min_proximity(lhs, rhs);
|
||||||
last = Some(rhs);
|
last = Some(rhs);
|
||||||
|
index += len;
|
||||||
}
|
}
|
||||||
|
|
||||||
proximity
|
proximity
|
||||||
@ -51,18 +70,26 @@ fn matches_proximity(matches: &[Match]) -> u32 {
|
|||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct WordsProximity;
|
pub struct WordsProximity;
|
||||||
|
|
||||||
impl<D> Criterion<D> for WordsProximity
|
impl Criterion for WordsProximity {
|
||||||
where D: Deref<Target=DB>
|
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||||
{
|
let lhs = {
|
||||||
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
|
let query_index = lhs.query_index();
|
||||||
let lhs = matches_proximity(&lhs.matches);
|
let attribute = lhs.attribute();
|
||||||
let rhs = matches_proximity(&rhs.matches);
|
let word_index = lhs.word_index();
|
||||||
|
matches_proximity(query_index, attribute, word_index)
|
||||||
|
};
|
||||||
|
|
||||||
|
let rhs = {
|
||||||
|
let query_index = rhs.query_index();
|
||||||
|
let attribute = rhs.attribute();
|
||||||
|
let word_index = rhs.word_index();
|
||||||
|
matches_proximity(query_index, attribute, word_index)
|
||||||
|
};
|
||||||
|
|
||||||
lhs.cmp(&rhs)
|
lhs.cmp(&rhs)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@ -80,18 +107,14 @@ mod tests {
|
|||||||
// { id: 2, attr: 2, attr_index: 0 }
|
// { id: 2, attr: 2, attr_index: 0 }
|
||||||
// { id: 3, attr: 3, attr_index: 1 }
|
// { id: 3, attr: 3, attr_index: 1 }
|
||||||
|
|
||||||
let matches = &[
|
let query_index = &[0, 1, 2, 2, 3];
|
||||||
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
|
let attribute = &[0, 1, 1, 2, 3];
|
||||||
Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
|
let word_index = &[0, 0, 1, 0, 1];
|
||||||
Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
|
|
||||||
Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() },
|
|
||||||
Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() },
|
|
||||||
];
|
|
||||||
|
|
||||||
// soup -> of = 8
|
// soup -> of = 8
|
||||||
// + of -> the = 1
|
// + of -> the = 1
|
||||||
// + the -> day = 8 (not 1)
|
// + the -> day = 8 (not 1)
|
||||||
assert_eq!(matches_proximity(matches), 17);
|
assert_eq!(matches_proximity(query_index, attribute, word_index), 17);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -106,19 +129,14 @@ mod tests {
|
|||||||
// { id: 3, attr: 0, attr_index: 1 }
|
// { id: 3, attr: 0, attr_index: 1 }
|
||||||
// { id: 3, attr: 1, attr_index: 3 }
|
// { id: 3, attr: 1, attr_index: 3 }
|
||||||
|
|
||||||
let matches = &[
|
let query_index = &[0, 0, 1, 2, 3, 3];
|
||||||
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
|
let attribute = &[0, 1, 1, 1, 0, 1];
|
||||||
Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
|
let word_index = &[0, 0, 1, 2, 1, 3];
|
||||||
Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
|
|
||||||
Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() },
|
|
||||||
Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() },
|
|
||||||
Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() },
|
|
||||||
];
|
|
||||||
|
|
||||||
// soup -> of = 1
|
// soup -> of = 1
|
||||||
// + of -> the = 1
|
// + of -> the = 1
|
||||||
// + the -> day = 1
|
// + the -> day = 1
|
||||||
assert_eq!(matches_proximity(matches), 3);
|
assert_eq!(matches_proximity(query_index, attribute, word_index), 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
180
src/rank/mod.rs
180
src/rank/mod.rs
@ -2,32 +2,182 @@ pub mod criterion;
|
|||||||
mod query_builder;
|
mod query_builder;
|
||||||
mod distinct_map;
|
mod distinct_map;
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use slice_group_by::GroupBy;
|
||||||
|
use rayon::slice::ParallelSliceMut;
|
||||||
|
|
||||||
use crate::{Match, DocumentId};
|
use crate::{Match, DocumentId};
|
||||||
|
|
||||||
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
|
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
|
||||||
|
|
||||||
#[inline]
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
fn match_query_index(a: &Match, b: &Match) -> bool {
|
|
||||||
a.query_index == b.query_index
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct Document {
|
pub struct Document {
|
||||||
pub id: DocumentId,
|
pub id: DocumentId,
|
||||||
pub matches: Vec<Match>,
|
pub matches: Vec<Match>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Document {
|
impl Document {
|
||||||
pub fn new(doc: DocumentId, match_: Match) -> Self {
|
pub fn from_raw(raw: &RawDocument) -> Document {
|
||||||
unsafe { Self::from_sorted_matches(doc, vec![match_]) }
|
let len = raw.matches.range.len();
|
||||||
}
|
let mut matches = Vec::with_capacity(len);
|
||||||
|
|
||||||
pub fn from_matches(doc: DocumentId, mut matches: Vec<Match>) -> Self {
|
let query_index = raw.query_index();
|
||||||
matches.sort_unstable();
|
let distance = raw.distance();
|
||||||
unsafe { Self::from_sorted_matches(doc, matches) }
|
let attribute = raw.attribute();
|
||||||
}
|
let word_index = raw.word_index();
|
||||||
|
let is_exact = raw.is_exact();
|
||||||
|
let char_index = raw.char_index();
|
||||||
|
let char_length = raw.char_length();
|
||||||
|
|
||||||
pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec<Match>) -> Self {
|
for i in 0..len {
|
||||||
Self { id, matches }
|
let match_ = Match {
|
||||||
|
query_index: query_index[i],
|
||||||
|
distance: distance[i],
|
||||||
|
attribute: attribute[i],
|
||||||
|
word_index: word_index[i],
|
||||||
|
is_exact: is_exact[i],
|
||||||
|
char_index: char_index[i],
|
||||||
|
char_length: char_length[i],
|
||||||
|
};
|
||||||
|
matches.push(match_);
|
||||||
|
}
|
||||||
|
|
||||||
|
Document { id: raw.id, matches }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct RawDocument {
|
||||||
|
pub id: DocumentId,
|
||||||
|
pub matches: SharedMatches,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RawDocument {
|
||||||
|
fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
|
||||||
|
RawDocument { id, matches: SharedMatches { range, matches } }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn query_index(&self) -> &[u32] {
|
||||||
|
let r = self.matches.range;
|
||||||
|
// it is safe because construction/modifications
|
||||||
|
// can only be done in this module
|
||||||
|
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn distance(&self) -> &[u8] {
|
||||||
|
let r = self.matches.range;
|
||||||
|
// it is safe because construction/modifications
|
||||||
|
// can only be done in this module
|
||||||
|
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn attribute(&self) -> &[u16] {
|
||||||
|
let r = self.matches.range;
|
||||||
|
// it is safe because construction/modifications
|
||||||
|
// can only be done in this module
|
||||||
|
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn word_index(&self) -> &[u32] {
|
||||||
|
let r = self.matches.range;
|
||||||
|
// it is safe because construction/modifications
|
||||||
|
// can only be done in this module
|
||||||
|
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_exact(&self) -> &[bool] {
|
||||||
|
let r = self.matches.range;
|
||||||
|
// it is safe because construction/modifications
|
||||||
|
// can only be done in this module
|
||||||
|
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn char_index(&self) -> &[u32] {
|
||||||
|
let r = self.matches.range;
|
||||||
|
// it is safe because construction/modifications
|
||||||
|
// can only be done in this module
|
||||||
|
unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn char_length(&self) -> &[u16] {
|
||||||
|
let r = self.matches.range;
|
||||||
|
// it is safe because construction/modifications
|
||||||
|
// can only be done in this module
|
||||||
|
unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
|
||||||
|
let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
|
||||||
|
let mut matches2 = Matches::with_capacity(matches.len());
|
||||||
|
|
||||||
|
matches.par_sort_unstable();
|
||||||
|
|
||||||
|
for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
|
||||||
|
let id = group[0].0;
|
||||||
|
let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
|
||||||
|
let end = start + group.len();
|
||||||
|
docs_ranges.push((id, Range { start, end }));
|
||||||
|
|
||||||
|
matches2.extend_from_slice(group);
|
||||||
|
}
|
||||||
|
|
||||||
|
let matches = Arc::new(matches2);
|
||||||
|
docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Copy, Clone)]
|
||||||
|
struct Range {
|
||||||
|
start: usize,
|
||||||
|
end: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Range {
|
||||||
|
fn len(self) -> usize {
|
||||||
|
self.end - self.start
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct SharedMatches {
|
||||||
|
range: Range,
|
||||||
|
matches: Arc<Matches>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
struct Matches {
|
||||||
|
query_index: Vec<u32>,
|
||||||
|
distance: Vec<u8>,
|
||||||
|
attribute: Vec<u16>,
|
||||||
|
word_index: Vec<u32>,
|
||||||
|
is_exact: Vec<bool>,
|
||||||
|
char_index: Vec<u32>,
|
||||||
|
char_length: Vec<u16>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Matches {
|
||||||
|
fn with_capacity(cap: usize) -> Matches {
|
||||||
|
Matches {
|
||||||
|
query_index: Vec::with_capacity(cap),
|
||||||
|
distance: Vec::with_capacity(cap),
|
||||||
|
attribute: Vec::with_capacity(cap),
|
||||||
|
word_index: Vec::with_capacity(cap),
|
||||||
|
is_exact: Vec::with_capacity(cap),
|
||||||
|
char_index: Vec::with_capacity(cap),
|
||||||
|
char_length: Vec::with_capacity(cap),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
|
||||||
|
for (_, match_) in matches {
|
||||||
|
self.query_index.push(match_.query_index);
|
||||||
|
self.distance.push(match_.distance);
|
||||||
|
self.attribute.push(match_.attribute);
|
||||||
|
self.word_index.push(match_.word_index);
|
||||||
|
self.is_exact.push(match_.is_exact);
|
||||||
|
self.char_index.push(match_.char_index);
|
||||||
|
self.char_length.push(match_.char_length);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4,7 +4,9 @@ use std::error::Error;
|
|||||||
use std::hash::Hash;
|
use std::hash::Hash;
|
||||||
use std::rc::Rc;
|
use std::rc::Rc;
|
||||||
|
|
||||||
|
use rayon::slice::ParallelSliceMut;
|
||||||
use slice_group_by::GroupByMut;
|
use slice_group_by::GroupByMut;
|
||||||
|
use elapsed::measure_time;
|
||||||
use hashbrown::HashMap;
|
use hashbrown::HashMap;
|
||||||
use fst::Streamer;
|
use fst::Streamer;
|
||||||
use rocksdb::DB;
|
use rocksdb::DB;
|
||||||
@ -15,7 +17,7 @@ use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
|
|||||||
use crate::rank::criterion::Criteria;
|
use crate::rank::criterion::Criteria;
|
||||||
use crate::database::DatabaseView;
|
use crate::database::DatabaseView;
|
||||||
use crate::{Match, DocumentId};
|
use crate::{Match, DocumentId};
|
||||||
use crate::rank::Document;
|
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
|
||||||
|
|
||||||
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
|
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
@ -81,7 +83,7 @@ where D: Deref<Target=DB>,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn query_all(&self, query: &str) -> Vec<Document> {
|
fn query_all(&self, query: &str) -> Vec<RawDocument> {
|
||||||
let automatons = split_whitespace_automatons(query);
|
let automatons = split_whitespace_automatons(query);
|
||||||
|
|
||||||
let mut stream = {
|
let mut stream = {
|
||||||
@ -94,7 +96,7 @@ where D: Deref<Target=DB>,
|
|||||||
};
|
};
|
||||||
|
|
||||||
let mut number_matches = 0;
|
let mut number_matches = 0;
|
||||||
let mut matches = HashMap::new();
|
let mut matches = Vec::new();
|
||||||
|
|
||||||
while let Some((input, indexed_values)) = stream.next() {
|
while let Some((input, indexed_values)) = stream.next() {
|
||||||
for iv in indexed_values {
|
for iv in indexed_values {
|
||||||
@ -105,7 +107,6 @@ where D: Deref<Target=DB>,
|
|||||||
let doc_indexes = &self.view.index().positive.indexes();
|
let doc_indexes = &self.view.index().positive.indexes();
|
||||||
let doc_indexes = &doc_indexes[iv.value as usize];
|
let doc_indexes = &doc_indexes[iv.value as usize];
|
||||||
|
|
||||||
number_matches += doc_indexes.len();
|
|
||||||
for doc_index in doc_indexes {
|
for doc_index in doc_indexes {
|
||||||
let match_ = Match {
|
let match_ = Match {
|
||||||
query_index: iv.index as u32,
|
query_index: iv.index as u32,
|
||||||
@ -116,15 +117,18 @@ where D: Deref<Target=DB>,
|
|||||||
char_index: doc_index.char_index,
|
char_index: doc_index.char_index,
|
||||||
char_length: doc_index.char_length,
|
char_length: doc_index.char_length,
|
||||||
};
|
};
|
||||||
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
|
matches.push((doc_index.document_id, match_));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("{} total documents to classify", matches.len());
|
let total_matches = matches.len();
|
||||||
info!("{} total matches to classify", number_matches);
|
let raw_documents = raw_documents_from_matches(matches);
|
||||||
|
|
||||||
matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect()
|
info!("{} total documents to classify", raw_documents.len());
|
||||||
|
info!("{} total matches to classify", total_matches);
|
||||||
|
|
||||||
|
raw_documents
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -140,7 +144,7 @@ where D: Deref<Target=DB>,
|
|||||||
return builder.query(query, range);
|
return builder.query(query, range);
|
||||||
}
|
}
|
||||||
|
|
||||||
let (elapsed, mut documents) = elapsed::measure_time(|| self.query_all(query));
|
let (elapsed, mut documents) = measure_time(|| self.query_all(query));
|
||||||
info!("query_all took {}", elapsed);
|
info!("query_all took {}", elapsed);
|
||||||
|
|
||||||
let mut groups = vec![documents.as_mut_slice()];
|
let mut groups = vec![documents.as_mut_slice()];
|
||||||
@ -177,12 +181,9 @@ where D: Deref<Target=DB>,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// `drain` removes the documents efficiently using `ptr::copy`
|
|
||||||
// TODO it could be more efficient to have a custom iterator
|
|
||||||
let offset = cmp::min(documents.len(), range.start);
|
let offset = cmp::min(documents.len(), range.start);
|
||||||
documents.drain(0..offset);
|
let iter = documents.into_iter().skip(offset).take(range.len());
|
||||||
documents.truncate(range.len());
|
iter.map(|d| Document::from_raw(&d)).collect()
|
||||||
documents
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -215,7 +216,9 @@ where D: Deref<Target=DB>,
|
|||||||
K: Hash + Eq,
|
K: Hash + Eq,
|
||||||
{
|
{
|
||||||
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
|
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||||
let mut documents = self.inner.query_all(query);
|
let (elapsed, mut documents) = measure_time(|| self.inner.query_all(query));
|
||||||
|
info!("query_all took {}", elapsed);
|
||||||
|
|
||||||
let mut groups = vec![documents.as_mut_slice()];
|
let mut groups = vec![documents.as_mut_slice()];
|
||||||
let mut key_cache = HashMap::new();
|
let mut key_cache = HashMap::new();
|
||||||
let view = &self.inner.view;
|
let view = &self.inner.view;
|
||||||
@ -227,12 +230,14 @@ where D: Deref<Target=DB>,
|
|||||||
let mut distinct_map = DistinctMap::new(self.size);
|
let mut distinct_map = DistinctMap::new(self.size);
|
||||||
let mut distinct_raw_offset = 0;
|
let mut distinct_raw_offset = 0;
|
||||||
|
|
||||||
'criteria: for criterion in self.inner.criteria.as_ref() {
|
'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
|
||||||
let tmp_groups = mem::replace(&mut groups, Vec::new());
|
let tmp_groups = mem::replace(&mut groups, Vec::new());
|
||||||
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
|
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
|
||||||
let mut documents_seen = 0;
|
let mut documents_seen = 0;
|
||||||
|
|
||||||
for group in tmp_groups {
|
for group in tmp_groups {
|
||||||
|
info!("criterion {}, documents group of size {}", ci, group.len());
|
||||||
|
|
||||||
// if this group does not overlap with the requested range,
|
// if this group does not overlap with the requested range,
|
||||||
// push it without sorting and splitting it
|
// push it without sorting and splitting it
|
||||||
if documents_seen + group.len() < distinct_raw_offset {
|
if documents_seen + group.len() < distinct_raw_offset {
|
||||||
@ -241,9 +246,12 @@ where D: Deref<Target=DB>,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
|
let (elapsed, _) = measure_time(|| {
|
||||||
|
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
|
||||||
|
});
|
||||||
|
info!("criterion {} sort took {}", ci, elapsed);
|
||||||
|
|
||||||
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, view)) {
|
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
|
||||||
// we must compute the real distinguished len of this sub-group
|
// we must compute the real distinguished len of this sub-group
|
||||||
for document in group.iter() {
|
for document in group.iter() {
|
||||||
let filter_accepted = match &self.inner.filter {
|
let filter_accepted = match &self.inner.filter {
|
||||||
@ -302,7 +310,7 @@ where D: Deref<Target=DB>,
|
|||||||
};
|
};
|
||||||
|
|
||||||
if distinct_accepted && seen.len() > range.start {
|
if distinct_accepted && seen.len() > range.start {
|
||||||
out_documents.push(document);
|
out_documents.push(Document::from_raw(&document));
|
||||||
if out_documents.len() == range.len() { break }
|
if out_documents.len() == range.len() { break }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user