mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 13:40:31 +01:00
Merge pull request #170 from meilisearch/async-word-index-fetching-with-rayon-scope
Async word index fetching with rayon scope
This commit is contained in:
commit
bae86e978e
@ -6,6 +6,7 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.3.1"
|
||||
crossbeam-channel = "0.3.9"
|
||||
deunicode = "1.0.0"
|
||||
hashbrown = "0.2.2"
|
||||
lazy_static = "1.2.0"
|
||||
@ -14,7 +15,7 @@ meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
||||
rayon = "1.0.3"
|
||||
sdset = "0.3.2"
|
||||
serde = { version = "1.0.88", features = ["derive"] }
|
||||
slice-group-by = "0.2.4"
|
||||
slice-group-by = "0.2.6"
|
||||
zerocopy = "0.2.2"
|
||||
|
||||
[dependencies.fst]
|
||||
|
@ -21,7 +21,7 @@ fn custom_log10(n: u8) -> f32 {
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
|
||||
let mut number_words = 0;
|
||||
let mut number_words: usize = 0;
|
||||
let mut sum_typos = 0.0;
|
||||
let mut index = 0;
|
||||
|
||||
|
@ -1,22 +1,24 @@
|
||||
#![feature(checked_duration_since)]
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use] extern crate assert_matches;
|
||||
|
||||
mod automaton;
|
||||
mod distinct_map;
|
||||
mod query_builder;
|
||||
mod query_enhancer;
|
||||
mod raw_document;
|
||||
mod reordered_attrs;
|
||||
mod store;
|
||||
pub mod criterion;
|
||||
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
use sdset::SetBuf;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use slice_group_by::GroupBy;
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
|
||||
use self::raw_document::raw_documents_from;
|
||||
|
||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
|
||||
pub use self::raw_document::RawDocument;
|
||||
pub use self::store::Store;
|
||||
|
||||
/// Represent an internally generated document unique identifier.
|
||||
@ -130,132 +132,6 @@ impl Document {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RawDocument {
|
||||
pub id: DocumentId,
|
||||
pub matches: SharedMatches,
|
||||
pub highlights: Vec<Highlight>,
|
||||
}
|
||||
|
||||
impl RawDocument {
|
||||
fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
|
||||
RawDocument { id, matches, highlights }
|
||||
}
|
||||
|
||||
pub fn query_index(&self) -> &[u32] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn distance(&self) -> &[u8] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn attribute(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn word_index(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn is_exact(&self) -> &[bool] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for RawDocument {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("RawDocument")
|
||||
.field("id", &self.id)
|
||||
.field("query_index", &self.query_index())
|
||||
.field("distance", &self.distance())
|
||||
.field("attribute", &self.attribute())
|
||||
.field("word_index", &self.word_index())
|
||||
.field("is_exact", &self.is_exact())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec<RawDocument> {
|
||||
let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
|
||||
let mut matches2 = Matches::with_capacity(matches.len());
|
||||
|
||||
for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) {
|
||||
let document_id = group[0].0;
|
||||
let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
|
||||
let end = start + group.len();
|
||||
|
||||
let highlights = group.iter().map(|(_, _, h)| *h).collect();
|
||||
docs_ranges.push((document_id, Range { start, end }, highlights));
|
||||
|
||||
matches2.extend_from_slice(group);
|
||||
}
|
||||
|
||||
let matches = Arc::new(matches2);
|
||||
docs_ranges.into_iter().map(|(i, range, highlights)| {
|
||||
let matches = SharedMatches { range, matches: matches.clone() };
|
||||
RawDocument::new(i, matches, highlights)
|
||||
}).collect()
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
struct Range {
|
||||
start: usize,
|
||||
end: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SharedMatches {
|
||||
range: Range,
|
||||
matches: Arc<Matches>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Matches {
|
||||
query_index: Vec<u32>,
|
||||
distance: Vec<u8>,
|
||||
attribute: Vec<u16>,
|
||||
word_index: Vec<u16>,
|
||||
is_exact: Vec<bool>,
|
||||
}
|
||||
|
||||
impl Matches {
|
||||
fn with_capacity(cap: usize) -> Matches {
|
||||
Matches {
|
||||
query_index: Vec::with_capacity(cap),
|
||||
distance: Vec::with_capacity(cap),
|
||||
attribute: Vec::with_capacity(cap),
|
||||
word_index: Vec::with_capacity(cap),
|
||||
is_exact: Vec::with_capacity(cap),
|
||||
}
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) {
|
||||
for (_, match_, _) in matches {
|
||||
self.query_index.push(match_.query_index);
|
||||
self.distance.push(match_.distance);
|
||||
self.attribute.push(match_.attribute);
|
||||
self.word_index.push(match_.word_index);
|
||||
self.is_exact.push(match_.is_exact);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
File diff suppressed because it is too large
Load Diff
398
meilidb-core/src/query_enhancer.rs
Normal file
398
meilidb-core/src/query_enhancer.rs
Normal file
@ -0,0 +1,398 @@
|
||||
use std::ops::Range;
|
||||
use std::cmp::Ordering::{Less, Greater, Equal};
|
||||
|
||||
/// Return `true` if the specified range can accept the given replacements words.
|
||||
/// Returns `false` if the replacements words are already present in the original query
|
||||
/// or if there is fewer replacement words than the range to replace.
|
||||
//
|
||||
//
|
||||
// ## Ignored because already present in original
|
||||
//
|
||||
// new york city subway
|
||||
// -------- ^^^^
|
||||
// / \
|
||||
// [new york city]
|
||||
//
|
||||
//
|
||||
// ## Ignored because smaller than the original
|
||||
//
|
||||
// new york city subway
|
||||
// -------------
|
||||
// \ /
|
||||
// [new york]
|
||||
//
|
||||
//
|
||||
// ## Accepted because bigger than the original
|
||||
//
|
||||
// NYC subway
|
||||
// ---
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// [new york city]
|
||||
//
|
||||
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
{
|
||||
if words.len() <= range.len() {
|
||||
// there is fewer or equal replacement words
|
||||
// than there is already in the replaced range
|
||||
return false
|
||||
}
|
||||
|
||||
// retrieve the part to rewrite but with the length
|
||||
// of the replacement part
|
||||
let original = query.iter().skip(range.start).take(words.len());
|
||||
|
||||
// check if the original query doesn't already contain
|
||||
// the replacement words
|
||||
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
|
||||
}
|
||||
|
||||
type Origin = usize;
|
||||
type RealLength = usize;
|
||||
|
||||
struct FakeIntervalTree {
|
||||
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl FakeIntervalTree {
|
||||
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
|
||||
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
|
||||
FakeIntervalTree { intervals }
|
||||
}
|
||||
|
||||
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
||||
let element = self.intervals.binary_search_by(|(r, _)| {
|
||||
if point >= r.start {
|
||||
if point < r.end { Equal } else { Less }
|
||||
} else { Greater }
|
||||
});
|
||||
|
||||
let n = match element { Ok(n) => n, Err(n) => n };
|
||||
|
||||
match self.intervals.get(n) {
|
||||
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct QueryEnhancerBuilder<'a, S> {
|
||||
query: &'a [S],
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
||||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
||||
// we initialize origins query indices based on their positions
|
||||
let origins: Vec<_> = (0..query.len() + 1).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
|
||||
|
||||
QueryEnhancerBuilder { query, origins, real_to_origin }
|
||||
}
|
||||
|
||||
/// Update the final real to origin query indices mapping.
|
||||
///
|
||||
/// `range` is the original words range that this `replacement` words replace
|
||||
/// and `real` is the first real query index of these replacement words.
|
||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
||||
where T: AsRef<str>,
|
||||
{
|
||||
// check if the range of original words
|
||||
// can be rewritten with the replacement words
|
||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
||||
|
||||
// this range can be replaced so we need to
|
||||
// modify the origins accordingly
|
||||
let offset = replacement.len() - range.len();
|
||||
|
||||
let previous_padding = self.origins[range.end - 1];
|
||||
let current_offset = (self.origins[range.end] - 1) - previous_padding;
|
||||
let diff = offset.saturating_sub(current_offset);
|
||||
self.origins[range.end] += diff;
|
||||
|
||||
for r in &mut self.origins[range.end + 1..] {
|
||||
*r += diff;
|
||||
}
|
||||
}
|
||||
|
||||
// we need to store the real number and origins relations
|
||||
// this way it will be possible to know by how many
|
||||
// we need to pad real query indices
|
||||
let real_range = real..real + replacement.len().max(range.len());
|
||||
let real_length = replacement.len();
|
||||
self.real_to_origin.push((real_range, (range.start, real_length)));
|
||||
}
|
||||
|
||||
pub fn build(self) -> QueryEnhancer {
|
||||
QueryEnhancer {
|
||||
origins: self.origins,
|
||||
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct QueryEnhancer {
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: FakeIntervalTree,
|
||||
}
|
||||
|
||||
impl QueryEnhancer {
|
||||
/// Returns the query indices to use to replace this real query index.
|
||||
pub fn replacement(&self, real: u32) -> Range<u32> {
|
||||
let real = real as usize;
|
||||
|
||||
// query the fake interval tree with the real query index
|
||||
let (range, (origin, real_length)) =
|
||||
self.real_to_origin
|
||||
.query(real)
|
||||
.expect("real has never been declared");
|
||||
|
||||
// if `real` is the end bound of the range
|
||||
if (range.start + real_length - 1) == real {
|
||||
let mut count = range.len();
|
||||
let mut new_origin = origin;
|
||||
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
|
||||
let len = slice[1] - slice[0];
|
||||
count = count.saturating_sub(len);
|
||||
if count == 0 { new_origin = origin + i; break }
|
||||
}
|
||||
|
||||
let n = real - range.start;
|
||||
let start = self.origins[origin];
|
||||
let end = self.origins[new_origin + 1];
|
||||
let remaining = (end - start) - n;
|
||||
|
||||
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
|
||||
|
||||
} else {
|
||||
// just return the origin along with
|
||||
// the real position of the word
|
||||
let n = real as usize - range.start;
|
||||
let origin = self.origins[origin];
|
||||
|
||||
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn original_unmodified() {
|
||||
let query = ["new", "york", "city", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(2), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(3), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn simple_growing() {
|
||||
let query = ["new", "york", "subway"];
|
||||
// 0 1 2
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 3, &["new", "york", "city"]);
|
||||
// ^ 3 4 5
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(1), 1..3); // york
|
||||
assert_eq!(enhancer.replacement(2), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(3), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(4), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(5), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn same_place_growings() {
|
||||
let query = ["NY", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NY = new york
|
||||
builder.declare(0..1, 2, &["new", "york"]);
|
||||
// ^ 2 3
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// NY = NYC
|
||||
builder.declare(0..1, 7, &["NYC"]);
|
||||
// ^ 7
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 8, &["new", "york", "city"]);
|
||||
// ^ 8 9 10
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(1..2, 11, &["underground", "train"]);
|
||||
// ^ 11 12
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..3); // NY
|
||||
assert_eq!(enhancer.replacement(1), 3..5); // subway
|
||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(3), 1..3); // york
|
||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(7), 0..3); // NYC
|
||||
assert_eq!(enhancer.replacement(8), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(9), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(10), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(11), 3..4); // underground
|
||||
assert_eq!(enhancer.replacement(12), 4..5); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bigger_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(0..1, 2, &["new", "york", "city"]);
|
||||
// ^ 2 3 4
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..3); // NYC
|
||||
assert_eq!(enhancer.replacement(1), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(3), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn middle_query_growing() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..6); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn end_query_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(1..2, 2, &["underground", "train"]);
|
||||
// ^ 2 3
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // NYC
|
||||
assert_eq!(enhancer.replacement(1), 1..3); // subway
|
||||
assert_eq!(enhancer.replacement(2), 1..2); // underground
|
||||
assert_eq!(enhancer.replacement(3), 2..3); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_probable_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
// great awesome = good
|
||||
builder.declare(0..2, 9, &["good"]);
|
||||
// ^ 9
|
||||
|
||||
// awesome NYC = NY
|
||||
builder.declare(1..3, 10, &["NY"]);
|
||||
// ^^ 10
|
||||
|
||||
// NYC subway = metro
|
||||
builder.declare(2..4, 11, &["metro"]);
|
||||
// ^^ 11
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
||||
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
||||
}
|
||||
}
|
141
meilidb-core/src/raw_document.rs
Normal file
141
meilidb-core/src/raw_document.rs
Normal file
@ -0,0 +1,141 @@
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::{TmpMatch, DocumentId, Highlight};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RawDocument {
|
||||
pub id: DocumentId,
|
||||
pub matches: SharedMatches,
|
||||
pub highlights: Vec<Highlight>,
|
||||
}
|
||||
|
||||
impl RawDocument {
|
||||
fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
|
||||
RawDocument { id, matches, highlights }
|
||||
}
|
||||
|
||||
pub fn query_index(&self) -> &[u32] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn distance(&self) -> &[u8] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn attribute(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn word_index(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn is_exact(&self) -> &[bool] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for RawDocument {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.write_str("RawDocument {\r\n")?;
|
||||
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
|
||||
f.write_str("}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn raw_documents_from(
|
||||
matches: SetBuf<(DocumentId, TmpMatch)>,
|
||||
highlights: SetBuf<(DocumentId, Highlight)>,
|
||||
) -> Vec<RawDocument>
|
||||
{
|
||||
let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
|
||||
let mut matches2 = Matches::with_capacity(matches.len());
|
||||
|
||||
let matches = matches.linear_group_by_key(|(id, _)| *id);
|
||||
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
|
||||
|
||||
for (mgroup, hgroup) in matches.zip(highlights) {
|
||||
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
|
||||
|
||||
let document_id = mgroup[0].0;
|
||||
let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
|
||||
let end = start + mgroup.len();
|
||||
|
||||
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
|
||||
docs_ranges.push((document_id, Range { start, end }, highlights));
|
||||
|
||||
matches2.extend_from_slice(mgroup);
|
||||
}
|
||||
|
||||
let matches = Arc::new(matches2);
|
||||
docs_ranges.into_iter().map(|(id, range, highlights)| {
|
||||
let matches = SharedMatches { range, matches: matches.clone() };
|
||||
RawDocument::new(id, matches, highlights)
|
||||
}).collect()
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
struct Range {
|
||||
start: usize,
|
||||
end: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SharedMatches {
|
||||
range: Range,
|
||||
matches: Arc<Matches>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Matches {
|
||||
query_index: Vec<u32>,
|
||||
distance: Vec<u8>,
|
||||
attribute: Vec<u16>,
|
||||
word_index: Vec<u16>,
|
||||
is_exact: Vec<bool>,
|
||||
}
|
||||
|
||||
impl Matches {
|
||||
fn with_capacity(cap: usize) -> Matches {
|
||||
Matches {
|
||||
query_index: Vec::with_capacity(cap),
|
||||
distance: Vec::with_capacity(cap),
|
||||
attribute: Vec::with_capacity(cap),
|
||||
word_index: Vec::with_capacity(cap),
|
||||
is_exact: Vec::with_capacity(cap),
|
||||
}
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
|
||||
for (_, match_) in matches {
|
||||
self.query_index.push(match_.query_index);
|
||||
self.distance.push(match_.distance);
|
||||
self.attribute.push(match_.attribute);
|
||||
self.word_index.push(match_.word_index);
|
||||
self.is_exact.push(match_.is_exact);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
#[derive(Default)]
|
||||
#[derive(Default, Clone)]
|
||||
pub struct ReorderedAttrs {
|
||||
count: usize,
|
||||
reorders: Vec<Option<u16>>,
|
||||
|
@ -21,10 +21,10 @@ impl<'a> SynonymsAddition<'a> {
|
||||
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: Iterator<Item=T>,
|
||||
I: IntoIterator<Item=T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
|
||||
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
|
||||
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
|
||||
}
|
||||
|
||||
@ -73,7 +73,7 @@ impl<'a> SynonymsAddition<'a> {
|
||||
|
||||
// update the "consistent" view of the Index
|
||||
let words = main.words_set()?.unwrap_or_default();
|
||||
let ranked_map = lease_inner.ranked_map.clone();;
|
||||
let ranked_map = lease_inner.ranked_map.clone();
|
||||
let schema = lease_inner.schema.clone();
|
||||
let raw = lease_inner.raw.clone();
|
||||
lease_inner.raw.compact();
|
||||
|
@ -14,10 +14,12 @@ csv = "1.0.7"
|
||||
diskus = "0.5.0"
|
||||
env_logger = "0.6.1"
|
||||
jemallocator = "0.1.9"
|
||||
linked-hash-map = "0.5.2"
|
||||
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
||||
quickcheck = "0.8.2"
|
||||
rand = "0.6.5"
|
||||
rand_xorshift = "0.1.1"
|
||||
rustyline = { version = "5.0.0", default-features = false }
|
||||
serde = { version = "1.0.91" , features = ["derive"] }
|
||||
serde_json = "1.0.39"
|
||||
structopt = "0.2.15"
|
||||
|
@ -31,9 +31,13 @@ pub struct Opt {
|
||||
#[structopt(long = "schema", parse(from_os_str))]
|
||||
pub schema_path: PathBuf,
|
||||
|
||||
/// The file with the synonyms.
|
||||
#[structopt(long = "synonyms", parse(from_os_str))]
|
||||
pub synonyms: Option<PathBuf>,
|
||||
|
||||
/// The path to the list of stop words (one by line).
|
||||
#[structopt(long = "stop-words", parse(from_os_str))]
|
||||
pub stop_words_path: Option<PathBuf>,
|
||||
pub stop_words: Option<PathBuf>,
|
||||
|
||||
#[structopt(long = "update-group-size")]
|
||||
pub update_group_size: Option<usize>,
|
||||
@ -45,12 +49,40 @@ struct Document<'a> (
|
||||
HashMap<Cow<'a, str>, Cow<'a, str>>
|
||||
);
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum Synonym {
|
||||
OneWay(SynonymOneWay),
|
||||
MultiWay { synonyms: Vec<String> },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct SynonymOneWay {
|
||||
pub search_terms: String,
|
||||
pub synonyms: Synonyms,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum Synonyms {
|
||||
Multiple(Vec<String>),
|
||||
Single(String),
|
||||
}
|
||||
|
||||
fn read_synomys(path: &Path) -> Result<Vec<Synonym>, Box<dyn Error>> {
|
||||
let file = File::open(path)?;
|
||||
let synonyms = serde_json::from_reader(file)?;
|
||||
Ok(synonyms)
|
||||
}
|
||||
|
||||
fn index(
|
||||
schema: Schema,
|
||||
database_path: &Path,
|
||||
csv_data_path: &Path,
|
||||
update_group_size: Option<usize>,
|
||||
stop_words: &HashSet<String>,
|
||||
synonyms: Vec<Synonym>,
|
||||
) -> Result<Database, Box<dyn Error>>
|
||||
{
|
||||
let database = Database::start_default(database_path)?;
|
||||
@ -62,6 +94,28 @@ fn index(
|
||||
|
||||
let index = database.create_index("test", schema.clone())?;
|
||||
|
||||
let mut synonyms_adder = index.synonyms_addition();
|
||||
for synonym in synonyms {
|
||||
match synonym {
|
||||
Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => {
|
||||
let alternatives = match synonyms {
|
||||
Synonyms::Multiple(alternatives) => alternatives,
|
||||
Synonyms::Single(alternative) => vec![alternative],
|
||||
};
|
||||
synonyms_adder.add_synonym(search_terms, alternatives);
|
||||
},
|
||||
Synonym::MultiWay { mut synonyms } => {
|
||||
for _ in 0..synonyms.len() {
|
||||
if let Some((synonym, alternatives)) = synonyms.split_first() {
|
||||
synonyms_adder.add_synonym(synonym, alternatives);
|
||||
}
|
||||
synonyms.rotate_left(1);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
synonyms_adder.finalize()?;
|
||||
|
||||
let mut rdr = csv::Reader::from_path(csv_data_path)?;
|
||||
let mut raw_record = csv::StringRecord::new();
|
||||
let headers = rdr.headers()?.clone();
|
||||
@ -133,13 +187,25 @@ fn main() -> Result<(), Box<dyn Error>> {
|
||||
Schema::from_toml(file)?
|
||||
};
|
||||
|
||||
let stop_words = match opt.stop_words_path {
|
||||
let stop_words = match opt.stop_words {
|
||||
Some(ref path) => retrieve_stop_words(path)?,
|
||||
None => HashSet::new(),
|
||||
};
|
||||
|
||||
let synonyms = match opt.synonyms {
|
||||
Some(ref path) => read_synomys(path)?,
|
||||
None => Vec::new(),
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);
|
||||
let result = index(
|
||||
schema,
|
||||
&opt.database_path,
|
||||
&opt.csv_data_path,
|
||||
opt.update_group_size,
|
||||
&stop_words,
|
||||
synonyms,
|
||||
);
|
||||
|
||||
if let Err(e) = result {
|
||||
return Err(e.into())
|
||||
|
@ -2,17 +2,19 @@
|
||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
|
||||
use std::collections::btree_map::{BTreeMap, Entry};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::iter::FromIterator;
|
||||
use std::io::{self, Write};
|
||||
use std::time::{Instant, Duration};
|
||||
use std::path::PathBuf;
|
||||
use std::collections::HashSet;
|
||||
use std::error::Error;
|
||||
use std::io::{self, Write};
|
||||
use std::iter::FromIterator;
|
||||
use std::path::PathBuf;
|
||||
use std::time::{Instant, Duration};
|
||||
|
||||
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
use rustyline::{Editor, Config};
|
||||
use structopt::StructOpt;
|
||||
use meilidb_core::Highlight;
|
||||
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
|
||||
|
||||
use meilidb_core::Highlight;
|
||||
use meilidb_data::Database;
|
||||
use meilidb_schema::SchemaAttr;
|
||||
|
||||
@ -22,6 +24,9 @@ pub struct Opt {
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub database_path: PathBuf,
|
||||
|
||||
#[structopt(long = "fetch-timeout-ms")]
|
||||
pub fetch_timeout_ms: Option<u64>,
|
||||
|
||||
/// Fields that must be displayed.
|
||||
pub displayed_fields: Vec<String>,
|
||||
|
||||
@ -34,7 +39,7 @@ pub struct Opt {
|
||||
pub char_context: usize,
|
||||
}
|
||||
|
||||
type Document = HashMap<String, String>;
|
||||
type Document = LinkedHashMap<String, String>;
|
||||
|
||||
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
|
||||
let mut stdout = StandardStream::stdout(ColorChoice::Always);
|
||||
@ -140,9 +145,6 @@ fn main() -> Result<(), Box<dyn Error>> {
|
||||
let start = Instant::now();
|
||||
let database = Database::start_default(&opt.database_path)?;
|
||||
|
||||
let mut buffer = String::new();
|
||||
let input = io::stdin();
|
||||
|
||||
let index = database.open_index("test")?.unwrap();
|
||||
let schema = index.schema();
|
||||
|
||||
@ -151,65 +153,77 @@ fn main() -> Result<(), Box<dyn Error>> {
|
||||
let fields = opt.displayed_fields.iter().map(String::as_str);
|
||||
let fields = HashSet::from_iter(fields);
|
||||
|
||||
loop {
|
||||
print!("Searching for: ");
|
||||
io::stdout().flush()?;
|
||||
let config = Config::builder().auto_add_history(true).build();
|
||||
let mut readline = Editor::<()>::with_config(config);
|
||||
let _ = readline.load_history("query-history.txt");
|
||||
|
||||
if input.read_line(&mut buffer)? == 0 { break }
|
||||
let query = buffer.trim_end_matches('\n');
|
||||
for result in readline.iter("Searching for: ") {
|
||||
match result {
|
||||
Ok(query) => {
|
||||
let start_total = Instant::now();
|
||||
|
||||
let start_total = Instant::now();
|
||||
let builder = match opt.fetch_timeout_ms {
|
||||
Some(timeout_ms) => {
|
||||
let timeout = Duration::from_millis(timeout_ms);
|
||||
index.query_builder().with_fetch_timeout(timeout)
|
||||
},
|
||||
None => index.query_builder(),
|
||||
};
|
||||
let documents = builder.query(&query, 0..opt.number_results)?;
|
||||
|
||||
let builder = index.query_builder();
|
||||
let documents = builder.query(query, 0..opt.number_results)?;
|
||||
let mut retrieve_duration = Duration::default();
|
||||
|
||||
let mut retrieve_duration = Duration::default();
|
||||
let number_of_documents = documents.len();
|
||||
for mut doc in documents {
|
||||
|
||||
let number_of_documents = documents.len();
|
||||
for mut doc in documents {
|
||||
doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
|
||||
|
||||
doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
|
||||
let start_retrieve = Instant::now();
|
||||
let result = index.document::<Document>(Some(&fields), doc.id);
|
||||
retrieve_duration += start_retrieve.elapsed();
|
||||
|
||||
let start_retrieve = Instant::now();
|
||||
let result = index.document::<Document>(Some(&fields), doc.id);
|
||||
retrieve_duration += start_retrieve.elapsed();
|
||||
match result {
|
||||
Ok(Some(document)) => {
|
||||
for (name, text) in document {
|
||||
print!("{}: ", name);
|
||||
|
||||
match result {
|
||||
Ok(Some(document)) => {
|
||||
for (name, text) in document {
|
||||
print!("{}: ", name);
|
||||
|
||||
let attr = schema.attribute(&name).unwrap();
|
||||
let highlights = doc.highlights.iter()
|
||||
.filter(|m| SchemaAttr::new(m.attribute) == attr)
|
||||
.cloned();
|
||||
let (text, highlights) = crop_text(&text, highlights, opt.char_context);
|
||||
let areas = create_highlight_areas(&text, &highlights);
|
||||
display_highlights(&text, &areas)?;
|
||||
println!();
|
||||
let attr = schema.attribute(&name).unwrap();
|
||||
let highlights = doc.highlights.iter()
|
||||
.filter(|m| SchemaAttr::new(m.attribute) == attr)
|
||||
.cloned();
|
||||
let (text, highlights) = crop_text(&text, highlights, opt.char_context);
|
||||
let areas = create_highlight_areas(&text, &highlights);
|
||||
display_highlights(&text, &areas)?;
|
||||
println!();
|
||||
}
|
||||
},
|
||||
Ok(None) => eprintln!("missing document"),
|
||||
Err(e) => eprintln!("{}", e),
|
||||
}
|
||||
},
|
||||
Ok(None) => eprintln!("missing document"),
|
||||
Err(e) => eprintln!("{}", e),
|
||||
|
||||
let mut matching_attributes = HashSet::new();
|
||||
for highlight in doc.highlights {
|
||||
let attr = SchemaAttr::new(highlight.attribute);
|
||||
let name = schema.attribute_name(attr);
|
||||
matching_attributes.insert(name);
|
||||
}
|
||||
|
||||
let matching_attributes = Vec::from_iter(matching_attributes);
|
||||
println!("matching in: {:?}", matching_attributes);
|
||||
|
||||
println!();
|
||||
}
|
||||
|
||||
eprintln!("document field retrieve took {:.2?}", retrieve_duration);
|
||||
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
|
||||
},
|
||||
Err(err) => {
|
||||
println!("Error: {:?}", err);
|
||||
break
|
||||
}
|
||||
|
||||
let mut matching_attributes = HashSet::new();
|
||||
for highlight in doc.highlights {
|
||||
let attr = SchemaAttr::new(highlight.attribute);
|
||||
let name = schema.attribute_name(attr);
|
||||
matching_attributes.insert(name);
|
||||
}
|
||||
|
||||
let matching_attributes = Vec::from_iter(matching_attributes);
|
||||
println!("matching in: {:?}", matching_attributes);
|
||||
|
||||
println!();
|
||||
}
|
||||
|
||||
eprintln!("document field retrieve took {:.2?}", retrieve_duration);
|
||||
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
readline.save_history("query-history.txt").unwrap();
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user