Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-23 05:14:27 +01:00)

Merge pull request #170 from meilisearch/async-word-index-fetching-with-rayon-scope

Async word index fetching with rayon scope

Merge commit: bae86e978e
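The file where the async fetching itself happens (the query builder) is the large diff suppressed further down, so the technique is not visible here. As a hedged illustration only, not the PR's actual code: fetching word indexes concurrently with `rayon::scope` plus the newly added `crossbeam-channel` dependency could look like this (`expensive_fetch` is a hypothetical stand-in for the real per-word lookup):

    use crossbeam_channel::unbounded;

    // Spawn one scoped task per word; each task sends its result over a
    // channel, and the scope guarantees all tasks finish before we collect.
    fn fetch_word_indexes(words: &[String]) -> Vec<(String, Vec<u32>)> {
        let (sender, receiver) = unbounded();

        rayon::scope(move |scope| {
            for word in words {
                let sender = sender.clone();
                scope.spawn(move |_| {
                    let index = expensive_fetch(word);
                    let _ = sender.send((word.clone(), index));
                });
            }
            // the original `sender` is dropped when this closure returns, so
            // the receiver iterator below terminates once every task is done
        });

        // results arrive in completion order, not input order
        receiver.into_iter().collect()
    }

    // hypothetical stand-in for a blocking word-index lookup
    fn expensive_fetch(word: &str) -> Vec<u32> {
        vec![word.len() as u32]
    }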
meilidb-core/Cargo.toml:

@@ -6,6 +6,7 @@ edition = "2018"
 [dependencies]
 byteorder = "1.3.1"
+crossbeam-channel = "0.3.9"
 deunicode = "1.0.0"
 hashbrown = "0.2.2"
 lazy_static = "1.2.0"
@@ -14,7 +15,7 @@ meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
 rayon = "1.0.3"
 sdset = "0.3.2"
 serde = { version = "1.0.88", features = ["derive"] }
-slice-group-by = "0.2.4"
+slice-group-by = "0.2.6"
 zerocopy = "0.2.2"

 [dependencies.fst]
meilidb-core criterion (sum of matches typos):

@@ -21,7 +21,7 @@ fn custom_log10(n: u8) -> f32 {
 #[inline]
 fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
-    let mut number_words = 0;
+    let mut number_words: usize = 0;
     let mut sum_typos = 0.0;
     let mut index = 0;
meilidb-core/src/lib.rs:

@@ -1,22 +1,24 @@
+#![feature(checked_duration_since)]
+
 #[cfg(test)]
 #[macro_use] extern crate assert_matches;

 mod automaton;
 mod distinct_map;
 mod query_builder;
+mod query_enhancer;
+mod raw_document;
 mod reordered_attrs;
 mod store;
 pub mod criterion;

-use std::fmt;
-use std::sync::Arc;
-
-use sdset::SetBuf;
 use serde::{Serialize, Deserialize};
-use slice_group_by::GroupBy;
 use zerocopy::{AsBytes, FromBytes};

+use self::raw_document::raw_documents_from;
+
 pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
+pub use self::raw_document::RawDocument;
 pub use self::store::Store;

 /// Represent an internally generated document unique identifier.
@@ -130,132 +132,6 @@ impl Document {
     }
 }

-#[derive(Clone)]
-pub struct RawDocument {
-    pub id: DocumentId,
-    pub matches: SharedMatches,
-    pub highlights: Vec<Highlight>,
-}
-
-impl RawDocument {
-    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
-        RawDocument { id, matches, highlights }
-    }
-
-    pub fn query_index(&self) -> &[u32] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn distance(&self) -> &[u8] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn attribute(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn word_index(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn is_exact(&self) -> &[bool] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
-    }
-}
-
-impl fmt::Debug for RawDocument {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("RawDocument")
-            .field("id", &self.id)
-            .field("query_index", &self.query_index())
-            .field("distance", &self.distance())
-            .field("attribute", &self.attribute())
-            .field("word_index", &self.word_index())
-            .field("is_exact", &self.is_exact())
-            .finish()
-    }
-}
-
-fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec<RawDocument> {
-    let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
-    let mut matches2 = Matches::with_capacity(matches.len());
-
-    for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) {
-        let document_id = group[0].0;
-        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
-        let end = start + group.len();
-
-        let highlights = group.iter().map(|(_, _, h)| *h).collect();
-        docs_ranges.push((document_id, Range { start, end }, highlights));
-
-        matches2.extend_from_slice(group);
-    }
-
-    let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(i, range, highlights)| {
-        let matches = SharedMatches { range, matches: matches.clone() };
-        RawDocument::new(i, matches, highlights)
-    }).collect()
-}
-
-#[derive(Debug, Copy, Clone)]
-struct Range {
-    start: usize,
-    end: usize,
-}
-
-#[derive(Clone)]
-pub struct SharedMatches {
-    range: Range,
-    matches: Arc<Matches>,
-}
-
-#[derive(Clone)]
-struct Matches {
-    query_index: Vec<u32>,
-    distance: Vec<u8>,
-    attribute: Vec<u16>,
-    word_index: Vec<u16>,
-    is_exact: Vec<bool>,
-}
-
-impl Matches {
-    fn with_capacity(cap: usize) -> Matches {
-        Matches {
-            query_index: Vec::with_capacity(cap),
-            distance: Vec::with_capacity(cap),
-            attribute: Vec::with_capacity(cap),
-            word_index: Vec::with_capacity(cap),
-            is_exact: Vec::with_capacity(cap),
-        }
-    }
-
-    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) {
-        for (_, match_, _) in matches {
-            self.query_index.push(match_.query_index);
-            self.distance.push(match_.distance);
-            self.attribute.push(match_.attribute);
-            self.word_index.push(match_.word_index);
-            self.is_exact.push(match_.is_exact);
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
(The diff of one file is suppressed because it is too large.)

meilidb-core/src/query_enhancer.rs — new file, 398 lines:

@@ -0,0 +1,398 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};

/// Return `true` if the specified range can accept the given replacements words.
/// Returns `false` if the replacements words are already present in the original query
/// or if there are fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
//     new york city subway
//     -------- ^^^^
//        /       \
//     [new york city]
//
//
// ## Ignored because smaller than the original
//
//     new york city subway
//     -------------
//       \         /
//        [new york]
//
//
// ## Accepted because bigger than the original
//
//     NYC subway
//     ---
//     / \
//    /   \
//   /     \
//  /       \
// /         \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
      T: AsRef<str>,
{
    if words.len() <= range.len() {
        // there are fewer or equal replacement words
        // than there are already in the replaced range
        return false
    }

    // retrieve the part to rewrite but with the length
    // of the replacement part
    let original = query.iter().skip(range.start).take(words.len());

    // check if the original query doesn't already contain
    // the replacement words
    !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}

type Origin = usize;
type RealLength = usize;

struct FakeIntervalTree {
    intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}

impl FakeIntervalTree {
    fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
        intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
        FakeIntervalTree { intervals }
    }

    fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
        let element = self.intervals.binary_search_by(|(r, _)| {
            if point >= r.start {
                if point < r.end { Equal } else { Less }
            } else { Greater }
        });

        let n = match element { Ok(n) => n, Err(n) => n };

        match self.intervals.get(n) {
            Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
            _otherwise => None,
        }
    }
}

pub struct QueryEnhancerBuilder<'a, S> {
    query: &'a [S],
    origins: Vec<usize>,
    real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}

impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
    pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
        // we initialize origins query indices based on their positions
        let origins: Vec<_> = (0..query.len() + 1).collect();
        let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();

        QueryEnhancerBuilder { query, origins, real_to_origin }
    }

    /// Update the final real to origin query indices mapping.
    ///
    /// `range` is the original words range that this `replacement` words replace
    /// and `real` is the first real query index of these replacement words.
    pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
    where T: AsRef<str>,
    {
        // check if the range of original words
        // can be rewritten with the replacement words
        if rewrite_range_with(self.query, range.clone(), replacement) {

            // this range can be replaced so we need to
            // modify the origins accordingly
            let offset = replacement.len() - range.len();

            let previous_padding = self.origins[range.end - 1];
            let current_offset = (self.origins[range.end] - 1) - previous_padding;
            let diff = offset.saturating_sub(current_offset);
            self.origins[range.end] += diff;

            for r in &mut self.origins[range.end + 1..] {
                *r += diff;
            }
        }

        // we need to store the real number and origins relations
        // this way it will be possible to know by how many
        // we need to pad real query indices
        let real_range = real..real + replacement.len().max(range.len());
        let real_length = replacement.len();
        self.real_to_origin.push((real_range, (range.start, real_length)));
    }

    pub fn build(self) -> QueryEnhancer {
        QueryEnhancer {
            origins: self.origins,
            real_to_origin: FakeIntervalTree::new(self.real_to_origin),
        }
    }
}

pub struct QueryEnhancer {
    origins: Vec<usize>,
    real_to_origin: FakeIntervalTree,
}

impl QueryEnhancer {
    /// Returns the query indices to use to replace this real query index.
    pub fn replacement(&self, real: u32) -> Range<u32> {
        let real = real as usize;

        // query the fake interval tree with the real query index
        let (range, (origin, real_length)) =
            self.real_to_origin
                .query(real)
                .expect("real has never been declared");

        // if `real` is the end bound of the range
        if (range.start + real_length - 1) == real {
            let mut count = range.len();
            let mut new_origin = origin;
            for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
                let len = slice[1] - slice[0];
                count = count.saturating_sub(len);
                if count == 0 { new_origin = origin + i; break }
            }

            let n = real - range.start;
            let start = self.origins[origin];
            let end = self.origins[new_origin + 1];
            let remaining = (end - start) - n;

            Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }

        } else {
            // just return the origin along with
            // the real position of the word
            let n = real as usize - range.start;
            let origin = self.origins[origin];

            Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn original_unmodified() {
        let query = ["new", "york", "city", "subway"];
        //            0      1       2       3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // new york = new york city
        builder.declare(0..2, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // new
        assert_eq!(enhancer.replacement(1), 1..2); // york
        assert_eq!(enhancer.replacement(2), 2..3); // city
        assert_eq!(enhancer.replacement(3), 3..4); // subway
        assert_eq!(enhancer.replacement(4), 0..1); // new
        assert_eq!(enhancer.replacement(5), 1..2); // york
        assert_eq!(enhancer.replacement(6), 2..3); // city
    }

    #[test]
    fn simple_growing() {
        let query = ["new", "york", "subway"];
        //            0      1       2
        let mut builder = QueryEnhancerBuilder::new(&query);

        // new york = new york city
        builder.declare(0..2, 3, &["new", "york", "city"]);
        //                    ^      3       4       5

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // new
        assert_eq!(enhancer.replacement(1), 1..3); // york
        assert_eq!(enhancer.replacement(2), 3..4); // subway
        assert_eq!(enhancer.replacement(3), 0..1); // new
        assert_eq!(enhancer.replacement(4), 1..2); // york
        assert_eq!(enhancer.replacement(5), 2..3); // city
    }

    #[test]
    fn same_place_growings() {
        let query = ["NY", "subway"];
        //            0     1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NY = new york
        builder.declare(0..1, 2, &["new", "york"]);
        //                    ^      2       3

        // NY = new york city
        builder.declare(0..1, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // NY = NYC
        builder.declare(0..1, 7, &["NYC"]);
        //                    ^      7

        // NY = new york city
        builder.declare(0..1, 8, &["new", "york", "city"]);
        //                    ^      8       9       10

        // subway = underground train
        builder.declare(1..2, 11, &["underground", "train"]);
        //                    ^          11           12

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..3);  // NY
        assert_eq!(enhancer.replacement(1), 3..5);  // subway
        assert_eq!(enhancer.replacement(2), 0..1);  // new
        assert_eq!(enhancer.replacement(3), 1..3);  // york
        assert_eq!(enhancer.replacement(4), 0..1);  // new
        assert_eq!(enhancer.replacement(5), 1..2);  // york
        assert_eq!(enhancer.replacement(6), 2..3);  // city
        assert_eq!(enhancer.replacement(7), 0..3);  // NYC
        assert_eq!(enhancer.replacement(8), 0..1);  // new
        assert_eq!(enhancer.replacement(9), 1..2);  // york
        assert_eq!(enhancer.replacement(10), 2..3); // city
        assert_eq!(enhancer.replacement(11), 3..4); // underground
        assert_eq!(enhancer.replacement(12), 4..5); // train
    }

    #[test]
    fn bigger_growing() {
        let query = ["NYC", "subway"];
        //            0      1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city
        builder.declare(0..1, 2, &["new", "york", "city"]);
        //                    ^      2       3       4

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..3); // NYC
        assert_eq!(enhancer.replacement(1), 3..4); // subway
        assert_eq!(enhancer.replacement(2), 0..1); // new
        assert_eq!(enhancer.replacement(3), 1..2); // york
        assert_eq!(enhancer.replacement(4), 2..3); // city
    }

    #[test]
    fn middle_query_growing() {
        let query = ["great", "awesome", "NYC", "subway"];
        //            0        1          2      3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // great
        assert_eq!(enhancer.replacement(1), 1..2); // awesome
        assert_eq!(enhancer.replacement(2), 2..5); // NYC
        assert_eq!(enhancer.replacement(3), 5..6); // subway
        assert_eq!(enhancer.replacement(4), 2..3); // new
        assert_eq!(enhancer.replacement(5), 3..4); // york
        assert_eq!(enhancer.replacement(6), 4..5); // city
    }

    #[test]
    fn end_query_growing() {
        let query = ["NYC", "subway"];
        //            0      1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // subway = underground train
        builder.declare(1..2, 2, &["underground", "train"]);
        //                    ^          2            3

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // NYC
        assert_eq!(enhancer.replacement(1), 1..3); // subway
        assert_eq!(enhancer.replacement(2), 1..2); // underground
        assert_eq!(enhancer.replacement(3), 2..3); // train
    }

    #[test]
    fn multiple_growings() {
        let query = ["great", "awesome", "NYC", "subway"];
        //            0        1          2      3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // subway = underground train
        builder.declare(3..4, 7, &["underground", "train"]);
        //                    ^          7            8

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // great
        assert_eq!(enhancer.replacement(1), 1..2); // awesome
        assert_eq!(enhancer.replacement(2), 2..5); // NYC
        assert_eq!(enhancer.replacement(3), 5..7); // subway
        assert_eq!(enhancer.replacement(4), 2..3); // new
        assert_eq!(enhancer.replacement(5), 3..4); // york
        assert_eq!(enhancer.replacement(6), 4..5); // city
        assert_eq!(enhancer.replacement(7), 5..6); // underground
        assert_eq!(enhancer.replacement(8), 6..7); // train
    }

    #[test]
    fn multiple_probable_growings() {
        let query = ["great", "awesome", "NYC", "subway"];
        //            0        1          2      3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // subway = underground train
        builder.declare(3..4, 7, &["underground", "train"]);
        //                    ^          7            8

        // great awesome = good
        builder.declare(0..2, 9, &["good"]);
        //                    ^      9

        // awesome NYC = NY
        builder.declare(1..3, 10, &["NY"]);
        //                    ^^     10

        // NYC subway = metro
        builder.declare(2..4, 11, &["metro"]);
        //                    ^^     11

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1);  // great
        assert_eq!(enhancer.replacement(1), 1..2);  // awesome
        assert_eq!(enhancer.replacement(2), 2..5);  // NYC
        assert_eq!(enhancer.replacement(3), 5..7);  // subway
        assert_eq!(enhancer.replacement(4), 2..3);  // new
        assert_eq!(enhancer.replacement(5), 3..4);  // york
        assert_eq!(enhancer.replacement(6), 4..5);  // city
        assert_eq!(enhancer.replacement(7), 5..6);  // underground
        assert_eq!(enhancer.replacement(8), 6..7);  // train
        assert_eq!(enhancer.replacement(9), 0..2);  // good
        assert_eq!(enhancer.replacement(10), 1..5); // NY
        assert_eq!(enhancer.replacement(11), 2..5); // metro
    }
}
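In short: `declare` records that a replacement occupies real query indices starting at `real` and may stretch the output positions of the original word range, while `replacement` maps any real index back to the output range it should occupy. A condensed usage sketch, lifted directly from the `bigger_growing` test above:

    let query = ["NYC", "subway"];
    let mut builder = QueryEnhancerBuilder::new(&query);
    // "NYC" (original index 0) expands to three words at real indices 2, 3, 4
    builder.declare(0..1, 2, &["new", "york", "city"]);
    let enhancer = builder.build();

    assert_eq!(enhancer.replacement(0), 0..3); // NYC now spans three positions
    assert_eq!(enhancer.replacement(1), 3..4); // subway is shifted past the expansion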
meilidb-core/src/raw_document.rs — new file, 141 lines:

@@ -0,0 +1,141 @@
use std::sync::Arc;
use std::fmt;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{TmpMatch, DocumentId, Highlight};

#[derive(Clone)]
pub struct RawDocument {
    pub id: DocumentId,
    pub matches: SharedMatches,
    pub highlights: Vec<Highlight>,
}

impl RawDocument {
    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
        RawDocument { id, matches, highlights }
    }

    pub fn query_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
    }

    pub fn distance(&self) -> &[u8] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
    }

    pub fn attribute(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
    }

    pub fn word_index(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
    }

    pub fn is_exact(&self) -> &[bool] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
    }
}

impl fmt::Debug for RawDocument {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("RawDocument {\r\n")?;
        f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
        f.write_str("}")?;
        Ok(())
    }
}

pub fn raw_documents_from(
    matches: SetBuf<(DocumentId, TmpMatch)>,
    highlights: SetBuf<(DocumentId, Highlight)>,
) -> Vec<RawDocument>
{
    let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
    let mut matches2 = Matches::with_capacity(matches.len());

    let matches = matches.linear_group_by_key(|(id, _)| *id);
    let highlights = highlights.linear_group_by_key(|(id, _)| *id);

    for (mgroup, hgroup) in matches.zip(highlights) {
        debug_assert_eq!(mgroup[0].0, hgroup[0].0);

        let document_id = mgroup[0].0;
        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
        let end = start + mgroup.len();

        let highlights = hgroup.iter().map(|(_, h)| *h).collect();
        docs_ranges.push((document_id, Range { start, end }, highlights));

        matches2.extend_from_slice(mgroup);
    }

    let matches = Arc::new(matches2);
    docs_ranges.into_iter().map(|(id, range, highlights)| {
        let matches = SharedMatches { range, matches: matches.clone() };
        RawDocument::new(id, matches, highlights)
    }).collect()
}

#[derive(Debug, Copy, Clone)]
struct Range {
    start: usize,
    end: usize,
}

#[derive(Clone)]
pub struct SharedMatches {
    range: Range,
    matches: Arc<Matches>,
}

#[derive(Clone)]
struct Matches {
    query_index: Vec<u32>,
    distance: Vec<u8>,
    attribute: Vec<u16>,
    word_index: Vec<u16>,
    is_exact: Vec<bool>,
}

impl Matches {
    fn with_capacity(cap: usize) -> Matches {
        Matches {
            query_index: Vec::with_capacity(cap),
            distance: Vec::with_capacity(cap),
            attribute: Vec::with_capacity(cap),
            word_index: Vec::with_capacity(cap),
            is_exact: Vec::with_capacity(cap),
        }
    }

    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
        for (_, match_) in matches {
            self.query_index.push(match_.query_index);
            self.distance.push(match_.distance);
            self.attribute.push(match_.attribute);
            self.word_index.push(match_.word_index);
            self.is_exact.push(match_.is_exact);
        }
    }
}
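The layout above is a structure-of-arrays: one shared `Matches` holds the match columns for every document, and each `RawDocument` stores only a `Range` into them, so cloning a `RawDocument` never copies match data. A minimal standalone illustration of the idea (simplified types, not the crate's API):

    use std::ops::Range;
    use std::sync::Arc;

    // simplified stand-ins for Matches / SharedMatches
    struct Columns { query_index: Vec<u32> }

    #[derive(Clone)]
    struct DocView { range: Range<usize>, columns: Arc<Columns> }

    impl DocView {
        fn query_index(&self) -> &[u32] {
            &self.columns.query_index[self.range.clone()]
        }
    }

    fn main() {
        let columns = Arc::new(Columns { query_index: vec![0, 1, 0, 2] });
        let a = DocView { range: 0..2, columns: columns.clone() };
        let b = DocView { range: 2..4, columns };
        assert_eq!(a.query_index(), &[0, 1]);
        assert_eq!(b.query_index(), &[0, 2]); // both views share one allocation
    }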
meilidb-core/src/reordered_attrs.rs:

@@ -1,4 +1,4 @@
-#[derive(Default)]
+#[derive(Default, Clone)]
 pub struct ReorderedAttrs {
     count: usize,
     reorders: Vec<Option<u16>>,
SynonymsAddition:

@@ -21,10 +21,10 @@ impl<'a> SynonymsAddition<'a> {
     pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
     where S: AsRef<str>,
           T: AsRef<str>,
-          I: Iterator<Item=T>,
+          I: IntoIterator<Item=T>,
     {
         let synonym = normalize_str(synonym.as_ref());
-        let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
+        let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
         self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
     }
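Loosening the bound from `Iterator` to `IntoIterator` lets callers pass collections directly; the method now calls `.into_iter()` itself. A hypothetical call site:

    // a Vec (or a slice reference, as in the MultiWay loop further down) now works as-is
    synonyms_addition.add_synonym("NY", vec!["new york", "new york city"]);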
@@ -73,7 +73,7 @@ impl<'a> SynonymsAddition<'a> {
     // update the "consistent" view of the Index
     let words = main.words_set()?.unwrap_or_default();
-    let ranked_map = lease_inner.ranked_map.clone();;
+    let ranked_map = lease_inner.ranked_map.clone();
     let schema = lease_inner.schema.clone();
     let raw = lease_inner.raw.clone();
     lease_inner.raw.compact();
Examples crate Cargo.toml:

@@ -14,10 +14,12 @@ csv = "1.0.7"
 diskus = "0.5.0"
 env_logger = "0.6.1"
 jemallocator = "0.1.9"
+linked-hash-map = "0.5.2"
 meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
 quickcheck = "0.8.2"
 rand = "0.6.5"
 rand_xorshift = "0.1.1"
+rustyline = { version = "5.0.0", default-features = false }
 serde = { version = "1.0.91" , features = ["derive"] }
 serde_json = "1.0.39"
 structopt = "0.2.15"
Indexing example, command-line options:

@@ -31,9 +31,13 @@ pub struct Opt {
     #[structopt(long = "schema", parse(from_os_str))]
     pub schema_path: PathBuf,

+    /// The file with the synonyms.
+    #[structopt(long = "synonyms", parse(from_os_str))]
+    pub synonyms: Option<PathBuf>,
+
     /// The path to the list of stop words (one by line).
     #[structopt(long = "stop-words", parse(from_os_str))]
-    pub stop_words_path: Option<PathBuf>,
+    pub stop_words: Option<PathBuf>,

     #[structopt(long = "update-group-size")]
     pub update_group_size: Option<usize>,
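On the command line this adds an optional `--synonyms <path>` flag next to the (renamed) `--stop-words <path>` flag. A hypothetical invocation, with the example name and file paths made up for illustration:

    cargo run --example create-index -- \
        --schema schema.toml \
        --synonyms synonyms.json \
        --stop-words stop-words.txt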
@@ -45,12 +49,40 @@ struct Document<'a> (
     HashMap<Cow<'a, str>, Cow<'a, str>>
 );

+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum Synonym {
+    OneWay(SynonymOneWay),
+    MultiWay { synonyms: Vec<String> },
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct SynonymOneWay {
+    pub search_terms: String,
+    pub synonyms: Synonyms,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum Synonyms {
+    Multiple(Vec<String>),
+    Single(String),
+}
+
+fn read_synomys(path: &Path) -> Result<Vec<Synonym>, Box<dyn Error>> {
+    let file = File::open(path)?;
+    let synonyms = serde_json::from_reader(file)?;
+    Ok(synonyms)
+}
+
 fn index(
     schema: Schema,
     database_path: &Path,
     csv_data_path: &Path,
     update_group_size: Option<usize>,
     stop_words: &HashSet<String>,
+    synonyms: Vec<Synonym>,
 ) -> Result<Database, Box<dyn Error>>
 {
     let database = Database::start_default(database_path)?;
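Because `Synonym` and `Synonyms` are `untagged` and `SynonymOneWay` is `camelCase`, the synonyms file is a JSON array mixing one-way and multi-way entries. A sketch of input these derives accept (example data only):

    let json = r#"[
        { "searchTerms": "NY", "synonyms": ["new york", "new york city"] },
        { "searchTerms": "subway", "synonyms": "underground train" },
        { "synonyms": ["NYC", "new york city", "big apple"] }
    ]"#;
    let synonyms: Vec<Synonym> = serde_json::from_str(json)?;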
@@ -62,6 +94,28 @@ fn index(
     let index = database.create_index("test", schema.clone())?;

+    let mut synonyms_adder = index.synonyms_addition();
+    for synonym in synonyms {
+        match synonym {
+            Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => {
+                let alternatives = match synonyms {
+                    Synonyms::Multiple(alternatives) => alternatives,
+                    Synonyms::Single(alternative) => vec![alternative],
+                };
+                synonyms_adder.add_synonym(search_terms, alternatives);
+            },
+            Synonym::MultiWay { mut synonyms } => {
+                for _ in 0..synonyms.len() {
+                    if let Some((synonym, alternatives)) = synonyms.split_first() {
+                        synonyms_adder.add_synonym(synonym, alternatives);
+                    }
+                    synonyms.rotate_left(1);
+                }
+            },
+        }
+    }
+    synonyms_adder.finalize()?;

     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();
     let headers = rdr.headers()?.clone();
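The `MultiWay` arm makes every word of the group a synonym of all the others: each pass takes the current first element as the search term and the rest as its alternatives, then `rotate_left(1)` brings the next word to the front. For `synonyms = ["NYC", "new york", "big apple"]` the three passes declare:

    NYC       -> ["new york", "big apple"]
    new york  -> ["big apple", "NYC"]
    big apple -> ["NYC", "new york"]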
@@ -133,13 +187,25 @@ fn main() -> Result<(), Box<dyn Error>> {
         Schema::from_toml(file)?
     };

-    let stop_words = match opt.stop_words_path {
+    let stop_words = match opt.stop_words {
         Some(ref path) => retrieve_stop_words(path)?,
         None => HashSet::new(),
     };

+    let synonyms = match opt.synonyms {
+        Some(ref path) => read_synomys(path)?,
+        None => Vec::new(),
+    };
+
     let start = Instant::now();
-    let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);
+    let result = index(
+        schema,
+        &opt.database_path,
+        &opt.csv_data_path,
+        opt.update_group_size,
+        &stop_words,
+        synonyms,
+    );

     if let Err(e) = result {
         return Err(e.into())
Interactive query example:

@@ -2,17 +2,19 @@
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

 use std::collections::btree_map::{BTreeMap, Entry};
-use std::collections::{HashMap, HashSet};
-use std::iter::FromIterator;
-use std::io::{self, Write};
-use std::time::{Instant, Duration};
-use std::path::PathBuf;
+use std::collections::HashSet;
 use std::error::Error;
+use std::io::{self, Write};
+use std::iter::FromIterator;
+use std::path::PathBuf;
+use std::time::{Instant, Duration};

-use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
+use linked_hash_map::LinkedHashMap;
+use rustyline::{Editor, Config};
 use structopt::StructOpt;
-use meilidb_core::Highlight;
+use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
+
+use meilidb_core::Highlight;
 use meilidb_data::Database;
 use meilidb_schema::SchemaAttr;
@@ -22,6 +24,9 @@ pub struct Opt {
     #[structopt(parse(from_os_str))]
     pub database_path: PathBuf,

+    #[structopt(long = "fetch-timeout-ms")]
+    pub fetch_timeout_ms: Option<u64>,
+
     /// Fields that must be displayed.
     pub displayed_fields: Vec<String>,
@@ -34,7 +39,7 @@ pub struct Opt {
     pub char_context: usize,
 }

-type Document = HashMap<String, String>;
+type Document = LinkedHashMap<String, String>;

 fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
     let mut stdout = StandardStream::stdout(ColorChoice::Always);
@@ -140,9 +145,6 @@ fn main() -> Result<(), Box<dyn Error>> {
     let start = Instant::now();
     let database = Database::start_default(&opt.database_path)?;

-    let mut buffer = String::new();
-    let input = io::stdin();
-
     let index = database.open_index("test")?.unwrap();
     let schema = index.schema();
@@ -151,65 +153,77 @@ fn main() -> Result<(), Box<dyn Error>> {
     let fields = opt.displayed_fields.iter().map(String::as_str);
     let fields = HashSet::from_iter(fields);

-    loop {
-        print!("Searching for: ");
-        io::stdout().flush()?;
-
-        if input.read_line(&mut buffer)? == 0 { break }
-        let query = buffer.trim_end_matches('\n');
-
-        let start_total = Instant::now();
-
-        let builder = index.query_builder();
-        let documents = builder.query(query, 0..opt.number_results)?;
-
-        let mut retrieve_duration = Duration::default();
-
-        let number_of_documents = documents.len();
-        for mut doc in documents {
-
-            doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
-
-            let start_retrieve = Instant::now();
-            let result = index.document::<Document>(Some(&fields), doc.id);
-            retrieve_duration += start_retrieve.elapsed();
-
-            match result {
-                Ok(Some(document)) => {
-                    for (name, text) in document {
-                        print!("{}: ", name);
-
-                        let attr = schema.attribute(&name).unwrap();
-                        let highlights = doc.highlights.iter()
-                            .filter(|m| SchemaAttr::new(m.attribute) == attr)
-                            .cloned();
-                        let (text, highlights) = crop_text(&text, highlights, opt.char_context);
-                        let areas = create_highlight_areas(&text, &highlights);
-                        display_highlights(&text, &areas)?;
-                        println!();
-                    }
-                },
-                Ok(None) => eprintln!("missing document"),
-                Err(e) => eprintln!("{}", e),
-            }
-
-            let mut matching_attributes = HashSet::new();
-            for highlight in doc.highlights {
-                let attr = SchemaAttr::new(highlight.attribute);
-                let name = schema.attribute_name(attr);
-                matching_attributes.insert(name);
-            }
-
-            let matching_attributes = Vec::from_iter(matching_attributes);
-            println!("matching in: {:?}", matching_attributes);
-
-            println!();
-        }
-
-        eprintln!("document field retrieve took {:.2?}", retrieve_duration);
-        eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
-        buffer.clear();
-    }
+    let config = Config::builder().auto_add_history(true).build();
+    let mut readline = Editor::<()>::with_config(config);
+    let _ = readline.load_history("query-history.txt");
+
+    for result in readline.iter("Searching for: ") {
+        match result {
+            Ok(query) => {
+                let start_total = Instant::now();
+
+                let builder = match opt.fetch_timeout_ms {
+                    Some(timeout_ms) => {
+                        let timeout = Duration::from_millis(timeout_ms);
+                        index.query_builder().with_fetch_timeout(timeout)
+                    },
+                    None => index.query_builder(),
+                };
+                let documents = builder.query(&query, 0..opt.number_results)?;
+
+                let mut retrieve_duration = Duration::default();
+
+                let number_of_documents = documents.len();
+                for mut doc in documents {
+
+                    doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
+
+                    let start_retrieve = Instant::now();
+                    let result = index.document::<Document>(Some(&fields), doc.id);
+                    retrieve_duration += start_retrieve.elapsed();
+
+                    match result {
+                        Ok(Some(document)) => {
+                            for (name, text) in document {
+                                print!("{}: ", name);
+
+                                let attr = schema.attribute(&name).unwrap();
+                                let highlights = doc.highlights.iter()
+                                    .filter(|m| SchemaAttr::new(m.attribute) == attr)
+                                    .cloned();
+                                let (text, highlights) = crop_text(&text, highlights, opt.char_context);
+                                let areas = create_highlight_areas(&text, &highlights);
+                                display_highlights(&text, &areas)?;
+                                println!();
+                            }
+                        },
+                        Ok(None) => eprintln!("missing document"),
+                        Err(e) => eprintln!("{}", e),
+                    }
+
+                    let mut matching_attributes = HashSet::new();
+                    for highlight in doc.highlights {
+                        let attr = SchemaAttr::new(highlight.attribute);
+                        let name = schema.attribute_name(attr);
+                        matching_attributes.insert(name);
+                    }
+
+                    let matching_attributes = Vec::from_iter(matching_attributes);
+                    println!("matching in: {:?}", matching_attributes);
+
+                    println!();
+                }
+
+                eprintln!("document field retrieve took {:.2?}", retrieve_duration);
+                eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
+            },
+            Err(err) => {
+                println!("Error: {:?}", err);
+                break
+            }
+        }
+    }
+
+    readline.save_history("query-history.txt").unwrap();
     Ok(())
 }
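The `#![feature(checked_duration_since)]` gate added to lib.rs hints at how `with_fetch_timeout` is likely enforced inside the query builder (whose diff is suppressed above): deadline checks that cannot underflow. A hedged sketch of such a loop, not the PR's actual code:

    use std::time::{Duration, Instant};

    // Assumption: fetching proceeds in small steps, checking the deadline
    // between steps.
    fn fetch_with_timeout(timeout: Duration, mut fetch_step: impl FnMut() -> bool) {
        let start = Instant::now();
        loop {
            // returns None instead of panicking if the comparison would go
            // negative; unstable in 2019, hence the feature gate
            if let Some(elapsed) = Instant::now().checked_duration_since(start) {
                if elapsed >= timeout { break } // deadline reached: stop fetching
            }
            if !fetch_step() { break } // nothing left to fetch
        }
    }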