mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-23 21:20:24 +01:00
Merge pull request #453 from meilisearch/introduce-query-tree
Introduce a query tree structure
This commit is contained in:
commit
69adb1d771
16
Cargo.lock
generated
16
Cargo.lock
generated
@ -799,6 +799,14 @@ dependencies = [
|
||||
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "intervaltree"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"smallvec 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iovec"
|
||||
version = "0.1.4"
|
||||
@ -952,6 +960,7 @@ dependencies = [
|
||||
"hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"intervaltree 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -1797,6 +1806,11 @@ dependencies = [
|
||||
"maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "sourcefile"
|
||||
version = "0.1.4"
|
||||
@ -2715,6 +2729,7 @@ dependencies = [
|
||||
"checksum idna 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "38f09e0f0b1fb55fdee1f17470ad800da77af5186a1a76c026b679358b7e844e"
|
||||
"checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9"
|
||||
"checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2"
|
||||
"checksum intervaltree 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "8254add2ea664734c9d001f8151cc3d7696b135f7e40e5a2efa814a662cb3a44"
|
||||
"checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
|
||||
"checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
|
||||
"checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f"
|
||||
@ -2822,6 +2837,7 @@ dependencies = [
|
||||
"checksum slice-group-by 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb"
|
||||
"checksum slog 2.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1cc9c640a4adbfbcc11ffb95efe5aa7af7309e002adab54b185507dbf2377b99"
|
||||
"checksum smallvec 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "f7b0758c52e15a8b5e3691eae6cc559f08eee9406e548a4477ba4e67770a82b6"
|
||||
"checksum smallvec 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "44e59e0c9fa00817912ae6e4e6e3c4fe04455e75699d06eedc7d85917ed8e8f4"
|
||||
"checksum sourcefile 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4bf77cb82ba8453b42b6ae1d692e4cdc92f9a47beaf89a847c8be83f4e328ad3"
|
||||
"checksum spin 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
||||
"checksum stdweb 0.4.20 (registry+https://github.com/rust-lang/crates.io-index)" = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5"
|
||||
|
@ -17,7 +17,8 @@ env_logger = "0.7.0"
|
||||
fst = { version = "0.3.5", default-features = false }
|
||||
hashbrown = { version = "0.6.0", features = ["serde"] }
|
||||
heed = "0.6.1"
|
||||
itertools = "0.8.2" # kill me please
|
||||
intervaltree = "0.2.5"
|
||||
itertools = "0.8.2"
|
||||
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
||||
log = "0.4.8"
|
||||
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" }
|
||||
|
@ -1,13 +1,8 @@
|
||||
mod dfa;
|
||||
mod query_enhancer;
|
||||
|
||||
use meilisearch_tokenizer::is_cjk;
|
||||
|
||||
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||
pub use self::query_enhancer::QueryEnhancer;
|
||||
pub use self::query_enhancer::QueryEnhancerBuilder;
|
||||
|
||||
pub const NGRAMS: usize = 3;
|
||||
|
||||
pub fn normalize_str(string: &str) -> String {
|
||||
let mut string = string.to_lowercase();
|
||||
|
@ -1,437 +0,0 @@
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
use std::ops::Range;
|
||||
|
||||
/// Return `true` if the specified range can accept the given replacements words.
|
||||
/// Returns `false` if the replacements words are already present in the original query
|
||||
/// or if there is fewer replacement words than the range to replace.
|
||||
//
|
||||
//
|
||||
// ## Ignored because already present in original
|
||||
//
|
||||
// new york city subway
|
||||
// -------- ^^^^
|
||||
// / \
|
||||
// [new york city]
|
||||
//
|
||||
//
|
||||
// ## Ignored because smaller than the original
|
||||
//
|
||||
// new york city subway
|
||||
// -------------
|
||||
// \ /
|
||||
// [new york]
|
||||
//
|
||||
//
|
||||
// ## Accepted because bigger than the original
|
||||
//
|
||||
// NYC subway
|
||||
// ---
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// [new york city]
|
||||
//
|
||||
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
||||
where
|
||||
S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
{
|
||||
if words.len() <= range.len() {
|
||||
// there is fewer or equal replacement words
|
||||
// than there is already in the replaced range
|
||||
return false;
|
||||
}
|
||||
|
||||
// retrieve the part to rewrite but with the length
|
||||
// of the replacement part
|
||||
let original = query.iter().skip(range.start).take(words.len());
|
||||
|
||||
// check if the original query doesn't already contain
|
||||
// the replacement words
|
||||
!original
|
||||
.map(AsRef::as_ref)
|
||||
.eq(words.iter().map(AsRef::as_ref))
|
||||
}
|
||||
|
||||
type Origin = usize;
|
||||
type RealLength = usize;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FakeIntervalTree {
|
||||
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl FakeIntervalTree {
|
||||
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
|
||||
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
|
||||
FakeIntervalTree { intervals }
|
||||
}
|
||||
|
||||
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
||||
let element = self.intervals.binary_search_by(|(r, _)| {
|
||||
if point >= r.start {
|
||||
if point < r.end {
|
||||
Equal
|
||||
} else {
|
||||
Less
|
||||
}
|
||||
} else {
|
||||
Greater
|
||||
}
|
||||
});
|
||||
|
||||
let n = match element {
|
||||
Ok(n) => n,
|
||||
Err(n) => n,
|
||||
};
|
||||
|
||||
match self.intervals.get(n) {
|
||||
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct QueryEnhancerBuilder<'a, S> {
|
||||
query: &'a [S],
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
||||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
||||
// we initialize origins query indices based on their positions
|
||||
let origins: Vec<_> = (0..=query.len()).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
|
||||
|
||||
QueryEnhancerBuilder {
|
||||
query,
|
||||
origins,
|
||||
real_to_origin,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the final real to origin query indices mapping.
|
||||
///
|
||||
/// `range` is the original words range that this `replacement` words replace
|
||||
/// and `real` is the first real query index of these replacement words.
|
||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
||||
where
|
||||
T: AsRef<str>,
|
||||
{
|
||||
// check if the range of original words
|
||||
// can be rewritten with the replacement words
|
||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
||||
// this range can be replaced so we need to
|
||||
// modify the origins accordingly
|
||||
let offset = replacement.len() - range.len();
|
||||
|
||||
let previous_padding = self.origins[range.end - 1];
|
||||
let current_offset = (self.origins[range.end] - 1) - previous_padding;
|
||||
let diff = offset.saturating_sub(current_offset);
|
||||
self.origins[range.end] += diff;
|
||||
|
||||
for r in &mut self.origins[range.end + 1..] {
|
||||
*r += diff;
|
||||
}
|
||||
}
|
||||
|
||||
// we need to store the real number and origins relations
|
||||
// this way it will be possible to know by how many
|
||||
// we need to pad real query indices
|
||||
let real_range = real..real + replacement.len().max(range.len());
|
||||
let real_length = replacement.len();
|
||||
self.real_to_origin.push((real_range, (range.start, real_length)));
|
||||
}
|
||||
|
||||
pub fn build(self) -> QueryEnhancer {
|
||||
let interval_tree = FakeIntervalTree::new(self.real_to_origin);
|
||||
let mut table = Vec::new();
|
||||
|
||||
for real in 0.. {
|
||||
match replacement(&self.origins, &interval_tree, real) {
|
||||
Some(range) => table.push(range),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
QueryEnhancer { table }
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the query indices that represent this real query index.
|
||||
fn replacement(
|
||||
origins: &[usize],
|
||||
real_to_origin: &FakeIntervalTree,
|
||||
real: u32,
|
||||
) -> Option<Range<u32>>
|
||||
{
|
||||
let real = real as usize;
|
||||
|
||||
// query the fake interval tree with the real query index
|
||||
let (range, (origin, real_length)) = real_to_origin.query(real)?;
|
||||
|
||||
// if `real` is the end bound of the range
|
||||
if (range.start + real_length - 1) == real {
|
||||
let mut count = range.len();
|
||||
let mut new_origin = origin;
|
||||
for (i, slice) in origins[new_origin..].windows(2).enumerate() {
|
||||
let len = slice[1] - slice[0];
|
||||
count = count.saturating_sub(len);
|
||||
if count == 0 {
|
||||
new_origin = origin + i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let n = real - range.start;
|
||||
let start = origins[origin];
|
||||
let end = origins.get(new_origin + 1)?;
|
||||
let remaining = (end - start) - n;
|
||||
|
||||
Some(Range {
|
||||
start: (start + n) as u32,
|
||||
end: (start + n + remaining) as u32,
|
||||
})
|
||||
} else {
|
||||
// just return the origin along with
|
||||
// the real position of the word
|
||||
let n = real as usize - range.start;
|
||||
let origin = origins[origin];
|
||||
|
||||
Some(Range {
|
||||
start: (origin + n) as u32,
|
||||
end: (origin + n + 1) as u32,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct QueryEnhancer {
|
||||
table: Vec<Range<u32>>,
|
||||
}
|
||||
|
||||
impl QueryEnhancer {
|
||||
/// Returns the query indices that represent this real query index.
|
||||
pub fn replacement(&self, real: u32) -> Range<u32> {
|
||||
self.table[real as usize].clone()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn original_unmodified() {
|
||||
let query = ["new", "york", "city", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(2), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(3), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn simple_growing() {
|
||||
let query = ["new", "york", "subway"];
|
||||
// 0 1 2
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 3, &["new", "york", "city"]);
|
||||
// ^ 3 4 5
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(1), 1..3); // york
|
||||
assert_eq!(enhancer.replacement(2), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(3), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(4), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(5), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn same_place_growings() {
|
||||
let query = ["NY", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NY = new york
|
||||
builder.declare(0..1, 2, &["new", "york"]);
|
||||
// ^ 2 3
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// NY = NYC
|
||||
builder.declare(0..1, 7, &["NYC"]);
|
||||
// ^ 7
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 8, &["new", "york", "city"]);
|
||||
// ^ 8 9 10
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(1..2, 11, &["underground", "train"]);
|
||||
// ^ 11 12
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..3); // NY
|
||||
assert_eq!(enhancer.replacement(1), 3..5); // subway
|
||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(3), 1..3); // york
|
||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(7), 0..3); // NYC
|
||||
assert_eq!(enhancer.replacement(8), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(9), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(10), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(11), 3..4); // underground
|
||||
assert_eq!(enhancer.replacement(12), 4..5); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bigger_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(0..1, 2, &["new", "york", "city"]);
|
||||
// ^ 2 3 4
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..3); // NYC
|
||||
assert_eq!(enhancer.replacement(1), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(3), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn middle_query_growing() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..6); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn end_query_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(1..2, 2, &["underground", "train"]);
|
||||
// ^ 2 3
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // NYC
|
||||
assert_eq!(enhancer.replacement(1), 1..3); // subway
|
||||
assert_eq!(enhancer.replacement(2), 1..2); // underground
|
||||
assert_eq!(enhancer.replacement(3), 2..3); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_probable_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
// great awesome = good
|
||||
builder.declare(0..2, 9, &["good"]);
|
||||
// ^ 9
|
||||
|
||||
// awesome NYC = NY
|
||||
builder.declare(1..3, 10, &["NY"]);
|
||||
// ^^ 10
|
||||
|
||||
// NYC subway = metro
|
||||
builder.declare(2..4, 11, &["metro"]);
|
||||
// ^^ 11
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
||||
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
||||
}
|
||||
}
|
@ -1,31 +1,27 @@
|
||||
use std::ops::Deref;
|
||||
use std::{cmp, fmt};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::mem;
|
||||
use std::ops::Deref;
|
||||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::time::Instant;
|
||||
use std::fmt;
|
||||
|
||||
use compact_arena::{SmallArena, Idx32, mk_arena};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use hashbrown::HashMap;
|
||||
use levenshtein_automata::DFA;
|
||||
use log::debug;
|
||||
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
||||
use meilisearch_types::DocIndex;
|
||||
use sdset::{Set, SetBuf};
|
||||
use sdset::{Set, SetBuf, exponential_search};
|
||||
use slice_group_by::{GroupBy, GroupByMut};
|
||||
|
||||
use crate::automaton::NGRAMS;
|
||||
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
|
||||
|
||||
use crate::criterion::{Criteria, Context, ContextMut};
|
||||
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||
use crate::raw_document::RawDocument;
|
||||
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
||||
use crate::{store, Document, DocumentId, MResult};
|
||||
use crate::query_tree::{create_query_tree, traverse_query_tree};
|
||||
use crate::query_tree::{Operation, QueryResult, QueryKind, QueryId, PostingsKey};
|
||||
use crate::query_tree::Context as QTContext;
|
||||
|
||||
pub fn bucket_sort<'c, FI>(
|
||||
reader: &heed::RoTxn<MainT>,
|
||||
@ -38,6 +34,8 @@ pub fn bucket_sort<'c, FI>(
|
||||
postings_lists_store: store::PostingsLists,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
synonyms_store: store::Synonyms,
|
||||
prefix_documents_cache_store: store::PrefixDocumentsCache,
|
||||
prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
|
||||
) -> MResult<Vec<Document>>
|
||||
where
|
||||
FI: Fn(DocumentId) -> bool,
|
||||
@ -60,42 +58,63 @@ where
|
||||
postings_lists_store,
|
||||
documents_fields_counts_store,
|
||||
synonyms_store,
|
||||
prefix_documents_cache_store,
|
||||
prefix_postings_lists_cache_store,
|
||||
);
|
||||
}
|
||||
|
||||
let (mut automatons, mut query_enhancer) =
|
||||
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
|
||||
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
debug!("{:?}", query_enhancer);
|
||||
let context = QTContext {
|
||||
words_set,
|
||||
synonyms: synonyms_store,
|
||||
postings_lists: postings_lists_store,
|
||||
prefix_postings_lists: prefix_postings_lists_cache_store,
|
||||
};
|
||||
|
||||
let before_postings_lists_fetching = Instant::now();
|
||||
mk_arena!(arena);
|
||||
let mut bare_matches =
|
||||
fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
|
||||
debug!("bare matches ({}) retrieved in {:.02?}",
|
||||
bare_matches.len(),
|
||||
before_postings_lists_fetching.elapsed(),
|
||||
);
|
||||
let (operation, mapping) = create_query_tree(reader, &context, query)?;
|
||||
debug!("operation:\n{:?}", operation);
|
||||
debug!("mapping:\n{:?}", mapping);
|
||||
|
||||
let before_raw_documents_presort = Instant::now();
|
||||
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
||||
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
||||
|
||||
let before_raw_documents_building = Instant::now();
|
||||
let mut prefiltered_documents = 0;
|
||||
let mut raw_documents = Vec::new();
|
||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||
prefiltered_documents += 1;
|
||||
if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
|
||||
raw_documents.push(raw_document);
|
||||
fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
|
||||
match operation {
|
||||
Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
|
||||
Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
|
||||
Operation::Query(query) => { map.insert(query.id, &query.kind); },
|
||||
}
|
||||
}
|
||||
debug!("creating {} (original {}) candidates documents took {:.02?}",
|
||||
|
||||
let mut queries_kinds = HashMap::new();
|
||||
recurs_operation(&mut queries_kinds, &operation);
|
||||
|
||||
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation)?;
|
||||
debug!("found {} documents", docids.len());
|
||||
debug!("number of postings {:?}", queries.len());
|
||||
|
||||
let before = Instant::now();
|
||||
mk_arena!(arena);
|
||||
let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
|
||||
debug!("matches cleaned in {:.02?}", before.elapsed());
|
||||
|
||||
let before_bucket_sort = Instant::now();
|
||||
|
||||
let before_raw_documents_building = Instant::now();
|
||||
let mut raw_documents = Vec::new();
|
||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
|
||||
raw_documents.push(raw_document);
|
||||
}
|
||||
debug!("creating {} candidates documents took {:.02?}",
|
||||
raw_documents.len(),
|
||||
prefiltered_documents,
|
||||
before_raw_documents_building.elapsed(),
|
||||
);
|
||||
|
||||
let before_criterion_loop = Instant::now();
|
||||
let proximity_count = AtomicUsize::new(0);
|
||||
|
||||
let mut groups = vec![raw_documents.as_mut_slice()];
|
||||
|
||||
'criteria: for criterion in criteria.as_ref() {
|
||||
@ -108,8 +127,7 @@ where
|
||||
let ctx = ContextMut {
|
||||
reader,
|
||||
postings_lists: &mut arena,
|
||||
query_enhancer: &mut query_enhancer,
|
||||
automatons: &mut automatons,
|
||||
query_mapping: &mapping,
|
||||
documents_fields_counts_store,
|
||||
};
|
||||
|
||||
@ -118,8 +136,7 @@ where
|
||||
|
||||
let ctx = Context {
|
||||
postings_lists: &arena,
|
||||
query_enhancer: &query_enhancer,
|
||||
automatons: &automatons,
|
||||
query_mapping: &mapping,
|
||||
};
|
||||
|
||||
let before_criterion_sort = Instant::now();
|
||||
@ -141,10 +158,16 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
|
||||
let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena, searchable_attrs.as_ref()));
|
||||
debug!("criterion loop took {:.02?}", before_criterion_loop.elapsed());
|
||||
debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed));
|
||||
|
||||
Ok(iter.collect())
|
||||
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
|
||||
let iter = iter.map(|rd| Document::from_raw(rd, &queries_kinds, &arena, searchable_attrs.as_ref()));
|
||||
let documents = iter.collect();
|
||||
|
||||
debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed());
|
||||
|
||||
Ok(documents)
|
||||
}
|
||||
|
||||
pub fn bucket_sort_with_distinct<'c, FI, FD>(
|
||||
@ -160,38 +183,57 @@ pub fn bucket_sort_with_distinct<'c, FI, FD>(
|
||||
postings_lists_store: store::PostingsLists,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
synonyms_store: store::Synonyms,
|
||||
_prefix_documents_cache_store: store::PrefixDocumentsCache,
|
||||
prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
|
||||
) -> MResult<Vec<Document>>
|
||||
where
|
||||
FI: Fn(DocumentId) -> bool,
|
||||
FD: Fn(DocumentId) -> Option<u64>,
|
||||
{
|
||||
let (mut automatons, mut query_enhancer) =
|
||||
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
|
||||
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
let before_postings_lists_fetching = Instant::now();
|
||||
mk_arena!(arena);
|
||||
let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
|
||||
debug!("bare matches ({}) retrieved in {:.02?}",
|
||||
bare_matches.len(),
|
||||
before_postings_lists_fetching.elapsed(),
|
||||
);
|
||||
let context = QTContext {
|
||||
words_set,
|
||||
synonyms: synonyms_store,
|
||||
postings_lists: postings_lists_store,
|
||||
prefix_postings_lists: prefix_postings_lists_cache_store,
|
||||
};
|
||||
|
||||
let before_raw_documents_presort = Instant::now();
|
||||
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
||||
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
||||
let (operation, mapping) = create_query_tree(reader, &context, query)?;
|
||||
debug!("operation:\n{:?}", operation);
|
||||
debug!("mapping:\n{:?}", mapping);
|
||||
|
||||
let before_raw_documents_building = Instant::now();
|
||||
let mut prefiltered_documents = 0;
|
||||
let mut raw_documents = Vec::new();
|
||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||
prefiltered_documents += 1;
|
||||
if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
|
||||
raw_documents.push(raw_document);
|
||||
fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
|
||||
match operation {
|
||||
Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
|
||||
Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
|
||||
Operation::Query(query) => { map.insert(query.id, &query.kind); },
|
||||
}
|
||||
}
|
||||
debug!("creating {} (original {}) candidates documents took {:.02?}",
|
||||
|
||||
let mut queries_kinds = HashMap::new();
|
||||
recurs_operation(&mut queries_kinds, &operation);
|
||||
|
||||
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation)?;
|
||||
debug!("found {} documents", docids.len());
|
||||
debug!("number of postings {:?}", queries.len());
|
||||
|
||||
let before = Instant::now();
|
||||
mk_arena!(arena);
|
||||
let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
|
||||
debug!("matches cleaned in {:.02?}", before.elapsed());
|
||||
|
||||
let before_raw_documents_building = Instant::now();
|
||||
let mut raw_documents = Vec::new();
|
||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
|
||||
raw_documents.push(raw_document);
|
||||
}
|
||||
debug!("creating {} candidates documents took {:.02?}",
|
||||
raw_documents.len(),
|
||||
prefiltered_documents,
|
||||
before_raw_documents_building.elapsed(),
|
||||
);
|
||||
|
||||
@ -222,8 +264,7 @@ where
|
||||
let ctx = ContextMut {
|
||||
reader,
|
||||
postings_lists: &mut arena,
|
||||
query_enhancer: &mut query_enhancer,
|
||||
automatons: &mut automatons,
|
||||
query_mapping: &mapping,
|
||||
documents_fields_counts_store,
|
||||
};
|
||||
|
||||
@ -233,8 +274,7 @@ where
|
||||
|
||||
let ctx = Context {
|
||||
postings_lists: &arena,
|
||||
query_enhancer: &query_enhancer,
|
||||
automatons: &automatons,
|
||||
query_mapping: &mapping,
|
||||
};
|
||||
|
||||
let before_criterion_sort = Instant::now();
|
||||
@ -306,7 +346,7 @@ where
|
||||
};
|
||||
|
||||
if distinct_accepted && seen.len() > range.start {
|
||||
documents.push(Document::from_raw(raw_document, &automatons, &arena, searchable_attrs.as_ref()));
|
||||
documents.push(Document::from_raw(raw_document, &queries_kinds, &arena, searchable_attrs.as_ref()));
|
||||
if documents.len() == range.len() {
|
||||
break;
|
||||
}
|
||||
@ -317,9 +357,82 @@ where
|
||||
Ok(documents)
|
||||
}
|
||||
|
||||
fn cleanup_bare_matches<'tag, 'txn>(
|
||||
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
docids: &Set<DocumentId>,
|
||||
queries: HashMap<PostingsKey, Cow<'txn, Set<DocIndex>>>,
|
||||
) -> Vec<BareMatch<'tag>>
|
||||
{
|
||||
let docidslen = docids.len() as f32;
|
||||
let mut bare_matches = Vec::new();
|
||||
|
||||
for (PostingsKey { query, input, distance, is_exact }, matches) in queries {
|
||||
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
|
||||
let pllen = postings_list_view.len() as f32;
|
||||
|
||||
if docidslen / pllen >= 0.8 {
|
||||
let mut offset = 0;
|
||||
for matches in postings_list_view.linear_group_by_key(|m| m.document_id) {
|
||||
let document_id = matches[0].document_id;
|
||||
if docids.contains(&document_id) {
|
||||
let range = postings_list_view.range(offset, matches.len());
|
||||
let posting_list_index = arena.add(range);
|
||||
|
||||
let bare_match = BareMatch {
|
||||
document_id,
|
||||
query_index: query.id,
|
||||
distance,
|
||||
is_exact,
|
||||
postings_list: posting_list_index,
|
||||
};
|
||||
|
||||
bare_matches.push(bare_match);
|
||||
}
|
||||
|
||||
offset += matches.len();
|
||||
}
|
||||
|
||||
} else {
|
||||
let mut offset = 0;
|
||||
for id in docids.as_slice() {
|
||||
let di = DocIndex { document_id: *id, ..DocIndex::default() };
|
||||
let pos = exponential_search(&postings_list_view[offset..], &di).unwrap_or_else(|x| x);
|
||||
|
||||
offset += pos;
|
||||
|
||||
let group = postings_list_view[offset..]
|
||||
.linear_group_by_key(|m| m.document_id)
|
||||
.next()
|
||||
.filter(|matches| matches[0].document_id == *id);
|
||||
|
||||
if let Some(matches) = group {
|
||||
let range = postings_list_view.range(offset, matches.len());
|
||||
let posting_list_index = arena.add(range);
|
||||
|
||||
let bare_match = BareMatch {
|
||||
document_id: *id,
|
||||
query_index: query.id,
|
||||
distance,
|
||||
is_exact,
|
||||
postings_list: posting_list_index,
|
||||
};
|
||||
|
||||
bare_matches.push(bare_match);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let before_raw_documents_presort = Instant::now();
|
||||
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
||||
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
||||
|
||||
bare_matches
|
||||
}
|
||||
|
||||
pub struct BareMatch<'tag> {
|
||||
pub document_id: DocumentId,
|
||||
pub query_index: u16,
|
||||
pub query_index: usize,
|
||||
pub distance: u8,
|
||||
pub is_exact: bool,
|
||||
pub postings_list: Idx32<'tag>,
|
||||
@ -338,7 +451,7 @@ impl fmt::Debug for BareMatch<'_> {
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct SimpleMatch {
|
||||
pub query_index: u16,
|
||||
pub query_index: usize,
|
||||
pub distance: u8,
|
||||
pub attribute: u16,
|
||||
pub word_index: u16,
|
||||
@ -436,285 +549,3 @@ impl Deref for PostingsListView<'_> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_matches<'txn, 'tag>(
|
||||
reader: &'txn heed::RoTxn<MainT>,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
) -> MResult<Vec<BareMatch<'tag>>>
|
||||
{
|
||||
let before_words_fst = Instant::now();
|
||||
let words = match main_store.words_fst(reader)? {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
debug!("words fst took {:.02?}", before_words_fst.elapsed());
|
||||
debug!("words fst len {} and size {}", words.len(), words.as_fst().as_bytes().len());
|
||||
|
||||
let mut total_postings_lists = Vec::new();
|
||||
|
||||
let mut dfa_time = Duration::default();
|
||||
let mut stream_next_time = Duration::default();
|
||||
let mut postings_lists_fetching_time = Duration::default();
|
||||
let automatons_loop = Instant::now();
|
||||
|
||||
for (query_index, automaton) in automatons.iter().enumerate() {
|
||||
let before_dfa = Instant::now();
|
||||
let dfa = automaton.dfa();
|
||||
let QueryWordAutomaton { query, is_exact, .. } = automaton;
|
||||
dfa_time += before_dfa.elapsed();
|
||||
|
||||
let mut number_of_words = 0;
|
||||
let mut stream = words.search(&dfa).into_stream();
|
||||
|
||||
// while let Some(input) = stream.next() {
|
||||
loop {
|
||||
let before_stream_next = Instant::now();
|
||||
let value = stream.next();
|
||||
stream_next_time += before_stream_next.elapsed();
|
||||
|
||||
let input = match value {
|
||||
Some(input) => input,
|
||||
None => break,
|
||||
};
|
||||
|
||||
number_of_words += 1;
|
||||
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
|
||||
|
||||
let before_postings_lists_fetching = Instant::now();
|
||||
if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
|
||||
let input = Rc::from(input);
|
||||
let postings_list = Rc::new(postings_list);
|
||||
let postings_list_view = PostingsListView::original(input, postings_list);
|
||||
|
||||
let mut offset = 0;
|
||||
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
|
||||
let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
|
||||
let document_id = group[0].document_id;
|
||||
let bare_match = BareMatch {
|
||||
document_id,
|
||||
query_index: query_index as u16,
|
||||
distance,
|
||||
is_exact,
|
||||
postings_list: posting_list_index,
|
||||
};
|
||||
|
||||
total_postings_lists.push(bare_match);
|
||||
offset += group.len();
|
||||
}
|
||||
}
|
||||
postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
|
||||
}
|
||||
|
||||
debug!("{:?} gives {} words", query, number_of_words);
|
||||
}
|
||||
|
||||
debug!("automatons loop took {:.02?}", automatons_loop.elapsed());
|
||||
debug!("stream next took {:.02?}", stream_next_time);
|
||||
debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
|
||||
debug!("dfa creation took {:.02?}", dfa_time);
|
||||
|
||||
Ok(total_postings_lists)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct QueryWordAutomaton {
|
||||
pub query: String,
|
||||
/// Is it a word that must be considered exact
|
||||
/// or is it some derived word (i.e. a synonym)
|
||||
pub is_exact: bool,
|
||||
pub is_prefix: bool,
|
||||
/// If it's a phrase query and what is
|
||||
/// its index an the length of the phrase
|
||||
pub phrase_query: Option<(u16, u16)>,
|
||||
}
|
||||
|
||||
impl QueryWordAutomaton {
|
||||
pub fn exact(query: &str) -> QueryWordAutomaton {
|
||||
QueryWordAutomaton {
|
||||
query: query.to_string(),
|
||||
is_exact: true,
|
||||
is_prefix: false,
|
||||
phrase_query: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
|
||||
QueryWordAutomaton {
|
||||
query: query.to_string(),
|
||||
is_exact: true,
|
||||
is_prefix: true,
|
||||
phrase_query: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn non_exact(query: &str) -> QueryWordAutomaton {
|
||||
QueryWordAutomaton {
|
||||
query: query.to_string(),
|
||||
is_exact: false,
|
||||
is_prefix: false,
|
||||
phrase_query: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn dfa(&self) -> DFA {
|
||||
if self.phrase_query.is_some() {
|
||||
build_exact_dfa(&self.query)
|
||||
} else if self.is_prefix {
|
||||
build_prefix_dfa(&self.query)
|
||||
} else {
|
||||
build_dfa(&self.query)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn split_best_frequency<'a>(
|
||||
reader: &heed::RoTxn<MainT>,
|
||||
word: &'a str,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
) -> MResult<Option<(&'a str, &'a str)>> {
|
||||
let chars = word.char_indices().skip(1);
|
||||
let mut best = None;
|
||||
|
||||
for (i, _) in chars {
|
||||
let (left, right) = word.split_at(i);
|
||||
|
||||
let left_freq = postings_lists_store
|
||||
.postings_list(reader, left.as_ref())?
|
||||
.map_or(0, |i| i.len());
|
||||
|
||||
let right_freq = postings_lists_store
|
||||
.postings_list(reader, right.as_ref())?
|
||||
.map_or(0, |i| i.len());
|
||||
|
||||
let min_freq = cmp::min(left_freq, right_freq);
|
||||
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||
best = Some((min_freq, left, right));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, l, r)| (l, r)))
|
||||
}
|
||||
|
||||
fn construct_automatons(
|
||||
reader: &heed::RoTxn<MainT>,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
synonym_store: store::Synonyms,
|
||||
) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||
Some(synonym) => synonym,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
let mut automaton_index = 0;
|
||||
let mut automatons = Vec::new();
|
||||
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
|
||||
|
||||
// We must not declare the original words to the query enhancer
|
||||
// *but* we need to push them in the automatons list first
|
||||
let mut original_words = query_words.iter().peekable();
|
||||
while let Some(word) = original_words.next() {
|
||||
let has_following_word = original_words.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
||||
|
||||
let automaton = if not_prefix_dfa {
|
||||
QueryWordAutomaton::exact(word)
|
||||
} else {
|
||||
QueryWordAutomaton::exact_prefix(word)
|
||||
};
|
||||
automaton_index += 1;
|
||||
automatons.push(automaton);
|
||||
}
|
||||
|
||||
for n in 1..=NGRAMS {
|
||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||
while let Some((query_index, ngram_slice)) = ngrams.next() {
|
||||
let query_range = query_index..query_index + n;
|
||||
let ngram_nb_words = ngram_slice.len();
|
||||
let ngram = ngram_slice.join(" ");
|
||||
|
||||
let has_following_word = ngrams.peek().is_some();
|
||||
let not_prefix_dfa =
|
||||
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
|
||||
// automaton of synonyms of the ngrams
|
||||
let normalized = normalize_str(&ngram);
|
||||
let lev = if not_prefix_dfa {
|
||||
build_dfa(&normalized)
|
||||
} else {
|
||||
build_prefix_dfa(&normalized)
|
||||
};
|
||||
|
||||
let mut stream = synonyms.search(&lev).into_stream();
|
||||
while let Some(base) = stream.next() {
|
||||
// only trigger alternatives when the last word has been typed
|
||||
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
||||
let base = std::str::from_utf8(base).unwrap();
|
||||
let base_nb_words = split_query_string(base).count();
|
||||
if ngram_nb_words != base_nb_words {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
|
||||
let mut stream = synonyms.into_stream();
|
||||
while let Some(synonyms) = stream.next() {
|
||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
||||
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
|
||||
let nb_synonym_words = synonyms_words.len();
|
||||
|
||||
let real_query_index = automaton_index;
|
||||
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
|
||||
|
||||
for synonym in synonyms_words {
|
||||
let automaton = if nb_synonym_words == 1 {
|
||||
QueryWordAutomaton::exact(synonym)
|
||||
} else {
|
||||
QueryWordAutomaton::non_exact(synonym)
|
||||
};
|
||||
automaton_index += 1;
|
||||
automatons.push(automaton);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if n == 1 {
|
||||
// automatons for splitted words
|
||||
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
|
||||
let mut left_automaton = QueryWordAutomaton::exact(left);
|
||||
left_automaton.phrase_query = Some((0, 2));
|
||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||
automaton_index += 1;
|
||||
automatons.push(left_automaton);
|
||||
|
||||
let mut right_automaton = QueryWordAutomaton::exact(right);
|
||||
right_automaton.phrase_query = Some((1, 2));
|
||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
|
||||
automaton_index += 1;
|
||||
automatons.push(right_automaton);
|
||||
}
|
||||
} else {
|
||||
// automaton of concatenation of query words
|
||||
let concat = ngram_slice.concat();
|
||||
let normalized = normalize_str(&concat);
|
||||
|
||||
let real_query_index = automaton_index;
|
||||
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
|
||||
|
||||
let automaton = QueryWordAutomaton::exact(&normalized);
|
||||
automaton_index += 1;
|
||||
automatons.push(automaton);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((automatons, enhancer_builder.build()))
|
||||
}
|
||||
|
@ -9,13 +9,13 @@ pub struct Attribute;
|
||||
impl Criterion for Attribute {
|
||||
fn name(&self) -> &str { "attribute" }
|
||||
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||
documents: &mut [RawDocument<'r, 'tag>],
|
||||
) -> MResult<()>
|
||||
{
|
||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -11,9 +11,9 @@ pub struct Exact;
|
||||
impl Criterion for Exact {
|
||||
fn name(&self) -> &str { "exact" }
|
||||
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||
documents: &mut [RawDocument<'r, 'tag>],
|
||||
) -> MResult<()>
|
||||
{
|
||||
|
@ -1,13 +1,15 @@
|
||||
use std::cmp::{self, Ordering};
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Range;
|
||||
|
||||
use compact_arena::SmallArena;
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::{store, RawDocument, MResult};
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
|
||||
use crate::bucket_sort::{SimpleMatch, PostingsListView};
|
||||
use crate::database::MainT;
|
||||
use crate::query_tree::QueryId;
|
||||
use crate::{store, RawDocument, MResult};
|
||||
|
||||
mod typo;
|
||||
mod words;
|
||||
@ -30,26 +32,26 @@ pub use self::sort_by_attr::SortByAttr;
|
||||
pub trait Criterion {
|
||||
fn name(&self) -> &str;
|
||||
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
||||
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||
_documents: &mut [RawDocument<'r, 'tag>],
|
||||
) -> MResult<()>
|
||||
{
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn evaluate<'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
|
||||
ctx: &Context<'p, 'tag, 'txn, 'q>,
|
||||
lhs: &RawDocument<'r, 'tag>,
|
||||
rhs: &RawDocument<'r, 'tag>,
|
||||
) -> Ordering;
|
||||
|
||||
#[inline]
|
||||
fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn eq<'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
|
||||
ctx: &Context<'p, 'tag, 'txn, 'q>,
|
||||
lhs: &RawDocument<'r, 'tag>,
|
||||
rhs: &RawDocument<'r, 'tag>,
|
||||
) -> bool
|
||||
@ -58,18 +60,16 @@ pub trait Criterion {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a> {
|
||||
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q> {
|
||||
pub reader: &'h heed::RoTxn<MainT>,
|
||||
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
pub query_enhancer: &'q mut QueryEnhancer,
|
||||
pub automatons: &'a mut [QueryWordAutomaton],
|
||||
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
|
||||
pub documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
}
|
||||
|
||||
pub struct Context<'p, 'tag, 'txn, 'q, 'a> {
|
||||
pub struct Context<'p, 'tag, 'txn, 'q> {
|
||||
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
|
||||
pub query_enhancer: &'q QueryEnhancer,
|
||||
pub automatons: &'a [QueryWordAutomaton],
|
||||
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@ -138,7 +138,7 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
|
||||
|
||||
fn prepare_query_distances<'a, 'tag, 'txn>(
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
query_enhancer: &QueryEnhancer,
|
||||
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
) {
|
||||
for document in documents {
|
||||
@ -148,7 +148,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
|
||||
for m in document.bare_matches.iter() {
|
||||
if postings_lists[m.postings_list].is_empty() { continue }
|
||||
|
||||
let range = query_enhancer.replacement(m.query_index as u32);
|
||||
let range = query_mapping[&(m.query_index as usize)].clone();
|
||||
let new_len = cmp::max(range.end as usize, processed.len());
|
||||
processed.resize(new_len, None);
|
||||
|
||||
@ -169,7 +169,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
|
||||
fn prepare_bare_matches<'a, 'tag, 'txn>(
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||
) {
|
||||
for document in documents {
|
||||
if !document.processed_matches.is_empty() { continue }
|
||||
@ -190,14 +190,14 @@ fn prepare_bare_matches<'a, 'tag, 'txn>(
|
||||
}
|
||||
}
|
||||
|
||||
let processed = multiword_rewrite_matches(&mut processed, query_enhancer);
|
||||
let processed = multiword_rewrite_matches(&mut processed, query_mapping);
|
||||
document.processed_matches = processed.into_vec();
|
||||
}
|
||||
}
|
||||
|
||||
fn multiword_rewrite_matches(
|
||||
matches: &mut [SimpleMatch],
|
||||
query_enhancer: &QueryEnhancer,
|
||||
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||
) -> SetBuf<SimpleMatch>
|
||||
{
|
||||
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
||||
@ -218,13 +218,12 @@ fn multiword_rewrite_matches(
|
||||
// find the biggest padding
|
||||
let mut biggest = 0;
|
||||
for match_ in same_word_index {
|
||||
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
|
||||
let mut replacement = query_mapping[&(match_.query_index as usize)].clone();
|
||||
let replacement_len = replacement.len();
|
||||
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
|
||||
|
||||
if let Some(query_index) = replacement.next() {
|
||||
let word_index = match_.word_index + padding as u16;
|
||||
let query_index = query_index as u16;
|
||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||
padded_matches.push(match_);
|
||||
}
|
||||
@ -236,20 +235,17 @@ fn multiword_rewrite_matches(
|
||||
'padding: for (x, next_group) in nexts.enumerate() {
|
||||
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let query_index = query_index as u16;
|
||||
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
|
||||
|
||||
for nmatch_ in next_group {
|
||||
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
|
||||
let query_index = rep.next().unwrap() as u16;
|
||||
let mut rep = query_mapping[&(nmatch_.query_index as usize)].clone();
|
||||
let query_index = rep.next().unwrap();
|
||||
if query_index == padmatch.query_index {
|
||||
if !found {
|
||||
// if we find a corresponding padding for the
|
||||
// first time we must push preceding paddings
|
||||
for (i, query_index) in replacement.clone().enumerate().take(i)
|
||||
{
|
||||
for (i, query_index) in replacement.clone().enumerate().take(i) {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let query_index = query_index as u16;
|
||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||
padded_matches.push(match_);
|
||||
biggest = biggest.max(i + 1);
|
||||
@ -273,7 +269,6 @@ fn multiword_rewrite_matches(
|
||||
// we must insert the entire padding
|
||||
for (i, query_index) in replacement.enumerate() {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let query_index = query_index as u16;
|
||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||
padded_matches.push(match_);
|
||||
}
|
||||
|
@ -11,13 +11,13 @@ pub struct Proximity;
|
||||
impl Criterion for Proximity {
|
||||
fn name(&self) -> &str { "proximity" }
|
||||
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||
documents: &mut [RawDocument<'r, 'tag>],
|
||||
) -> MResult<()>
|
||||
{
|
||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -7,13 +7,13 @@ pub struct Typo;
|
||||
impl Criterion for Typo {
|
||||
fn name(&self) -> &str { "typo" }
|
||||
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||
documents: &mut [RawDocument<'r, 'tag>],
|
||||
) -> MResult<()>
|
||||
{
|
||||
prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists);
|
||||
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -7,13 +7,13 @@ pub struct Words;
|
||||
impl Criterion for Words {
|
||||
fn name(&self) -> &str { "words" }
|
||||
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||
documents: &mut [RawDocument<'r, 'tag>],
|
||||
) -> MResult<()>
|
||||
{
|
||||
prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists);
|
||||
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -9,13 +9,13 @@ pub struct WordsPosition;
|
||||
impl Criterion for WordsPosition {
|
||||
fn name(&self) -> &str { "words position" }
|
||||
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||
&self,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||
documents: &mut [RawDocument<'r, 'tag>],
|
||||
) -> MResult<()>
|
||||
{
|
||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -141,13 +141,13 @@ impl Database {
|
||||
|
||||
fs::create_dir_all(&main_path)?;
|
||||
let env = heed::EnvOpenOptions::new()
|
||||
.map_size(10 * 1024 * 1024 * 1024) // 10GB
|
||||
.map_size(100 * 1024 * 1024 * 1024) // 100GB
|
||||
.max_dbs(3000)
|
||||
.open(main_path)?;
|
||||
|
||||
fs::create_dir_all(&update_path)?;
|
||||
let update_env = heed::EnvOpenOptions::new()
|
||||
.map_size(10 * 1024 * 1024 * 1024) // 10GB
|
||||
.map_size(100 * 1024 * 1024 * 1024) // 100GB
|
||||
.max_dbs(3000)
|
||||
.open(update_path)?;
|
||||
|
||||
|
@ -10,6 +10,8 @@ mod error;
|
||||
mod levenshtein;
|
||||
mod number;
|
||||
mod query_builder;
|
||||
mod query_tree;
|
||||
mod query_words_mapper;
|
||||
mod ranked_map;
|
||||
mod raw_document;
|
||||
mod reordered_attrs;
|
||||
@ -27,10 +29,15 @@ pub use self::raw_document::RawDocument;
|
||||
pub use self::store::Index;
|
||||
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
|
||||
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
|
||||
pub use query_words_mapper::QueryWordsMapper;
|
||||
|
||||
use std::convert::TryFrom;
|
||||
use std::collections::HashMap;
|
||||
use compact_arena::SmallArena;
|
||||
use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
|
||||
|
||||
use crate::bucket_sort::PostingsListView;
|
||||
use crate::levenshtein::prefix_damerau_levenshtein;
|
||||
use crate::query_tree::{QueryId, QueryKind};
|
||||
use crate::reordered_attrs::ReorderedAttrs;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
@ -44,7 +51,7 @@ pub struct Document {
|
||||
|
||||
fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
||||
raw_document: &RawDocument<'a, 'tag>,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
||||
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
searchable_attrs: Option<&ReorderedAttrs>,
|
||||
) -> Vec<Highlight>
|
||||
@ -54,13 +61,19 @@ fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
||||
for bm in raw_document.bare_matches.iter() {
|
||||
let postings_list = &arena[bm.postings_list];
|
||||
let input = postings_list.input();
|
||||
let query = &automatons[bm.query_index as usize].query;
|
||||
let kind = &queries_kinds.get(&bm.query_index);
|
||||
|
||||
for di in postings_list.iter() {
|
||||
let covered_area = if query.len() > input.len() {
|
||||
input.len()
|
||||
} else {
|
||||
prefix_damerau_levenshtein(query.as_bytes(), input).1
|
||||
let covered_area = match kind {
|
||||
Some(QueryKind::NonTolerant(query)) | Some(QueryKind::Tolerant(query)) => {
|
||||
let len = if query.len() > input.len() {
|
||||
input.len()
|
||||
} else {
|
||||
prefix_damerau_levenshtein(query.as_bytes(), input).1
|
||||
};
|
||||
u16::try_from(len).unwrap_or(u16::max_value())
|
||||
},
|
||||
_ => di.char_length,
|
||||
};
|
||||
|
||||
let attribute = searchable_attrs
|
||||
@ -70,7 +83,7 @@ fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
||||
let highlight = Highlight {
|
||||
attribute: attribute,
|
||||
char_index: di.char_index,
|
||||
char_length: covered_area as u16,
|
||||
char_length: covered_area,
|
||||
};
|
||||
|
||||
highlights.push(highlight);
|
||||
@ -81,17 +94,27 @@ fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
||||
}
|
||||
|
||||
impl Document {
|
||||
#[cfg(not(test))]
|
||||
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
|
||||
Document { id, highlights: highlights.to_owned() }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
|
||||
Document { id, highlights: highlights.to_owned(), matches: Vec::new() }
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub fn from_raw<'a, 'tag, 'txn>(
|
||||
raw_document: RawDocument<'a, 'tag>,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
||||
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
searchable_attrs: Option<&ReorderedAttrs>,
|
||||
) -> Document
|
||||
{
|
||||
let highlights = highlights_from_raw_document(
|
||||
&raw_document,
|
||||
automatons,
|
||||
queries_kinds,
|
||||
arena,
|
||||
searchable_attrs,
|
||||
);
|
||||
@ -102,7 +125,7 @@ impl Document {
|
||||
#[cfg(test)]
|
||||
pub fn from_raw<'a, 'tag, 'txn>(
|
||||
raw_document: RawDocument<'a, 'tag>,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
||||
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
searchable_attrs: Option<&ReorderedAttrs>,
|
||||
) -> Document
|
||||
@ -111,7 +134,7 @@ impl Document {
|
||||
|
||||
let highlights = highlights_from_raw_document(
|
||||
&raw_document,
|
||||
automatons,
|
||||
queries_kinds,
|
||||
arena,
|
||||
searchable_attrs,
|
||||
);
|
||||
|
@ -16,6 +16,8 @@ pub struct QueryBuilder<'c, 'f, 'd> {
|
||||
postings_lists_store: store::PostingsLists,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
synonyms_store: store::Synonyms,
|
||||
prefix_documents_cache_store: store::PrefixDocumentsCache,
|
||||
prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
|
||||
}
|
||||
|
||||
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
@ -24,12 +26,16 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
postings_lists: store::PostingsLists,
|
||||
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||
synonyms: store::Synonyms,
|
||||
prefix_documents_cache: store::PrefixDocumentsCache,
|
||||
prefix_postings_lists_cache: store::PrefixPostingsListsCache,
|
||||
) -> QueryBuilder<'c, 'f, 'd> {
|
||||
QueryBuilder::with_criteria(
|
||||
main,
|
||||
postings_lists,
|
||||
documents_fields_counts,
|
||||
synonyms,
|
||||
prefix_documents_cache,
|
||||
prefix_postings_lists_cache,
|
||||
Criteria::default(),
|
||||
)
|
||||
}
|
||||
@ -39,6 +45,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
postings_lists: store::PostingsLists,
|
||||
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||
synonyms: store::Synonyms,
|
||||
prefix_documents_cache: store::PrefixDocumentsCache,
|
||||
prefix_postings_lists_cache: store::PrefixPostingsListsCache,
|
||||
criteria: Criteria<'c>,
|
||||
) -> QueryBuilder<'c, 'f, 'd> {
|
||||
QueryBuilder {
|
||||
@ -51,6 +59,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
postings_lists_store: postings_lists,
|
||||
documents_fields_counts_store: documents_fields_counts,
|
||||
synonyms_store: synonyms,
|
||||
prefix_documents_cache_store: prefix_documents_cache,
|
||||
prefix_postings_lists_cache_store: prefix_postings_lists_cache,
|
||||
}
|
||||
}
|
||||
|
||||
@ -97,6 +107,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
self.postings_lists_store,
|
||||
self.documents_fields_counts_store,
|
||||
self.synonyms_store,
|
||||
self.prefix_documents_cache_store,
|
||||
self.prefix_postings_lists_cache_store,
|
||||
),
|
||||
None => bucket_sort(
|
||||
reader,
|
||||
@ -109,6 +121,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
self.postings_lists_store,
|
||||
self.documents_fields_counts_store,
|
||||
self.synonyms_store,
|
||||
self.prefix_documents_cache_store,
|
||||
self.prefix_postings_lists_cache_store,
|
||||
),
|
||||
}
|
||||
}
|
||||
@ -206,7 +220,7 @@ mod tests {
|
||||
let db = &self.database;
|
||||
let mut writer = db.main_write_txn().unwrap();
|
||||
|
||||
let word = word.to_lowercase();
|
||||
let word = normalize_str(word);
|
||||
|
||||
let alternatives = match self
|
||||
.index
|
||||
@ -355,82 +369,82 @@ mod tests {
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prefix_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||
// #[test]
|
||||
// fn prefix_synonyms() {
|
||||
// let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||
|
||||
store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
|
||||
store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
|
||||
// store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
|
||||
// store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
|
||||
|
||||
let db = &store.database;
|
||||
let reader = db.main_read_txn().unwrap();
|
||||
// let db = &store.database;
|
||||
// let reader = db.main_read_txn().unwrap();
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "sal", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
// let builder = store.query_builder();
|
||||
// let results = builder.query(&reader, "sal", 0..20).unwrap();
|
||||
// let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut matches = matches.into_iter();
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||
assert_matches!(matches.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
// let mut matches = matches.into_iter();
|
||||
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||
// assert_matches!(matches.next(), None);
|
||||
// });
|
||||
// assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "bonj", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
// let builder = store.query_builder();
|
||||
// let results = builder.query(&reader, "bonj", 0..20).unwrap();
|
||||
// let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut matches = matches.into_iter();
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||
assert_matches!(matches.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
// let mut matches = matches.into_iter();
|
||||
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||
// assert_matches!(matches.next(), None);
|
||||
// });
|
||||
// assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "sal blabla", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
// let builder = store.query_builder();
|
||||
// let results = builder.query(&reader, "sal blabla", 0..20).unwrap();
|
||||
// let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), None);
|
||||
// assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "bonj blabla", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
// let builder = store.query_builder();
|
||||
// let results = builder.query(&reader, "bonj blabla", 0..20).unwrap();
|
||||
// let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
// assert_matches!(iter.next(), None);
|
||||
// }
|
||||
|
||||
#[test]
|
||||
fn levenshtein_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||
// #[test]
|
||||
// fn levenshtein_synonyms() {
|
||||
// let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||
|
||||
store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));
|
||||
// store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));
|
||||
|
||||
let db = &store.database;
|
||||
let reader = db.main_read_txn().unwrap();
|
||||
// let db = &store.database;
|
||||
// let reader = db.main_read_txn().unwrap();
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "salutution", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
// let builder = store.query_builder();
|
||||
// let results = builder.query(&reader, "salutution", 0..20).unwrap();
|
||||
// let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut matches = matches.into_iter();
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||
assert_matches!(matches.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
// let mut matches = matches.into_iter();
|
||||
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||
// assert_matches!(matches.next(), None);
|
||||
// });
|
||||
// assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "saluttion", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
// let builder = store.query_builder();
|
||||
// let results = builder.query(&reader, "saluttion", 0..20).unwrap();
|
||||
// let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut matches = matches.into_iter();
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||
assert_matches!(matches.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
// let mut matches = matches.into_iter();
|
||||
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||
// assert_matches!(matches.next(), None);
|
||||
// });
|
||||
// assert_matches!(iter.next(), None);
|
||||
// }
|
||||
|
||||
#[test]
|
||||
fn harder_synonyms() {
|
||||
@ -541,19 +555,19 @@ mod tests {
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY ± new
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY ± york
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY ± city
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None);
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NY ± new
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NY ± york
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NY ± city
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
@ -563,19 +577,19 @@ mod tests {
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC ± new
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC ± york
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC ± city
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None);
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NYC ± new
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NYC ± york
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NYC ± city
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
@ -667,11 +681,11 @@ mod tests {
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway
|
||||
assert_matches!(matches.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut matches = matches.into_iter();
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway
|
||||
assert_matches!(matches.next(), None);
|
||||
});
|
||||
// assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
// let mut matches = matches.into_iter();
|
||||
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway
|
||||
// assert_matches!(matches.next(), None);
|
||||
// });
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = store.query_builder();
|
||||
@ -731,7 +745,7 @@ mod tests {
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
@ -739,7 +753,7 @@ mod tests {
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
@ -811,15 +825,6 @@ mod tests {
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = store.query_builder();
|
||||
@ -831,19 +836,19 @@ mod tests {
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC
|
||||
// because one-word to one-word ^^^^
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway
|
||||
assert_matches!(iter.next(), None);
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway
|
||||
assert_matches!(iter.next(), None); // position rewritten ^
|
||||
// because one-word to one-word ^^^^
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // subway = underground
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // subway = train
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
@ -906,15 +911,6 @@ mod tests {
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = store.query_builder();
|
||||
@ -929,29 +925,18 @@ mod tests {
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 2, is_exact: true, .. })); // underground
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // train
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 4, is_exact: true, .. })); // broken
|
||||
assert_matches!(matches.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
@ -978,15 +963,12 @@ mod tests {
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut matches = matches.into_iter();
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
||||
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
||||
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city
|
||||
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big
|
||||
assert_matches!(matches.next(), None);
|
||||
});
|
||||
@ -1017,7 +999,7 @@ mod tests {
|
||||
let mut matches = matches.into_iter();
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(matches.next(), None);
|
||||
@ -1025,9 +1007,9 @@ mod tests {
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
|
||||
let mut matches = matches.into_iter();
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
|
||||
assert_matches!(matches.next(), None);
|
||||
@ -1161,7 +1143,8 @@ mod tests {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); // phone
|
||||
// assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); "phone"
|
||||
// but no typo on first letter ^^^^^^^
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
@ -1271,73 +1254,4 @@ mod tests {
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn searchable_attributes() {
|
||||
let store = TempDatabase::from_iter(vec![
|
||||
("search", &[doc_attr_index(0, 0, 0)][..]),
|
||||
("engine", &[doc_attr_index(0, 0, 1)][..]),
|
||||
|
||||
("search", &[doc_attr_index(1, 1, 0)][..]),
|
||||
("engine", &[doc_attr_index(1, 1, 1)][..]),
|
||||
]);
|
||||
|
||||
let db = &store.database;
|
||||
let reader = db.main_read_txn().unwrap();
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "search engine", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
// reorderer the searchable attributes
|
||||
let mut builder = store.query_builder();
|
||||
builder.add_searchable_attribute(1);
|
||||
builder.add_searchable_attribute(0);
|
||||
|
||||
let results = builder.query(&reader, "search engine", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
// remove a searchable attributes
|
||||
let mut builder = store.query_builder();
|
||||
builder.add_searchable_attribute(1);
|
||||
|
||||
let results = builder.query(&reader, "search engine", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
|
||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
}
|
||||
|
558
meilisearch-core/src/query_tree.rs
Normal file
558
meilisearch-core/src/query_tree.rs
Normal file
@ -0,0 +1,558 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::ops::Range;
|
||||
use std::time::Instant;
|
||||
use std::{cmp, fmt, iter::once};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use itertools::{EitherOrBoth, merge_join_by};
|
||||
use meilisearch_tokenizer::split_query_string;
|
||||
use sdset::{Set, SetBuf, SetOperation};
|
||||
use log::debug;
|
||||
|
||||
use crate::database::MainT;
|
||||
use crate::{store, DocumentId, DocIndex, MResult};
|
||||
use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||
use crate::QueryWordsMapper;
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum Operation {
|
||||
And(Vec<Operation>),
|
||||
Or(Vec<Operation>),
|
||||
Query(Query),
|
||||
}
|
||||
|
||||
impl fmt::Debug for Operation {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result {
|
||||
match op {
|
||||
Operation::And(children) => {
|
||||
writeln!(f, "{:1$}AND", "", depth * 2)?;
|
||||
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
||||
},
|
||||
Operation::Or(children) => {
|
||||
writeln!(f, "{:1$}OR", "", depth * 2)?;
|
||||
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
||||
},
|
||||
Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
|
||||
}
|
||||
}
|
||||
|
||||
pprint_tree(f, self, 0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Operation {
|
||||
fn tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
|
||||
Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::Tolerant(s.to_string()) })
|
||||
}
|
||||
|
||||
fn non_tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
|
||||
Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::NonTolerant(s.to_string()) })
|
||||
}
|
||||
|
||||
fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation {
|
||||
let kind = QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]);
|
||||
Operation::Query(Query { id, prefix, exact: true, kind })
|
||||
}
|
||||
}
|
||||
|
||||
pub type QueryId = usize;
|
||||
|
||||
#[derive(Clone, Eq)]
|
||||
pub struct Query {
|
||||
pub id: QueryId,
|
||||
pub prefix: bool,
|
||||
pub exact: bool,
|
||||
pub kind: QueryKind,
|
||||
}
|
||||
|
||||
impl PartialEq for Query {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.prefix == other.prefix && self.kind == other.kind
|
||||
}
|
||||
}
|
||||
|
||||
impl Hash for Query {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.prefix.hash(state);
|
||||
self.kind.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum QueryKind {
|
||||
Tolerant(String),
|
||||
NonTolerant(String),
|
||||
Phrase(Vec<String>),
|
||||
}
|
||||
|
||||
impl fmt::Debug for Query {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let Query { id, prefix, kind, .. } = self;
|
||||
let prefix = if *prefix { String::from("Prefix") } else { String::default() };
|
||||
match kind {
|
||||
QueryKind::NonTolerant(word) => {
|
||||
f.debug_struct(&(prefix + "NonTolerant")).field("id", &id).field("word", &word).finish()
|
||||
},
|
||||
QueryKind::Tolerant(word) => {
|
||||
f.debug_struct(&(prefix + "Tolerant")).field("id", &id).field("word", &word).finish()
|
||||
},
|
||||
QueryKind::Phrase(words) => {
|
||||
f.debug_struct(&(prefix + "Phrase")).field("id", &id).field("words", &words).finish()
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct PostingsList {
|
||||
docids: SetBuf<DocumentId>,
|
||||
matches: SetBuf<DocIndex>,
|
||||
}
|
||||
|
||||
pub struct Context {
|
||||
pub words_set: fst::Set,
|
||||
pub synonyms: store::Synonyms,
|
||||
pub postings_lists: store::PostingsLists,
|
||||
pub prefix_postings_lists: store::PrefixPostingsListsCache,
|
||||
}
|
||||
|
||||
fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'a str) -> MResult<Option<(&'a str, &'a str)>> {
|
||||
let chars = word.char_indices().skip(1);
|
||||
let mut best = None;
|
||||
|
||||
for (i, _) in chars {
|
||||
let (left, right) = word.split_at(i);
|
||||
|
||||
let left_freq = ctx.postings_lists
|
||||
.postings_list(reader, left.as_bytes())?
|
||||
.map(|p| p.docids.len())
|
||||
.unwrap_or(0);
|
||||
let right_freq = ctx.postings_lists
|
||||
.postings_list(reader, right.as_bytes())?
|
||||
.map(|p| p.docids.len())
|
||||
.unwrap_or(0);
|
||||
|
||||
let min_freq = cmp::min(left_freq, right_freq);
|
||||
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||
best = Some((min_freq, left, right));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, l, r)| (l, r)))
|
||||
}
|
||||
|
||||
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
|
||||
let words = normalize_str(&words.join(" "));
|
||||
let set = ctx.synonyms.synonyms(reader, words.as_bytes())?.unwrap_or_default();
|
||||
|
||||
let mut strings = Vec::new();
|
||||
let mut stream = set.stream();
|
||||
while let Some(input) = stream.next() {
|
||||
if let Ok(input) = std::str::from_utf8(input) {
|
||||
let alts = input.split_ascii_whitespace().map(ToOwned::to_owned).collect();
|
||||
strings.push(alts);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(strings)
|
||||
}
|
||||
|
||||
fn create_operation<I, F>(iter: I, f: F) -> Operation
|
||||
where I: IntoIterator<Item=Operation>,
|
||||
F: Fn(Vec<Operation>) -> Operation,
|
||||
{
|
||||
let mut iter = iter.into_iter();
|
||||
match (iter.next(), iter.next()) {
|
||||
(Some(first), None) => first,
|
||||
(first, second) => f(first.into_iter().chain(second).chain(iter).collect()),
|
||||
}
|
||||
}
|
||||
|
||||
const MAX_NGRAM: usize = 3;
|
||||
|
||||
pub fn create_query_tree(
|
||||
reader: &heed::RoTxn<MainT>,
|
||||
ctx: &Context,
|
||||
query: &str,
|
||||
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
||||
{
|
||||
let words = split_query_string(query).map(str::to_lowercase);
|
||||
let words: Vec<_> = words.into_iter().enumerate().collect();
|
||||
|
||||
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
||||
|
||||
fn create_inner(
|
||||
reader: &heed::RoTxn<MainT>,
|
||||
ctx: &Context,
|
||||
mapper: &mut QueryWordsMapper,
|
||||
words: &[(usize, String)],
|
||||
) -> MResult<Vec<Operation>>
|
||||
{
|
||||
let mut alts = Vec::new();
|
||||
|
||||
for ngram in 1..=MAX_NGRAM {
|
||||
if let Some(group) = words.get(..ngram) {
|
||||
let mut group_ops = Vec::new();
|
||||
|
||||
let tail = &words[ngram..];
|
||||
let is_last = tail.is_empty();
|
||||
|
||||
let mut group_alts = Vec::new();
|
||||
match group {
|
||||
[(id, word)] => {
|
||||
let mut idgen = ((id + 1) * 100)..;
|
||||
let range = (*id)..id+1;
|
||||
|
||||
let phrase = split_best_frequency(reader, ctx, word)?
|
||||
.map(|ws| {
|
||||
let id = idgen.next().unwrap();
|
||||
idgen.next().unwrap();
|
||||
mapper.declare(range.clone(), id, &[ws.0, ws.1]);
|
||||
Operation::phrase2(id, is_last, ws)
|
||||
});
|
||||
|
||||
let synonyms = fetch_synonyms(reader, ctx, &[word])?
|
||||
.into_iter()
|
||||
.map(|alts| {
|
||||
let exact = alts.len() == 1;
|
||||
let id = idgen.next().unwrap();
|
||||
mapper.declare(range.clone(), id, &alts);
|
||||
|
||||
let mut idgen = once(id).chain(&mut idgen);
|
||||
let iter = alts.into_iter().map(|w| {
|
||||
let id = idgen.next().unwrap();
|
||||
let kind = QueryKind::NonTolerant(w);
|
||||
Operation::Query(Query { id, prefix: false, exact, kind })
|
||||
});
|
||||
|
||||
create_operation(iter, Operation::And)
|
||||
});
|
||||
|
||||
let original = Operation::tolerant(*id, is_last, word);
|
||||
|
||||
group_alts.push(original);
|
||||
group_alts.extend(synonyms.chain(phrase));
|
||||
},
|
||||
words => {
|
||||
let id = words[0].0;
|
||||
let mut idgen = ((id + 1) * 100_usize.pow(ngram as u32))..;
|
||||
let range = id..id+ngram;
|
||||
|
||||
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
|
||||
|
||||
for synonym in fetch_synonyms(reader, ctx, &words)? {
|
||||
let exact = synonym.len() == 1;
|
||||
let id = idgen.next().unwrap();
|
||||
mapper.declare(range.clone(), id, &synonym);
|
||||
|
||||
let mut idgen = once(id).chain(&mut idgen);
|
||||
let synonym = synonym.into_iter().map(|s| {
|
||||
let id = idgen.next().unwrap();
|
||||
let kind = QueryKind::NonTolerant(s);
|
||||
Operation::Query(Query { id, prefix: false, exact, kind })
|
||||
});
|
||||
group_alts.push(create_operation(synonym, Operation::And));
|
||||
}
|
||||
|
||||
let id = idgen.next().unwrap();
|
||||
let concat = words.concat();
|
||||
mapper.declare(range.clone(), id, &[&concat]);
|
||||
group_alts.push(Operation::non_tolerant(id, is_last, &concat));
|
||||
}
|
||||
}
|
||||
|
||||
group_ops.push(create_operation(group_alts, Operation::Or));
|
||||
|
||||
if !tail.is_empty() {
|
||||
let tail_ops = create_inner(reader, ctx, mapper, tail)?;
|
||||
group_ops.push(create_operation(tail_ops, Operation::Or));
|
||||
}
|
||||
|
||||
alts.push(create_operation(group_ops, Operation::And));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(alts)
|
||||
}
|
||||
|
||||
let alternatives = create_inner(reader, ctx, &mut mapper, &words)?;
|
||||
let operation = Operation::Or(alternatives);
|
||||
let mapping = mapper.mapping();
|
||||
|
||||
Ok((operation, mapping))
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct PostingsKey<'o> {
|
||||
pub query: &'o Query,
|
||||
pub input: Vec<u8>,
|
||||
pub distance: u8,
|
||||
pub is_exact: bool,
|
||||
}
|
||||
|
||||
pub type Postings<'o, 'txn> = HashMap<PostingsKey<'o>, Cow<'txn, Set<DocIndex>>>;
|
||||
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
|
||||
|
||||
pub struct QueryResult<'o, 'txn> {
|
||||
pub docids: Cow<'txn, Set<DocumentId>>,
|
||||
pub queries: Postings<'o, 'txn>,
|
||||
}
|
||||
|
||||
pub fn traverse_query_tree<'o, 'txn>(
|
||||
reader: &'txn heed::RoTxn<MainT>,
|
||||
ctx: &Context,
|
||||
tree: &'o Operation,
|
||||
) -> MResult<QueryResult<'o, 'txn>>
|
||||
{
|
||||
fn execute_and<'o, 'txn>(
|
||||
reader: &'txn heed::RoTxn<MainT>,
|
||||
ctx: &Context,
|
||||
cache: &mut Cache<'o, 'txn>,
|
||||
postings: &mut Postings<'o, 'txn>,
|
||||
depth: usize,
|
||||
operations: &'o [Operation],
|
||||
) -> MResult<Cow<'txn, Set<DocumentId>>>
|
||||
{
|
||||
debug!("{:1$}AND", "", depth * 2);
|
||||
|
||||
let before = Instant::now();
|
||||
let mut results = Vec::new();
|
||||
|
||||
for op in operations {
|
||||
if cache.get(op).is_none() {
|
||||
let docids = match op {
|
||||
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
|
||||
};
|
||||
cache.insert(op, docids);
|
||||
}
|
||||
}
|
||||
|
||||
for op in operations {
|
||||
if let Some(docids) = cache.get(op) {
|
||||
results.push(docids.as_ref());
|
||||
}
|
||||
}
|
||||
|
||||
let op = sdset::multi::Intersection::new(results);
|
||||
let docids = op.into_set_buf();
|
||||
|
||||
debug!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
||||
|
||||
Ok(Cow::Owned(docids))
|
||||
}
|
||||
|
||||
fn execute_or<'o, 'txn>(
|
||||
reader: &'txn heed::RoTxn<MainT>,
|
||||
ctx: &Context,
|
||||
cache: &mut Cache<'o, 'txn>,
|
||||
postings: &mut Postings<'o, 'txn>,
|
||||
depth: usize,
|
||||
operations: &'o [Operation],
|
||||
) -> MResult<Cow<'txn, Set<DocumentId>>>
|
||||
{
|
||||
debug!("{:1$}OR", "", depth * 2);
|
||||
|
||||
let before = Instant::now();
|
||||
let mut results = Vec::new();
|
||||
|
||||
for op in operations {
|
||||
if cache.get(op).is_none() {
|
||||
let docids = match op {
|
||||
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
|
||||
};
|
||||
cache.insert(op, docids);
|
||||
}
|
||||
}
|
||||
|
||||
for op in operations {
|
||||
if let Some(docids) = cache.get(op) {
|
||||
results.push(docids.as_ref());
|
||||
}
|
||||
}
|
||||
|
||||
let op = sdset::multi::Union::new(results);
|
||||
let docids = op.into_set_buf();
|
||||
|
||||
debug!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
||||
|
||||
Ok(Cow::Owned(docids))
|
||||
}
|
||||
|
||||
fn execute_query<'o, 'txn>(
|
||||
reader: &'txn heed::RoTxn<MainT>,
|
||||
ctx: &Context,
|
||||
postings: &mut Postings<'o, 'txn>,
|
||||
depth: usize,
|
||||
query: &'o Query,
|
||||
) -> MResult<Cow<'txn, Set<DocumentId>>>
|
||||
{
|
||||
let before = Instant::now();
|
||||
|
||||
let Query { prefix, kind, exact, .. } = query;
|
||||
let docids: Cow<Set<_>> = match kind {
|
||||
QueryKind::Tolerant(word) => {
|
||||
if *prefix && word.len() <= 2 {
|
||||
let prefix = {
|
||||
let mut array = [0; 4];
|
||||
let bytes = word.as_bytes();
|
||||
array[..bytes.len()].copy_from_slice(bytes);
|
||||
array
|
||||
};
|
||||
|
||||
// We retrieve the cached postings lists for all
|
||||
// the words that starts with this short prefix.
|
||||
let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
|
||||
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: false };
|
||||
postings.insert(key, result.matches);
|
||||
let prefix_docids = &result.docids;
|
||||
|
||||
// We retrieve the exact postings list for the prefix,
|
||||
// because we must consider these matches as exact.
|
||||
let result = ctx.postings_lists.postings_list(reader, word.as_bytes())?.unwrap_or_default();
|
||||
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: true };
|
||||
postings.insert(key, result.matches);
|
||||
let exact_docids = &result.docids;
|
||||
|
||||
let before = Instant::now();
|
||||
let docids = sdset::duo::Union::new(prefix_docids, exact_docids).into_set_buf();
|
||||
debug!("{:4$}prefix docids ({} and {}) construction took {:.02?}",
|
||||
"", prefix_docids.len(), exact_docids.len(), before.elapsed(), depth * 2);
|
||||
|
||||
Cow::Owned(docids)
|
||||
|
||||
} else {
|
||||
let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) };
|
||||
|
||||
let byte = word.as_bytes()[0];
|
||||
let mut stream = if byte == u8::max_value() {
|
||||
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
|
||||
} else {
|
||||
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
|
||||
};
|
||||
|
||||
let before = Instant::now();
|
||||
let mut results = Vec::new();
|
||||
while let Some(input) = stream.next() {
|
||||
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = *exact && distance == 0 && input.len() == word.len();
|
||||
results.push(result.docids);
|
||||
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
|
||||
postings.insert(key, result.matches);
|
||||
}
|
||||
}
|
||||
debug!("{:3$}docids retrieval ({:?}) took {:.02?}", "", results.len(), before.elapsed(), depth * 2);
|
||||
|
||||
let before = Instant::now();
|
||||
let docids = if results.len() > 10 {
|
||||
let cap = results.iter().map(|dis| dis.len()).sum();
|
||||
let mut docids = Vec::with_capacity(cap);
|
||||
for dis in results {
|
||||
docids.extend_from_slice(&dis);
|
||||
}
|
||||
SetBuf::from_dirty(docids)
|
||||
} else {
|
||||
let sets = results.iter().map(AsRef::as_ref).collect();
|
||||
sdset::multi::Union::new(sets).into_set_buf()
|
||||
};
|
||||
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
||||
|
||||
Cow::Owned(docids)
|
||||
}
|
||||
},
|
||||
QueryKind::NonTolerant(word) => {
|
||||
// TODO support prefix and non-prefix exact DFA
|
||||
let dfa = build_exact_dfa(word);
|
||||
|
||||
let byte = word.as_bytes()[0];
|
||||
let mut stream = if byte == u8::max_value() {
|
||||
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
|
||||
} else {
|
||||
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
|
||||
};
|
||||
|
||||
let before = Instant::now();
|
||||
let mut results = Vec::new();
|
||||
while let Some(input) = stream.next() {
|
||||
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
results.push(result.docids);
|
||||
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: *exact };
|
||||
postings.insert(key, result.matches);
|
||||
}
|
||||
}
|
||||
debug!("{:3$}docids retrieval ({:?}) took {:.02?}", "", results.len(), before.elapsed(), depth * 2);
|
||||
|
||||
let before = Instant::now();
|
||||
let docids = if results.len() > 10 {
|
||||
let cap = results.iter().map(|dis| dis.len()).sum();
|
||||
let mut docids = Vec::with_capacity(cap);
|
||||
for dis in results {
|
||||
docids.extend_from_slice(&dis);
|
||||
}
|
||||
SetBuf::from_dirty(docids)
|
||||
} else {
|
||||
let sets = results.iter().map(AsRef::as_ref).collect();
|
||||
sdset::multi::Union::new(sets).into_set_buf()
|
||||
};
|
||||
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
||||
|
||||
Cow::Owned(docids)
|
||||
},
|
||||
QueryKind::Phrase(words) => {
|
||||
// TODO support prefix and non-prefix exact DFA
|
||||
if let [first, second] = words.as_slice() {
|
||||
let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default();
|
||||
let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default();
|
||||
|
||||
let iter = merge_join_by(first.matches.as_slice(), second.matches.as_slice(), |a, b| {
|
||||
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
|
||||
let y = (b.document_id, b.attribute, b.word_index as u32);
|
||||
x.cmp(&y)
|
||||
});
|
||||
|
||||
let matches: Vec<_> = iter
|
||||
.filter_map(EitherOrBoth::both)
|
||||
.flat_map(|(a, b)| once(*a).chain(Some(*b)))
|
||||
.collect();
|
||||
|
||||
let before = Instant::now();
|
||||
let mut docids: Vec<_> = matches.iter().map(|m| m.document_id).collect();
|
||||
docids.dedup();
|
||||
let docids = SetBuf::new(docids).unwrap();
|
||||
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
||||
|
||||
let matches = Cow::Owned(SetBuf::new(matches).unwrap());
|
||||
let key = PostingsKey { query, input: vec![], distance: 0, is_exact: true };
|
||||
postings.insert(key, matches);
|
||||
|
||||
Cow::Owned(docids)
|
||||
} else {
|
||||
debug!("{:2$}{:?} skipped", "", words, depth * 2);
|
||||
Cow::default()
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
debug!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2);
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
let mut cache = Cache::new();
|
||||
let mut postings = Postings::new();
|
||||
|
||||
let docids = match tree {
|
||||
Operation::And(ops) => execute_and(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
|
||||
Operation::Or(ops) => execute_or(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
|
||||
Operation::Query(query) => execute_query(reader, ctx, &mut postings, 0, &query)?,
|
||||
};
|
||||
|
||||
Ok(QueryResult { docids, queries: postings })
|
||||
}
|
415
meilisearch-core/src/query_words_mapper.rs
Normal file
415
meilisearch-core/src/query_words_mapper.rs
Normal file
@ -0,0 +1,415 @@
|
||||
use std::collections::HashMap;
|
||||
use std::iter::FromIterator;
|
||||
use std::ops::Range;
|
||||
use intervaltree::{Element, IntervalTree};
|
||||
|
||||
pub type QueryId = usize;
|
||||
|
||||
pub struct QueryWordsMapper {
|
||||
originals: Vec<String>,
|
||||
mappings: HashMap<QueryId, (Range<usize>, Vec<String>)>,
|
||||
}
|
||||
|
||||
impl QueryWordsMapper {
|
||||
pub fn new<I, A>(originals: I) -> QueryWordsMapper
|
||||
where I: IntoIterator<Item = A>,
|
||||
A: ToString,
|
||||
{
|
||||
let originals = originals.into_iter().map(|s| s.to_string()).collect();
|
||||
QueryWordsMapper { originals, mappings: HashMap::new() }
|
||||
}
|
||||
|
||||
pub fn declare<I, A>(&mut self, range: Range<usize>, id: QueryId, replacement: I)
|
||||
where I: IntoIterator<Item = A>,
|
||||
A: ToString,
|
||||
{
|
||||
assert!(range.len() != 0);
|
||||
assert!(self.originals.get(range.clone()).is_some());
|
||||
assert!(id >= self.originals.len());
|
||||
|
||||
let replacement: Vec<_> = replacement.into_iter().map(|s| s.to_string()).collect();
|
||||
|
||||
assert!(!replacement.is_empty());
|
||||
|
||||
// We detect words at the end and at the front of the
|
||||
// replacement that are common with the originals:
|
||||
//
|
||||
// x a b c d e f g
|
||||
// ^^^/ \^^^
|
||||
// a b x c d k j e f
|
||||
// ^^^ ^^^
|
||||
//
|
||||
|
||||
let left = &self.originals[..range.start];
|
||||
let right = &self.originals[range.end..];
|
||||
|
||||
let common_left = longest_common_prefix(left, &replacement);
|
||||
let common_right = longest_common_prefix(&replacement, right);
|
||||
|
||||
for i in 0..common_left {
|
||||
let range = range.start - common_left + i..range.start - common_left + i + 1;
|
||||
let replacement = vec![replacement[i].clone()];
|
||||
self.mappings.insert(id + i, (range, replacement));
|
||||
}
|
||||
|
||||
{
|
||||
let replacement = replacement[common_left..replacement.len() - common_right].iter().cloned().collect();
|
||||
self.mappings.insert(id + common_left, (range.clone(), replacement));
|
||||
}
|
||||
|
||||
for i in 0..common_right {
|
||||
let id = id + replacement.len() - common_right + i;
|
||||
let range = range.end + i..range.end + i + 1;
|
||||
let replacement = vec![replacement[replacement.len() - common_right + i].clone()];
|
||||
self.mappings.insert(id, (range, replacement));
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mapping(self) -> HashMap<QueryId, Range<usize>> {
|
||||
let mappings = self.mappings.into_iter().map(|(i, (r, v))| (r, (i, v)));
|
||||
let intervals = IntervalTree::from_iter(mappings);
|
||||
|
||||
let mut output = HashMap::new();
|
||||
let mut offset = 0;
|
||||
|
||||
// We map each original word to the biggest number of
|
||||
// associated words.
|
||||
for i in 0..self.originals.len() {
|
||||
let max = intervals.query_point(i)
|
||||
.filter_map(|e| {
|
||||
if e.range.end - 1 == i {
|
||||
let len = e.value.1.iter().skip(i - e.range.start).count();
|
||||
if len != 0 { Some(len) } else { None }
|
||||
} else { None }
|
||||
})
|
||||
.max()
|
||||
.unwrap_or(1);
|
||||
|
||||
let range = i + offset..i + offset + max;
|
||||
output.insert(i, range);
|
||||
offset += max - 1;
|
||||
}
|
||||
|
||||
// We retrieve the range that each original word
|
||||
// is mapped to and apply it to each of the words.
|
||||
for i in 0..self.originals.len() {
|
||||
|
||||
let iter = intervals.query_point(i).filter(|e| e.range.end - 1 == i);
|
||||
for Element { range, value: (id, words) } in iter {
|
||||
|
||||
// We ask for the complete range mapped to the area we map.
|
||||
let start = output.get(&range.start).map(|r| r.start).unwrap_or(range.start);
|
||||
let end = output.get(&(range.end - 1)).map(|r| r.end).unwrap_or(range.end);
|
||||
let range = start..end;
|
||||
|
||||
// We map each query id to one word until the last,
|
||||
// we map it to the remainings words.
|
||||
let add = range.len() - words.len();
|
||||
for (j, x) in range.take(words.len()).enumerate() {
|
||||
let add = if j == words.len() - 1 { add } else { 0 }; // is last?
|
||||
let range = x..x + 1 + add;
|
||||
output.insert(id + j, range);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
fn longest_common_prefix<T: Eq + std::fmt::Debug>(a: &[T], b: &[T]) -> usize {
|
||||
let mut best = None;
|
||||
for i in (0..a.len()).rev() {
|
||||
let count = a[i..].iter().zip(b).take_while(|(a, b)| a == b).count();
|
||||
best = match best {
|
||||
Some(old) if count > old => Some(count),
|
||||
Some(_) => break,
|
||||
None => Some(count),
|
||||
};
|
||||
}
|
||||
best.unwrap_or(0)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn original_unmodified() {
|
||||
let query = ["new", "york", "city", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// new = new york city
|
||||
builder.declare(0..1, 7, &["new", "york", "city"]);
|
||||
// ^ 7 8 9
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..1); // new
|
||||
assert_eq!(mapping[&1], 1..2); // york
|
||||
assert_eq!(mapping[&2], 2..3); // city
|
||||
assert_eq!(mapping[&3], 3..4); // subway
|
||||
|
||||
assert_eq!(mapping[&4], 0..1); // new
|
||||
assert_eq!(mapping[&5], 1..2); // york
|
||||
assert_eq!(mapping[&6], 2..3); // city
|
||||
|
||||
assert_eq!(mapping[&7], 0..1); // new
|
||||
assert_eq!(mapping[&8], 1..2); // york
|
||||
assert_eq!(mapping[&9], 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn original_unmodified2() {
|
||||
let query = ["new", "york", "city", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// city subway = new york city underground train
|
||||
builder.declare(2..4, 4, &["new", "york", "city", "underground", "train"]);
|
||||
// ^ 4 5 6 7 8
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..1); // new
|
||||
assert_eq!(mapping[&1], 1..2); // york
|
||||
assert_eq!(mapping[&2], 2..3); // city
|
||||
assert_eq!(mapping[&3], 3..5); // subway
|
||||
|
||||
assert_eq!(mapping[&4], 0..1); // new
|
||||
assert_eq!(mapping[&5], 1..2); // york
|
||||
assert_eq!(mapping[&6], 2..3); // city
|
||||
assert_eq!(mapping[&7], 3..4); // underground
|
||||
assert_eq!(mapping[&8], 4..5); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn original_unmodified3() {
|
||||
let query = ["a", "b", "x", "x", "a", "b", "c", "d", "e", "f", "g"];
|
||||
// 0 1 2 3 4 5 6 7 8 9 10
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// c d = a b x c d k j e f
|
||||
builder.declare(6..8, 11, &["a", "b", "x", "c", "d", "k", "j", "e", "f"]);
|
||||
// ^^ 11 12 13 14 15 16 17 18 19
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..1); // a
|
||||
assert_eq!(mapping[&1], 1..2); // b
|
||||
assert_eq!(mapping[&2], 2..3); // x
|
||||
assert_eq!(mapping[&3], 3..4); // x
|
||||
assert_eq!(mapping[&4], 4..5); // a
|
||||
assert_eq!(mapping[&5], 5..6); // b
|
||||
assert_eq!(mapping[&6], 6..7); // c
|
||||
assert_eq!(mapping[&7], 7..11); // d
|
||||
assert_eq!(mapping[&8], 11..12); // e
|
||||
assert_eq!(mapping[&9], 12..13); // f
|
||||
assert_eq!(mapping[&10], 13..14); // g
|
||||
|
||||
assert_eq!(mapping[&11], 4..5); // a
|
||||
assert_eq!(mapping[&12], 5..6); // b
|
||||
assert_eq!(mapping[&13], 6..7); // x
|
||||
assert_eq!(mapping[&14], 7..8); // c
|
||||
assert_eq!(mapping[&15], 8..9); // d
|
||||
assert_eq!(mapping[&16], 9..10); // k
|
||||
assert_eq!(mapping[&17], 10..11); // j
|
||||
assert_eq!(mapping[&18], 11..12); // e
|
||||
assert_eq!(mapping[&19], 12..13); // f
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn simple_growing() {
|
||||
let query = ["new", "york", "subway"];
|
||||
// 0 1 2
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 3, &["new", "york", "city"]);
|
||||
// ^ 3 4 5
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..1); // new
|
||||
assert_eq!(mapping[&1], 1..3); // york
|
||||
assert_eq!(mapping[&2], 3..4); // subway
|
||||
assert_eq!(mapping[&3], 0..1); // new
|
||||
assert_eq!(mapping[&4], 1..2); // york
|
||||
assert_eq!(mapping[&5], 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn same_place_growings() {
|
||||
let query = ["NY", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// NY = new york
|
||||
builder.declare(0..1, 2, &["new", "york"]);
|
||||
// ^ 2 3
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// NY = NYC
|
||||
builder.declare(0..1, 7, &["NYC"]);
|
||||
// ^ 7
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 8, &["new", "york", "city"]);
|
||||
// ^ 8 9 10
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(1..2, 11, &["underground", "train"]);
|
||||
// ^ 11 12
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..3); // NY
|
||||
assert_eq!(mapping[&1], 3..5); // subway
|
||||
assert_eq!(mapping[&2], 0..1); // new
|
||||
assert_eq!(mapping[&3], 1..3); // york
|
||||
assert_eq!(mapping[&4], 0..1); // new
|
||||
assert_eq!(mapping[&5], 1..2); // york
|
||||
assert_eq!(mapping[&6], 2..3); // city
|
||||
assert_eq!(mapping[&7], 0..3); // NYC
|
||||
assert_eq!(mapping[&8], 0..1); // new
|
||||
assert_eq!(mapping[&9], 1..2); // york
|
||||
assert_eq!(mapping[&10], 2..3); // city
|
||||
assert_eq!(mapping[&11], 3..4); // underground
|
||||
assert_eq!(mapping[&12], 4..5); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bigger_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(0..1, 2, &["new", "york", "city"]);
|
||||
// ^ 2 3 4
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..3); // NYC
|
||||
assert_eq!(mapping[&1], 3..4); // subway
|
||||
assert_eq!(mapping[&2], 0..1); // new
|
||||
assert_eq!(mapping[&3], 1..2); // york
|
||||
assert_eq!(mapping[&4], 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn middle_query_growing() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..1); // great
|
||||
assert_eq!(mapping[&1], 1..2); // awesome
|
||||
assert_eq!(mapping[&2], 2..5); // NYC
|
||||
assert_eq!(mapping[&3], 5..6); // subway
|
||||
assert_eq!(mapping[&4], 2..3); // new
|
||||
assert_eq!(mapping[&5], 3..4); // york
|
||||
assert_eq!(mapping[&6], 4..5); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn end_query_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(1..2, 2, &["underground", "train"]);
|
||||
// ^ 2 3
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..1); // NYC
|
||||
assert_eq!(mapping[&1], 1..3); // subway
|
||||
assert_eq!(mapping[&2], 1..2); // underground
|
||||
assert_eq!(mapping[&3], 2..3); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..1); // great
|
||||
assert_eq!(mapping[&1], 1..2); // awesome
|
||||
assert_eq!(mapping[&2], 2..5); // NYC
|
||||
assert_eq!(mapping[&3], 5..7); // subway
|
||||
assert_eq!(mapping[&4], 2..3); // new
|
||||
assert_eq!(mapping[&5], 3..4); // york
|
||||
assert_eq!(mapping[&6], 4..5); // city
|
||||
assert_eq!(mapping[&7], 5..6); // underground
|
||||
assert_eq!(mapping[&8], 6..7); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_probable_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryWordsMapper::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
// great awesome = good
|
||||
builder.declare(0..2, 9, &["good"]);
|
||||
// ^ 9
|
||||
|
||||
// awesome NYC = NY
|
||||
builder.declare(1..3, 10, &["NY"]);
|
||||
// ^^ 10
|
||||
|
||||
// NYC subway = metro
|
||||
builder.declare(2..4, 11, &["metro"]);
|
||||
// ^^ 11
|
||||
|
||||
let mapping = builder.mapping();
|
||||
|
||||
assert_eq!(mapping[&0], 0..1); // great
|
||||
assert_eq!(mapping[&1], 1..2); // awesome
|
||||
assert_eq!(mapping[&2], 2..5); // NYC
|
||||
assert_eq!(mapping[&3], 5..7); // subway
|
||||
assert_eq!(mapping[&4], 2..3); // new
|
||||
assert_eq!(mapping[&5], 3..4); // york
|
||||
assert_eq!(mapping[&6], 4..5); // city
|
||||
assert_eq!(mapping[&7], 5..6); // underground
|
||||
assert_eq!(mapping[&8], 6..7); // train
|
||||
assert_eq!(mapping[&9], 0..2); // good
|
||||
assert_eq!(mapping[&10], 1..5); // NY
|
||||
assert_eq!(mapping[&11], 2..7); // metro
|
||||
}
|
||||
}
|
@ -1,8 +1,7 @@
|
||||
use compact_arena::SmallArena;
|
||||
use itertools::EitherOrBoth;
|
||||
use sdset::SetBuf;
|
||||
use crate::DocIndex;
|
||||
use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
|
||||
use crate::bucket_sort::{SimpleMatch, BareMatch, PostingsListView};
|
||||
use crate::reordered_attrs::ReorderedAttrs;
|
||||
|
||||
pub struct RawDocument<'a, 'tag> {
|
||||
@ -19,10 +18,9 @@ pub struct RawDocument<'a, 'tag> {
|
||||
impl<'a, 'tag> RawDocument<'a, 'tag> {
|
||||
pub fn new<'txn>(
|
||||
bare_matches: &'a mut [BareMatch<'tag>],
|
||||
automatons: &[QueryWordAutomaton],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
searchable_attrs: Option<&ReorderedAttrs>,
|
||||
) -> Option<RawDocument<'a, 'tag>>
|
||||
) -> RawDocument<'a, 'tag>
|
||||
{
|
||||
if let Some(reordered_attrs) = searchable_attrs {
|
||||
for bm in bare_matches.iter() {
|
||||
@ -42,70 +40,12 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
|
||||
|
||||
bare_matches.sort_unstable_by_key(|m| m.query_index);
|
||||
|
||||
let mut previous_word = None;
|
||||
for i in 0..bare_matches.len() {
|
||||
let a = &bare_matches[i];
|
||||
let auta = &automatons[a.query_index as usize];
|
||||
|
||||
match auta.phrase_query {
|
||||
Some((0, _)) => {
|
||||
let b = match bare_matches.get(i + 1) {
|
||||
Some(b) => b,
|
||||
None => {
|
||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if a.query_index + 1 != b.query_index {
|
||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
||||
continue
|
||||
}
|
||||
|
||||
let pla = &postings_lists[a.postings_list];
|
||||
let plb = &postings_lists[b.postings_list];
|
||||
|
||||
let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
|
||||
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
|
||||
});
|
||||
|
||||
let mut newa = Vec::new();
|
||||
let mut newb = Vec::new();
|
||||
|
||||
for eb in iter {
|
||||
if let EitherOrBoth::Both(a, b) = eb {
|
||||
newa.push(*a);
|
||||
newb.push(*b);
|
||||
}
|
||||
}
|
||||
|
||||
if !newa.is_empty() {
|
||||
previous_word = Some(a.query_index);
|
||||
}
|
||||
|
||||
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
|
||||
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
|
||||
},
|
||||
Some((1, _)) => {
|
||||
if previous_word.take() != Some(a.query_index - 1) {
|
||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
||||
}
|
||||
},
|
||||
Some((_, _)) => unreachable!(),
|
||||
None => (),
|
||||
}
|
||||
}
|
||||
|
||||
if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
|
||||
return None
|
||||
}
|
||||
|
||||
Some(RawDocument {
|
||||
RawDocument {
|
||||
id: bare_matches[0].document_id,
|
||||
bare_matches,
|
||||
processed_matches: Vec::new(),
|
||||
processed_distances: Vec::new(),
|
||||
contains_one_word_field: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -67,6 +67,17 @@ impl Main {
|
||||
self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, bytes)
|
||||
}
|
||||
|
||||
pub unsafe fn static_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
|
||||
match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
|
||||
Some(bytes) => {
|
||||
let bytes: &'static [u8] = std::mem::transmute(bytes);
|
||||
let set = fst::Set::from_static_slice(bytes).unwrap();
|
||||
Ok(Some(set))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
|
||||
match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
|
||||
Some(bytes) => {
|
||||
|
@ -1,4 +1,6 @@
|
||||
mod docs_words;
|
||||
mod prefix_documents_cache;
|
||||
mod prefix_postings_lists_cache;
|
||||
mod documents_fields;
|
||||
mod documents_fields_counts;
|
||||
mod main;
|
||||
@ -8,6 +10,8 @@ mod updates;
|
||||
mod updates_results;
|
||||
|
||||
pub use self::docs_words::DocsWords;
|
||||
pub use self::prefix_documents_cache::PrefixDocumentsCache;
|
||||
pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache;
|
||||
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
|
||||
pub use self::documents_fields_counts::{
|
||||
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
|
||||
@ -18,10 +22,15 @@ pub use self::synonyms::Synonyms;
|
||||
pub use self::updates::Updates;
|
||||
pub use self::updates_results::UpdatesResults;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use std::convert::TryInto;
|
||||
use std::{mem, ptr};
|
||||
|
||||
use heed::Result as ZResult;
|
||||
use heed::{BytesEncode, BytesDecode};
|
||||
use meilisearch_schema::{Schema, SchemaAttr};
|
||||
use sdset::{Set, SetBuf};
|
||||
use serde::de::{self, Deserialize};
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
|
||||
@ -29,7 +38,7 @@ use crate::criterion::Criteria;
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::database::{MainT, UpdateT};
|
||||
use crate::serde::Deserializer;
|
||||
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
|
||||
use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
|
||||
|
||||
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
||||
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
||||
@ -50,6 +59,87 @@ impl DocumentAttrKey {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
pub struct Postings<'a> {
|
||||
pub docids: Cow<'a, Set<DocumentId>>,
|
||||
pub matches: Cow<'a, Set<DocIndex>>,
|
||||
}
|
||||
|
||||
pub struct PostingsCodec;
|
||||
|
||||
impl<'a> BytesEncode<'a> for PostingsCodec {
|
||||
type EItem = Postings<'a>;
|
||||
|
||||
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
|
||||
let u64_size = mem::size_of::<u64>();
|
||||
let docids_size = item.docids.len() * mem::size_of::<DocumentId>();
|
||||
let matches_size = item.matches.len() * mem::size_of::<DocIndex>();
|
||||
|
||||
let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
|
||||
|
||||
let docids_len = item.docids.len();
|
||||
buffer.extend_from_slice(&docids_len.to_be_bytes());
|
||||
buffer.extend_from_slice(item.docids.as_bytes());
|
||||
buffer.extend_from_slice(item.matches.as_bytes());
|
||||
|
||||
Some(Cow::Owned(buffer))
|
||||
}
|
||||
}
|
||||
|
||||
fn aligned_to(bytes: &[u8], align: usize) -> bool {
|
||||
(bytes as *const _ as *const () as usize) % align == 0
|
||||
}
|
||||
|
||||
fn from_bytes_to_set<'a, T: 'a>(bytes: &'a [u8]) -> Option<Cow<'a, Set<T>>>
|
||||
where T: Clone + FromBytes
|
||||
{
|
||||
match zerocopy::LayoutVerified::<_, [T]>::new_slice(bytes) {
|
||||
Some(layout) => Some(Cow::Borrowed(Set::new_unchecked(layout.into_slice()))),
|
||||
None => {
|
||||
let len = bytes.len();
|
||||
let elem_size = mem::size_of::<T>();
|
||||
|
||||
// ensure that it is the alignment that is wrong
|
||||
// and the length is valid
|
||||
if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::<T>()) {
|
||||
let elems = len / elem_size;
|
||||
let mut vec = Vec::<T>::with_capacity(elems);
|
||||
|
||||
unsafe {
|
||||
let dst = vec.as_mut_ptr() as *mut u8;
|
||||
ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len);
|
||||
vec.set_len(elems);
|
||||
}
|
||||
|
||||
return Some(Cow::Owned(SetBuf::new_unchecked(vec)));
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BytesDecode<'a> for PostingsCodec {
|
||||
type DItem = Postings<'a>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let u64_size = mem::size_of::<u64>();
|
||||
let docid_size = mem::size_of::<DocumentId>();
|
||||
|
||||
let (len_bytes, bytes) = bytes.split_at(u64_size);
|
||||
let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)? as usize;
|
||||
let docids_size = docids_len * docid_size;
|
||||
|
||||
let docids_bytes = &bytes[..docids_size];
|
||||
let matches_bytes = &bytes[docids_size..];
|
||||
|
||||
let docids = from_bytes_to_set(docids_bytes)?;
|
||||
let matches = from_bytes_to_set(matches_bytes)?;
|
||||
|
||||
Some(Postings { docids, matches })
|
||||
}
|
||||
}
|
||||
|
||||
fn main_name(name: &str) -> String {
|
||||
format!("store-{}", name)
|
||||
}
|
||||
@ -74,6 +164,14 @@ fn docs_words_name(name: &str) -> String {
|
||||
format!("store-{}-docs-words", name)
|
||||
}
|
||||
|
||||
fn prefix_documents_cache_name(name: &str) -> String {
|
||||
format!("store-{}-prefix-documents-cache", name)
|
||||
}
|
||||
|
||||
fn prefix_postings_lists_cache_name(name: &str) -> String {
|
||||
format!("store-{}-prefix-postings-lists-cache", name)
|
||||
}
|
||||
|
||||
fn updates_name(name: &str) -> String {
|
||||
format!("store-{}-updates", name)
|
||||
}
|
||||
@ -90,6 +188,8 @@ pub struct Index {
|
||||
pub documents_fields_counts: DocumentsFieldsCounts,
|
||||
pub synonyms: Synonyms,
|
||||
pub docs_words: DocsWords,
|
||||
pub prefix_documents_cache: PrefixDocumentsCache,
|
||||
pub prefix_postings_lists_cache: PrefixPostingsListsCache,
|
||||
|
||||
pub updates: Updates,
|
||||
pub updates_results: UpdatesResults,
|
||||
@ -142,7 +242,7 @@ impl Index {
|
||||
|
||||
pub fn schema_update(&self, writer: &mut heed::RwTxn<UpdateT>, schema: Schema) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
update::push_schema_update(writer, self.updates, self.updates_results, schema)
|
||||
update::push_schema_update(writer, self, schema)
|
||||
}
|
||||
|
||||
pub fn customs_update(&self, writer: &mut heed::RwTxn<UpdateT>, customs: Vec<u8>) -> ZResult<u64> {
|
||||
@ -252,6 +352,8 @@ impl Index {
|
||||
self.postings_lists,
|
||||
self.documents_fields_counts,
|
||||
self.synonyms,
|
||||
self.prefix_documents_cache,
|
||||
self.prefix_postings_lists_cache,
|
||||
)
|
||||
}
|
||||
|
||||
@ -264,6 +366,8 @@ impl Index {
|
||||
self.postings_lists,
|
||||
self.documents_fields_counts,
|
||||
self.synonyms,
|
||||
self.prefix_documents_cache,
|
||||
self.prefix_postings_lists_cache,
|
||||
criteria,
|
||||
)
|
||||
}
|
||||
@ -282,6 +386,8 @@ pub fn create(
|
||||
let documents_fields_counts_name = documents_fields_counts_name(name);
|
||||
let synonyms_name = synonyms_name(name);
|
||||
let docs_words_name = docs_words_name(name);
|
||||
let prefix_documents_cache_name = prefix_documents_cache_name(name);
|
||||
let prefix_postings_lists_cache_name = prefix_postings_lists_cache_name(name);
|
||||
let updates_name = updates_name(name);
|
||||
let updates_results_name = updates_results_name(name);
|
||||
|
||||
@ -292,6 +398,8 @@ pub fn create(
|
||||
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
|
||||
let synonyms = env.create_database(Some(&synonyms_name))?;
|
||||
let docs_words = env.create_database(Some(&docs_words_name))?;
|
||||
let prefix_documents_cache = env.create_database(Some(&prefix_documents_cache_name))?;
|
||||
let prefix_postings_lists_cache = env.create_database(Some(&prefix_postings_lists_cache_name))?;
|
||||
let updates = update_env.create_database(Some(&updates_name))?;
|
||||
let updates_results = update_env.create_database(Some(&updates_results_name))?;
|
||||
|
||||
@ -299,11 +407,11 @@ pub fn create(
|
||||
main: Main { main },
|
||||
postings_lists: PostingsLists { postings_lists },
|
||||
documents_fields: DocumentsFields { documents_fields },
|
||||
documents_fields_counts: DocumentsFieldsCounts {
|
||||
documents_fields_counts,
|
||||
},
|
||||
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
||||
synonyms: Synonyms { synonyms },
|
||||
docs_words: DocsWords { docs_words },
|
||||
prefix_postings_lists_cache: PrefixPostingsListsCache { prefix_postings_lists_cache },
|
||||
prefix_documents_cache: PrefixDocumentsCache { prefix_documents_cache },
|
||||
updates: Updates { updates },
|
||||
updates_results: UpdatesResults { updates_results },
|
||||
updates_notifier,
|
||||
@ -323,6 +431,8 @@ pub fn open(
|
||||
let documents_fields_counts_name = documents_fields_counts_name(name);
|
||||
let synonyms_name = synonyms_name(name);
|
||||
let docs_words_name = docs_words_name(name);
|
||||
let prefix_documents_cache_name = prefix_documents_cache_name(name);
|
||||
let prefix_postings_lists_cache_name = prefix_postings_lists_cache_name(name);
|
||||
let updates_name = updates_name(name);
|
||||
let updates_results_name = updates_results_name(name);
|
||||
|
||||
@ -351,6 +461,14 @@ pub fn open(
|
||||
Some(docs_words) => docs_words,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let prefix_documents_cache = match env.open_database(Some(&prefix_documents_cache_name))? {
|
||||
Some(prefix_documents_cache) => prefix_documents_cache,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let prefix_postings_lists_cache = match env.open_database(Some(&prefix_postings_lists_cache_name))? {
|
||||
Some(prefix_postings_lists_cache) => prefix_postings_lists_cache,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let updates = match update_env.open_database(Some(&updates_name))? {
|
||||
Some(updates) => updates,
|
||||
None => return Ok(None),
|
||||
@ -364,11 +482,11 @@ pub fn open(
|
||||
main: Main { main },
|
||||
postings_lists: PostingsLists { postings_lists },
|
||||
documents_fields: DocumentsFields { documents_fields },
|
||||
documents_fields_counts: DocumentsFieldsCounts {
|
||||
documents_fields_counts,
|
||||
},
|
||||
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
||||
synonyms: Synonyms { synonyms },
|
||||
docs_words: DocsWords { docs_words },
|
||||
prefix_documents_cache: PrefixDocumentsCache { prefix_documents_cache },
|
||||
prefix_postings_lists_cache: PrefixPostingsListsCache { prefix_postings_lists_cache },
|
||||
updates: Updates { updates },
|
||||
updates_results: UpdatesResults { updates_results },
|
||||
updates_notifier,
|
||||
@ -387,6 +505,8 @@ pub fn clear(
|
||||
index.documents_fields_counts.clear(writer)?;
|
||||
index.synonyms.clear(writer)?;
|
||||
index.docs_words.clear(writer)?;
|
||||
index.prefix_documents_cache.clear(writer)?;
|
||||
index.prefix_postings_lists_cache.clear(writer)?;
|
||||
index.updates.clear(update_writer)?;
|
||||
index.updates_results.clear(update_writer)?;
|
||||
Ok(())
|
||||
|
@ -1,13 +1,17 @@
|
||||
use crate::DocIndex;
|
||||
use crate::database::MainT;
|
||||
use heed::types::{ByteSlice, CowSlice};
|
||||
use heed::Result as ZResult;
|
||||
use sdset::{Set, SetBuf};
|
||||
use std::borrow::Cow;
|
||||
|
||||
use heed::Result as ZResult;
|
||||
use heed::types::ByteSlice;
|
||||
use sdset::{Set, SetBuf};
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::database::MainT;
|
||||
use crate::DocIndex;
|
||||
use crate::store::{Postings, PostingsCodec};
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct PostingsLists {
|
||||
pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
|
||||
pub(crate) postings_lists: heed::Database<ByteSlice, PostingsCodec>,
|
||||
}
|
||||
|
||||
impl PostingsLists {
|
||||
@ -15,9 +19,14 @@ impl PostingsLists {
|
||||
self,
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
word: &[u8],
|
||||
words_indexes: &Set<DocIndex>,
|
||||
matches: &Set<DocIndex>,
|
||||
) -> ZResult<()> {
|
||||
self.postings_lists.put(writer, word, words_indexes)
|
||||
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
|
||||
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
|
||||
let matches = Cow::Borrowed(matches);
|
||||
let postings = Postings { docids, matches };
|
||||
|
||||
self.postings_lists.put(writer, word, &postings)
|
||||
}
|
||||
|
||||
pub fn del_postings_list(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
|
||||
@ -32,11 +41,7 @@ impl PostingsLists {
|
||||
self,
|
||||
reader: &'txn heed::RoTxn<MainT>,
|
||||
word: &[u8],
|
||||
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
|
||||
match self.postings_lists.get(reader, word)? {
|
||||
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
|
||||
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
|
||||
None => Ok(None),
|
||||
}
|
||||
) -> ZResult<Option<Postings<'txn>>> {
|
||||
self.postings_lists.get(reader, word)
|
||||
}
|
||||
}
|
||||
|
80
meilisearch-core/src/store/prefix_documents_cache.rs
Normal file
80
meilisearch-core/src/store/prefix_documents_cache.rs
Normal file
@ -0,0 +1,80 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use heed::types::{OwnedType, CowSlice};
|
||||
use heed::Result as ZResult;
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
|
||||
use super::BEU64;
|
||||
use crate::{DocumentId, Highlight};
|
||||
use crate::database::MainT;
|
||||
|
||||
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
||||
#[repr(C)]
|
||||
pub struct PrefixKey {
|
||||
prefix: [u8; 4],
|
||||
index: BEU64,
|
||||
docid: BEU64,
|
||||
}
|
||||
|
||||
impl PrefixKey {
|
||||
pub fn new(prefix: [u8; 4], index: u64, docid: u64) -> PrefixKey {
|
||||
PrefixKey {
|
||||
prefix: prefix,
|
||||
index: BEU64::new(index),
|
||||
docid: BEU64::new(docid),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct PrefixDocumentsCache {
|
||||
pub(crate) prefix_documents_cache: heed::Database<OwnedType<PrefixKey>, CowSlice<Highlight>>,
|
||||
}
|
||||
|
||||
impl PrefixDocumentsCache {
|
||||
pub fn put_prefix_document(
|
||||
self,
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
prefix: [u8; 4],
|
||||
index: usize,
|
||||
docid: DocumentId,
|
||||
highlights: &[Highlight],
|
||||
) -> ZResult<()> {
|
||||
let key = PrefixKey::new(prefix, index as u64, docid.0);
|
||||
self.prefix_documents_cache.put(writer, &key, highlights)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
|
||||
self.prefix_documents_cache.clear(writer)
|
||||
}
|
||||
|
||||
pub fn prefix_documents<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn<MainT>,
|
||||
prefix: [u8; 4],
|
||||
) -> ZResult<PrefixDocumentsIter<'txn>> {
|
||||
let start = PrefixKey::new(prefix, 0, 0);
|
||||
let end = PrefixKey::new(prefix, u64::max_value(), u64::max_value());
|
||||
let iter = self.prefix_documents_cache.range(reader, &(start..=end))?;
|
||||
Ok(PrefixDocumentsIter { iter })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PrefixDocumentsIter<'txn> {
|
||||
iter: heed::RoRange<'txn, OwnedType<PrefixKey>, CowSlice<Highlight>>,
|
||||
}
|
||||
|
||||
impl<'txn> Iterator for PrefixDocumentsIter<'txn> {
|
||||
type Item = ZResult<(DocumentId, Cow<'txn, [Highlight]>)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok((key, highlights))) => {
|
||||
let docid = DocumentId(key.docid.get());
|
||||
Some(Ok((docid, highlights)))
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
45
meilisearch-core/src/store/prefix_postings_lists_cache.rs
Normal file
45
meilisearch-core/src/store/prefix_postings_lists_cache.rs
Normal file
@ -0,0 +1,45 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use heed::Result as ZResult;
|
||||
use heed::types::OwnedType;
|
||||
use sdset::{Set, SetBuf};
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::database::MainT;
|
||||
use crate::DocIndex;
|
||||
use crate::store::{PostingsCodec, Postings};
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct PrefixPostingsListsCache {
|
||||
pub(crate) prefix_postings_lists_cache: heed::Database<OwnedType<[u8; 4]>, PostingsCodec>,
|
||||
}
|
||||
|
||||
impl PrefixPostingsListsCache {
|
||||
pub fn put_prefix_postings_list(
|
||||
self,
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
prefix: [u8; 4],
|
||||
matches: &Set<DocIndex>,
|
||||
) -> ZResult<()>
|
||||
{
|
||||
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
|
||||
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
|
||||
let matches = Cow::Borrowed(matches);
|
||||
let postings = Postings { docids, matches };
|
||||
|
||||
self.prefix_postings_lists_cache.put(writer, &prefix, &postings)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
|
||||
self.prefix_postings_lists_cache.clear(writer)
|
||||
}
|
||||
|
||||
pub fn prefix_postings_list<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn<MainT>,
|
||||
prefix: [u8; 4],
|
||||
) -> ZResult<Option<Postings<'txn>>>
|
||||
{
|
||||
self.prefix_postings_lists_cache.get(reader, &prefix)
|
||||
}
|
||||
}
|
@ -4,19 +4,17 @@ use crate::{store, MResult, RankedMap};
|
||||
|
||||
pub fn apply_clear_all(
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
index: &store::Index,
|
||||
) -> MResult<()> {
|
||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
||||
main_store.put_ranked_map(writer, &RankedMap::default())?;
|
||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
||||
documents_fields_store.clear(writer)?;
|
||||
documents_fields_counts_store.clear(writer)?;
|
||||
postings_lists_store.clear(writer)?;
|
||||
docs_words_store.clear(writer)?;
|
||||
index.main.put_words_fst(writer, &fst::Set::default())?;
|
||||
index.main.put_ranked_map(writer, &RankedMap::default())?;
|
||||
index.main.put_number_of_documents(writer, |_| 0)?;
|
||||
index.documents_fields.clear(writer)?;
|
||||
index.documents_fields_counts.clear(writer)?;
|
||||
index.postings_lists.clear(writer)?;
|
||||
index.docs_words.clear(writer)?;
|
||||
index.prefix_documents_cache.clear(writer)?;
|
||||
index.prefix_postings_lists_cache.clear(writer)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -9,7 +9,7 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::serde::{extract_document_id, serialize_value, Deserializer, Serializer};
|
||||
use crate::store;
|
||||
use crate::update::{apply_documents_deletion, next_update_id, Update};
|
||||
use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update};
|
||||
use crate::{Error, MResult, RankedMap};
|
||||
|
||||
pub struct DocumentsAddition<D> {
|
||||
@ -104,16 +104,12 @@ pub fn push_documents_addition<D: serde::Serialize>(
|
||||
|
||||
pub fn apply_documents_addition<'a, 'b>(
|
||||
writer: &'a mut heed::RwTxn<'b, MainT>,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
index: &store::Index,
|
||||
addition: Vec<HashMap<String, serde_json::Value>>,
|
||||
) -> MResult<()> {
|
||||
let mut documents_additions = HashMap::new();
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
let schema = match index.main.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
@ -133,22 +129,14 @@ pub fn apply_documents_addition<'a, 'b>(
|
||||
// 2. remove the documents posting lists
|
||||
let number_of_inserted_documents = documents_additions.len();
|
||||
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
||||
apply_documents_deletion(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
documents_ids,
|
||||
)?;
|
||||
apply_documents_deletion(writer, index, documents_ids)?;
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
let mut ranked_map = match index.main.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
||||
let stop_words = match index.main.stop_words_fst(writer)? {
|
||||
Some(stop_words) => stop_words,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
@ -160,8 +148,8 @@ pub fn apply_documents_addition<'a, 'b>(
|
||||
let serializer = Serializer {
|
||||
txn: writer,
|
||||
schema: &schema,
|
||||
document_store: documents_fields_store,
|
||||
document_fields_counts: documents_fields_counts_store,
|
||||
document_store: index.documents_fields,
|
||||
document_fields_counts: index.documents_fields_counts,
|
||||
indexer: &mut indexer,
|
||||
ranked_map: &mut ranked_map,
|
||||
document_id,
|
||||
@ -172,27 +160,25 @@ pub fn apply_documents_addition<'a, 'b>(
|
||||
|
||||
write_documents_addition_index(
|
||||
writer,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
index,
|
||||
&ranked_map,
|
||||
number_of_inserted_documents,
|
||||
indexer,
|
||||
)
|
||||
)?;
|
||||
|
||||
compute_short_prefixes(writer, index)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn apply_documents_partial_addition<'a, 'b>(
|
||||
writer: &'a mut heed::RwTxn<'b, MainT>,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
index: &store::Index,
|
||||
addition: Vec<HashMap<String, serde_json::Value>>,
|
||||
) -> MResult<()> {
|
||||
let mut documents_additions = HashMap::new();
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
let schema = match index.main.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
@ -209,7 +195,7 @@ pub fn apply_documents_partial_addition<'a, 'b>(
|
||||
let mut deserializer = Deserializer {
|
||||
document_id,
|
||||
reader: writer,
|
||||
documents_fields: documents_fields_store,
|
||||
documents_fields: index.documents_fields,
|
||||
schema: &schema,
|
||||
attributes: None,
|
||||
};
|
||||
@ -229,22 +215,14 @@ pub fn apply_documents_partial_addition<'a, 'b>(
|
||||
// 2. remove the documents posting lists
|
||||
let number_of_inserted_documents = documents_additions.len();
|
||||
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
||||
apply_documents_deletion(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
documents_ids,
|
||||
)?;
|
||||
apply_documents_deletion(writer, index, documents_ids)?;
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
let mut ranked_map = match index.main.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
||||
let stop_words = match index.main.stop_words_fst(writer)? {
|
||||
Some(stop_words) => stop_words,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
@ -256,8 +234,8 @@ pub fn apply_documents_partial_addition<'a, 'b>(
|
||||
let serializer = Serializer {
|
||||
txn: writer,
|
||||
schema: &schema,
|
||||
document_store: documents_fields_store,
|
||||
document_fields_counts: documents_fields_counts_store,
|
||||
document_store: index.documents_fields,
|
||||
document_fields_counts: index.documents_fields_counts,
|
||||
indexer: &mut indexer,
|
||||
ranked_map: &mut ranked_map,
|
||||
document_id,
|
||||
@ -268,24 +246,19 @@ pub fn apply_documents_partial_addition<'a, 'b>(
|
||||
|
||||
write_documents_addition_index(
|
||||
writer,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
index,
|
||||
&ranked_map,
|
||||
number_of_inserted_documents,
|
||||
indexer,
|
||||
)
|
||||
)?;
|
||||
|
||||
compute_short_prefixes(writer, index)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn reindex_all_documents(
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
) -> MResult<()> {
|
||||
let schema = match main_store.schema(writer)? {
|
||||
pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
|
||||
let schema = match index.main.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
@ -294,21 +267,21 @@ pub fn reindex_all_documents(
|
||||
|
||||
// 1. retrieve all documents ids
|
||||
let mut documents_ids_to_reindex = Vec::new();
|
||||
for result in documents_fields_counts_store.documents_ids(writer)? {
|
||||
for result in index.documents_fields_counts.documents_ids(writer)? {
|
||||
let document_id = result?;
|
||||
documents_ids_to_reindex.push(document_id);
|
||||
}
|
||||
|
||||
// 2. remove the documents posting lists
|
||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
||||
postings_lists_store.clear(writer)?;
|
||||
docs_words_store.clear(writer)?;
|
||||
index.main.put_words_fst(writer, &fst::Set::default())?;
|
||||
index.main.put_ranked_map(writer, &ranked_map)?;
|
||||
index.main.put_number_of_documents(writer, |_| 0)?;
|
||||
index.postings_lists.clear(writer)?;
|
||||
index.docs_words.clear(writer)?;
|
||||
|
||||
// 3. re-index chunks of documents (otherwise we make the borrow checker unhappy)
|
||||
for documents_ids in documents_ids_to_reindex.chunks(100) {
|
||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
||||
let stop_words = match index.main.stop_words_fst(writer)? {
|
||||
Some(stop_words) => stop_words,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
@ -318,7 +291,7 @@ pub fn reindex_all_documents(
|
||||
let mut ram_store = HashMap::new();
|
||||
|
||||
for document_id in documents_ids {
|
||||
for result in documents_fields_store.document_fields(writer, *document_id)? {
|
||||
for result in index.documents_fields.document_fields(writer, *document_id)? {
|
||||
let (attr, bytes) = result?;
|
||||
let value: serde_json::Value = serde_json::from_slice(bytes)?;
|
||||
ram_store.insert((document_id, attr), value);
|
||||
@ -330,8 +303,8 @@ pub fn reindex_all_documents(
|
||||
attr,
|
||||
schema.props(attr),
|
||||
*docid,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
&mut indexer,
|
||||
&mut ranked_map,
|
||||
&value,
|
||||
@ -342,23 +315,21 @@ pub fn reindex_all_documents(
|
||||
// 4. write the new index in the main store
|
||||
write_documents_addition_index(
|
||||
writer,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
index,
|
||||
&ranked_map,
|
||||
number_of_inserted_documents,
|
||||
indexer,
|
||||
)?;
|
||||
}
|
||||
|
||||
compute_short_prefixes(writer, index)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_documents_addition_index(
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
index: &store::Index,
|
||||
ranked_map: &RankedMap,
|
||||
number_of_inserted_documents: usize,
|
||||
indexer: RawIndexer,
|
||||
@ -369,16 +340,16 @@ pub fn write_documents_addition_index(
|
||||
for (word, delta_set) in indexed.words_doc_indexes {
|
||||
delta_words_builder.insert(&word).unwrap();
|
||||
|
||||
let set = match postings_lists_store.postings_list(writer, &word)? {
|
||||
Some(set) => Union::new(&set, &delta_set).into_set_buf(),
|
||||
let set = match index.postings_lists.postings_list(writer, &word)? {
|
||||
Some(postings) => Union::new(&postings.matches, &delta_set).into_set_buf(),
|
||||
None => delta_set,
|
||||
};
|
||||
|
||||
postings_lists_store.put_postings_list(writer, &word, &set)?;
|
||||
index.postings_lists.put_postings_list(writer, &word, &set)?;
|
||||
}
|
||||
|
||||
for (id, words) in indexed.docs_words {
|
||||
docs_words_store.put_doc_words(writer, id, &words)?;
|
||||
index.docs_words.put_doc_words(writer, id, &words)?;
|
||||
}
|
||||
|
||||
let delta_words = delta_words_builder
|
||||
@ -386,7 +357,7 @@ pub fn write_documents_addition_index(
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let words = match main_store.words_fst(writer)? {
|
||||
let words = match index.main.words_fst(writer)? {
|
||||
Some(words) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(words.stream())
|
||||
@ -403,9 +374,11 @@ pub fn write_documents_addition_index(
|
||||
None => delta_words,
|
||||
};
|
||||
|
||||
main_store.put_words_fst(writer, &words)?;
|
||||
main_store.put_ranked_map(writer, ranked_map)?;
|
||||
main_store.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
|
||||
index.main.put_words_fst(writer, &words)?;
|
||||
index.main.put_ranked_map(writer, ranked_map)?;
|
||||
index.main.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
|
||||
|
||||
compute_short_prefixes(writer, index)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ use crate::database::{MainT, UpdateT};
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::serde::extract_document_id;
|
||||
use crate::store;
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::update::{next_update_id, compute_short_prefixes, Update};
|
||||
use crate::{DocumentId, Error, MResult, RankedMap};
|
||||
|
||||
pub struct DocumentsDeletion {
|
||||
@ -85,21 +85,17 @@ pub fn push_documents_deletion(
|
||||
|
||||
pub fn apply_documents_deletion(
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
index: &store::Index,
|
||||
deletion: Vec<DocumentId>,
|
||||
) -> MResult<()> {
|
||||
let idset = SetBuf::from_dirty(deletion);
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
let schema = match index.main.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
let mut ranked_map = match index.main.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
@ -125,7 +121,7 @@ pub fn apply_documents_deletion(
|
||||
ranked_map.remove(id, *ranked_attr);
|
||||
}
|
||||
|
||||
if let Some(words) = docs_words_store.doc_words(writer, id)? {
|
||||
if let Some(words) = index.docs_words.doc_words(writer, id)? {
|
||||
let mut stream = words.stream();
|
||||
while let Some(word) = stream.next() {
|
||||
let word = word.to_vec();
|
||||
@ -142,21 +138,21 @@ pub fn apply_documents_deletion(
|
||||
for (word, document_ids) in words_document_ids {
|
||||
let document_ids = SetBuf::from_dirty(document_ids);
|
||||
|
||||
if let Some(doc_indexes) = postings_lists_store.postings_list(writer, &word)? {
|
||||
let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id);
|
||||
if let Some(postings) = index.postings_lists.postings_list(writer, &word)? {
|
||||
let op = DifferenceByKey::new(&postings.matches, &document_ids, |d| d.document_id, |id| *id);
|
||||
let doc_indexes = op.into_set_buf();
|
||||
|
||||
if !doc_indexes.is_empty() {
|
||||
postings_lists_store.put_postings_list(writer, &word, &doc_indexes)?;
|
||||
index.postings_lists.put_postings_list(writer, &word, &doc_indexes)?;
|
||||
} else {
|
||||
postings_lists_store.del_postings_list(writer, &word)?;
|
||||
index.postings_lists.del_postings_list(writer, &word)?;
|
||||
removed_words.insert(word);
|
||||
}
|
||||
}
|
||||
|
||||
for id in document_ids {
|
||||
documents_fields_counts_store.del_all_document_fields_counts(writer, id)?;
|
||||
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
||||
index.documents_fields_counts.del_all_document_fields_counts(writer, id)?;
|
||||
if index.documents_fields.del_all_document_fields(writer, id)? != 0 {
|
||||
deleted_documents.insert(id);
|
||||
}
|
||||
}
|
||||
@ -164,11 +160,11 @@ pub fn apply_documents_deletion(
|
||||
|
||||
let deleted_documents_len = deleted_documents.len() as u64;
|
||||
for id in deleted_documents {
|
||||
docs_words_store.del_doc_words(writer, id)?;
|
||||
index.docs_words.del_doc_words(writer, id)?;
|
||||
}
|
||||
|
||||
let removed_words = fst::Set::from_iter(removed_words).unwrap();
|
||||
let words = match main_store.words_fst(writer)? {
|
||||
let words = match index.main.words_fst(writer)? {
|
||||
Some(words_set) => {
|
||||
let op = fst::set::OpBuilder::new()
|
||||
.add(words_set.stream())
|
||||
@ -185,9 +181,11 @@ pub fn apply_documents_deletion(
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
main_store.put_words_fst(writer, &words)?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
||||
index.main.put_words_fst(writer, &words)?;
|
||||
index.main.put_ranked_map(writer, &ranked_map)?;
|
||||
index.main.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
||||
|
||||
compute_short_prefixes(writer, index)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -26,6 +26,8 @@ use chrono::{DateTime, Utc};
|
||||
use heed::Result as ZResult;
|
||||
use log::debug;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::{store, DocumentId, MResult};
|
||||
use crate::database::{MainT, UpdateT};
|
||||
@ -255,14 +257,7 @@ pub fn update_task<'a, 'b>(
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::ClearAll;
|
||||
let result = apply_clear_all(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
);
|
||||
let result = apply_clear_all(writer, index);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
@ -270,15 +265,7 @@ pub fn update_task<'a, 'b>(
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::Schema;
|
||||
let result = apply_schema_update(
|
||||
writer,
|
||||
&schema,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
);
|
||||
let result = apply_schema_update(writer, &schema, index);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
@ -297,15 +284,7 @@ pub fn update_task<'a, 'b>(
|
||||
number: documents.len(),
|
||||
};
|
||||
|
||||
let result = apply_documents_addition(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
documents,
|
||||
);
|
||||
let result = apply_documents_addition(writer, index, documents);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
@ -316,15 +295,7 @@ pub fn update_task<'a, 'b>(
|
||||
number: documents.len(),
|
||||
};
|
||||
|
||||
let result = apply_documents_partial_addition(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
documents,
|
||||
);
|
||||
let result = apply_documents_partial_addition(writer, index, documents);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
@ -335,15 +306,7 @@ pub fn update_task<'a, 'b>(
|
||||
number: documents.len(),
|
||||
};
|
||||
|
||||
let result = apply_documents_deletion(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
documents,
|
||||
);
|
||||
let result = apply_documents_deletion(writer, index, documents);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
@ -377,15 +340,7 @@ pub fn update_task<'a, 'b>(
|
||||
number: stop_words.len(),
|
||||
};
|
||||
|
||||
let result = apply_stop_words_deletion(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
stop_words,
|
||||
);
|
||||
let result = apply_stop_words_deletion(writer, index, stop_words);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
@ -407,3 +362,67 @@ pub fn update_task<'a, 'b>(
|
||||
|
||||
Ok(status)
|
||||
}
|
||||
|
||||
fn compute_short_prefixes(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
|
||||
// retrieve the words fst to compute all those prefixes
|
||||
let words_fst = match index.main.words_fst(writer)? {
|
||||
Some(fst) => fst,
|
||||
None => return Ok(()),
|
||||
};
|
||||
|
||||
// clear the prefixes
|
||||
let pplc_store = index.prefix_postings_lists_cache;
|
||||
pplc_store.clear(writer)?;
|
||||
|
||||
for prefix_len in 1..=2 {
|
||||
// compute prefixes and store those in the PrefixPostingsListsCache store.
|
||||
let mut previous_prefix: Option<([u8; 4], Vec<_>)> = None;
|
||||
let mut stream = words_fst.into_stream();
|
||||
while let Some(input) = stream.next() {
|
||||
|
||||
// We skip the prefixes that are shorter than the current length
|
||||
// we want to cache (<). We must ignore the input when it is exactly the
|
||||
// same word as the prefix because if we match exactly on it we need
|
||||
// to consider it as an exact match and not as a prefix (=).
|
||||
if input.len() <= prefix_len { continue }
|
||||
|
||||
if let Some(postings_list) = index.postings_lists.postings_list(writer, input)?.map(|p| p.matches.into_owned()) {
|
||||
let prefix = &input[..prefix_len];
|
||||
|
||||
let mut arr_prefix = [0; 4];
|
||||
arr_prefix[..prefix_len].copy_from_slice(prefix);
|
||||
|
||||
match previous_prefix {
|
||||
Some((ref mut prev_prefix, ref mut prev_pl)) if *prev_prefix != arr_prefix => {
|
||||
prev_pl.sort_unstable();
|
||||
prev_pl.dedup();
|
||||
|
||||
if let Ok(prefix) = std::str::from_utf8(&prev_prefix[..prefix_len]) {
|
||||
debug!("writing the prefix of {:?} of length {}", prefix, prev_pl.len());
|
||||
}
|
||||
|
||||
let pls = Set::new_unchecked(&prev_pl);
|
||||
pplc_store.put_prefix_postings_list(writer, *prev_prefix, &pls)?;
|
||||
|
||||
*prev_prefix = arr_prefix;
|
||||
prev_pl.clear();
|
||||
prev_pl.extend_from_slice(&postings_list);
|
||||
},
|
||||
Some((_, ref mut prev_pl)) => prev_pl.extend_from_slice(&postings_list),
|
||||
None => previous_prefix = Some((arr_prefix, postings_list.to_vec())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// write the last prefix postings lists
|
||||
if let Some((prev_prefix, mut prev_pl)) = previous_prefix.take() {
|
||||
prev_pl.sort_unstable();
|
||||
prev_pl.dedup();
|
||||
|
||||
let pls = Set::new_unchecked(&prev_pl);
|
||||
pplc_store.put_prefix_postings_list(writer, prev_prefix, &pls)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -8,11 +8,7 @@ use crate::{error::UnsupportedOperation, store, MResult};
|
||||
pub fn apply_schema_update(
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
new_schema: &Schema,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
index: &store::Index,
|
||||
) -> MResult<()> {
|
||||
use UnsupportedOperation::{
|
||||
CanOnlyIntroduceNewSchemaAttributesAtEnd, CannotRemoveSchemaAttribute,
|
||||
@ -21,7 +17,7 @@ pub fn apply_schema_update(
|
||||
|
||||
let mut need_full_reindexing = false;
|
||||
|
||||
if let Some(old_schema) = main_store.schema(writer)? {
|
||||
if let Some(old_schema) = index.main.schema(writer)? {
|
||||
for diff in meilisearch_schema::diff(&old_schema, new_schema) {
|
||||
match diff {
|
||||
Diff::IdentChange { .. } => return Err(CannotUpdateSchemaIdentifier.into()),
|
||||
@ -45,17 +41,10 @@ pub fn apply_schema_update(
|
||||
}
|
||||
}
|
||||
|
||||
main_store.put_schema(writer, new_schema)?;
|
||||
index.main.put_schema(writer, new_schema)?;
|
||||
|
||||
if need_full_reindexing {
|
||||
reindex_all_documents(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
)?
|
||||
reindex_all_documents(writer, index)?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -63,14 +52,13 @@ pub fn apply_schema_update(
|
||||
|
||||
pub fn push_schema_update(
|
||||
writer: &mut heed::RwTxn<UpdateT>,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
index: &store::Index,
|
||||
schema: Schema,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
let last_update_id = next_update_id(writer, index.updates, index.updates_results)?;
|
||||
|
||||
let update = Update::schema(schema);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
index.updates.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
@ -63,11 +63,7 @@ pub fn push_stop_words_deletion(
|
||||
|
||||
pub fn apply_stop_words_deletion(
|
||||
writer: &mut heed::RwTxn<MainT>,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
index: &store::Index,
|
||||
deletion: BTreeSet<String>,
|
||||
) -> MResult<()> {
|
||||
let mut stop_words_builder = SetBuilder::memory();
|
||||
@ -83,7 +79,7 @@ pub fn apply_stop_words_deletion(
|
||||
.unwrap();
|
||||
|
||||
// now we delete all of these stop words from the main store
|
||||
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
|
||||
let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default();
|
||||
|
||||
let op = OpBuilder::new()
|
||||
.add(&stop_words_fst)
|
||||
@ -97,20 +93,13 @@ pub fn apply_stop_words_deletion(
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
|
||||
index.main.put_stop_words_fst(writer, &stop_words_fst)?;
|
||||
|
||||
// now that we have setup the stop words
|
||||
// lets reindex everything...
|
||||
if let Ok(number) = main_store.number_of_documents(writer) {
|
||||
if let Ok(number) = index.main.number_of_documents(writer) {
|
||||
if number > 0 {
|
||||
reindex_all_documents(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
)?;
|
||||
reindex_all_documents(writer, index)?;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -170,8 +170,6 @@ impl<'a> SearchBuilder<'a> {
|
||||
let ranked_map = ranked_map.map_err(|e| Error::Internal(e.to_string()))?;
|
||||
let ranked_map = ranked_map.unwrap_or_default();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
// Change criteria
|
||||
let mut query_builder = match self.get_criteria(reader, &ranked_map, &schema)? {
|
||||
Some(criteria) => self.index.query_builder_with_criteria(criteria),
|
||||
@ -222,8 +220,9 @@ impl<'a> SearchBuilder<'a> {
|
||||
|
||||
query_builder.with_fetch_timeout(self.timeout);
|
||||
|
||||
let docs =
|
||||
query_builder.query(reader, &self.query, self.offset..(self.offset + self.limit));
|
||||
let start = Instant::now();
|
||||
let docs = query_builder.query(reader, &self.query, self.offset..(self.offset + self.limit));
|
||||
let time_ms = start.elapsed().as_millis() as usize;
|
||||
|
||||
let mut hits = Vec::with_capacity(self.limit);
|
||||
for doc in docs.map_err(|e| Error::SearchDocuments(e.to_string()))? {
|
||||
@ -278,8 +277,6 @@ impl<'a> SearchBuilder<'a> {
|
||||
hits.push(hit);
|
||||
}
|
||||
|
||||
let time_ms = start.elapsed().as_millis() as usize;
|
||||
|
||||
let results = SearchResult {
|
||||
hits,
|
||||
offset: self.offset,
|
||||
|
@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
|
||||
///
|
||||
/// It is used to inform the database the document you want to deserialize.
|
||||
/// Helpful for custom ranking.
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[repr(C)]
|
||||
@ -19,7 +19,7 @@ pub struct DocumentId(pub u64);
|
||||
///
|
||||
/// This is stored in the map, generated at index time,
|
||||
/// extracted and interpreted at search time.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
||||
#[repr(C)]
|
||||
pub struct DocIndex {
|
||||
@ -46,6 +46,8 @@ pub struct DocIndex {
|
||||
/// The order of the field is important because it defines
|
||||
/// the way these structures are ordered between themselves.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
||||
#[repr(C)]
|
||||
pub struct Highlight {
|
||||
/// The attribute in the document where the word was found
|
||||
/// along with the index in it.
|
||||
|
Loading…
x
Reference in New Issue
Block a user