Merge pull request #403 from meilisearch/lazy-data-fetching

Criteria lazy data preparation
This commit is contained in:
Clément Renault 2019-12-13 14:57:19 +01:00 committed by GitHub
commit 020cd7f9e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
29 changed files with 1881 additions and 2403 deletions

15
Cargo.lock generated
View File

@ -257,6 +257,11 @@ dependencies = [
"bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "compact_arena"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "const-random" name = "const-random"
version = "0.1.6" version = "0.1.6"
@ -937,6 +942,7 @@ dependencies = [
"bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)", "chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)",
"compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -946,6 +952,8 @@ dependencies = [
"hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
"heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
"jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"meilisearch-schema 0.8.4", "meilisearch-schema 0.8.4",
@ -954,7 +962,7 @@ dependencies = [
"once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
"siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1684,7 +1692,7 @@ dependencies = [
[[package]] [[package]]
name = "sdset" name = "sdset"
version = "0.3.3" version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
@ -2648,6 +2656,7 @@ dependencies = [
"checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c" "checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ab08c5bed92075075d5db5149887a477b2dc0318c40882a0dfbd34315ac6141"
"checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe" "checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe"
"checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59" "checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59"
"checksum cookie 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5" "checksum cookie 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5"
@ -2798,7 +2807,7 @@ dependencies = [
"checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421" "checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421"
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
"checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c" "checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c"
"checksum sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b6d2447743d6c37b6d67af88d9c0f1fc92989e2d9745d9b2f3d305b906a90195" "checksum sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5bfd7aab2bcae693c563b40fbbaf87d60c9b6f2a60d55ed69a9c761e3d4c63c9"
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
"checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0" "checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0"

View File

@ -10,12 +10,14 @@ arc-swap = "0.4.3"
bincode = "1.1.4" bincode = "1.1.4"
byteorder = "1.3.2" byteorder = "1.3.2"
chrono = { version = "0.4.9", features = ["serde"] } chrono = { version = "0.4.9", features = ["serde"] }
compact_arena = "0.4.0"
crossbeam-channel = "0.4.0" crossbeam-channel = "0.4.0"
deunicode = "1.0.0" deunicode = "1.0.0"
env_logger = "0.7.0" env_logger = "0.7.0"
fst = { version = "0.3.5", default-features = false } fst = { version = "0.3.5", default-features = false }
hashbrown = { version = "0.6.0", features = ["serde"] } hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.6.1" heed = "0.6.1"
itertools = "0.8.2" # kill me please
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.8" log = "0.4.8"
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" }
@ -23,7 +25,7 @@ meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.8.4" }
meilisearch-types = { path = "../meilisearch-types", version = "0.8.4" } meilisearch-types = { path = "../meilisearch-types", version = "0.8.4" }
once_cell = "1.2.0" once_cell = "1.2.0"
ordered-float = { version = "1.0.2", features = ["serde"] } ordered-float = { version = "1.0.2", features = ["serde"] }
sdset = "0.3.3" sdset = "0.3.6"
serde = { version = "1.0.101", features = ["derive"] } serde = { version = "1.0.101", features = ["derive"] }
serde_json = "1.0.41" serde_json = "1.0.41"
siphasher = "0.3.1" siphasher = "0.3.1"
@ -35,6 +37,7 @@ assert_matches = "1.3"
criterion = "0.3" criterion = "0.3"
csv = "1.0.7" csv = "1.0.7"
indexmap = { version = "1.2.0", features = ["serde-1"] } indexmap = { version = "1.2.0", features = ["serde-1"] }
jemallocator = "0.3.2"
rustyline = { version = "5.0.0", default-features = false } rustyline = { version = "5.0.0", default-features = false }
structopt = "0.3.2" structopt = "0.3.2"
tempfile = "3.1.0" tempfile = "3.1.0"

View File

@ -1,5 +1,5 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet; use std::collections::HashSet;
use std::collections::btree_map::{BTreeMap, Entry};
use std::error::Error; use std::error::Error;
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::iter::FromIterator; use std::iter::FromIterator;
@ -15,6 +15,10 @@ use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilisearch_core::{Database, Highlight, ProcessedUpdateResult}; use meilisearch_core::{Database, Highlight, ProcessedUpdateResult};
use meilisearch_schema::SchemaAttr; use meilisearch_schema::SchemaAttr;
// #[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
struct IndexCommand { struct IndexCommand {
/// The destination where the database must be created. /// The destination where the database must be created.

View File

@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA {
pub fn build_dfa(query: &str) -> DFA { pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix) build_dfa_with_setting(query, PrefixSetting::NoPrefix)
} }
pub fn build_exact_dfa(query: &str) -> DFA {
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
builder.build_dfa(query)
}

View File

@ -1,125 +1,13 @@
mod dfa; mod dfa;
mod query_enhancer; mod query_enhancer;
use std::cmp::Reverse; use meilisearch_tokenizer::is_cjk;
use std::{cmp, vec};
use fst::{IntoStreamer, Streamer}; pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
use levenshtein_automata::DFA;
use meilisearch_tokenizer::{is_cjk, split_query_string};
use crate::database::MainT;
use crate::error::MResult;
use crate::store;
use self::dfa::{build_dfa, build_prefix_dfa};
pub use self::query_enhancer::QueryEnhancer; pub use self::query_enhancer::QueryEnhancer;
use self::query_enhancer::QueryEnhancerBuilder; pub use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3; pub const NGRAMS: usize = 3;
pub struct AutomatonProducer {
automatons: Vec<AutomatonGroup>,
}
impl AutomatonProducer {
pub fn new(
reader: &heed::RoTxn<MainT>,
query: &str,
main_store: store::Main,
postings_list_store: store::PostingsLists,
synonyms_store: store::Synonyms,
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
let (automatons, query_enhancer) = generate_automatons(
reader,
query,
main_store,
postings_list_store,
synonyms_store,
)?;
Ok((AutomatonProducer { automatons }, query_enhancer))
}
pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
self.automatons.into_iter()
}
}
#[derive(Debug)]
pub struct AutomatonGroup {
pub is_phrase_query: bool,
pub automatons: Vec<Automaton>,
}
impl AutomatonGroup {
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: false,
automatons,
}
}
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: true,
automatons,
}
}
}
#[derive(Debug)]
pub struct Automaton {
pub index: usize,
pub ngram: usize,
pub query_len: usize,
pub is_exact: bool,
pub is_prefix: bool,
pub query: String,
}
impl Automaton {
pub fn dfa(&self) -> DFA {
if self.is_prefix {
build_prefix_dfa(&self.query)
} else {
build_dfa(&self.query)
}
}
fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: true,
is_prefix: false,
query: query.to_string(),
}
}
fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: true,
is_prefix: true,
query: query.to_string(),
}
}
fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: false,
is_prefix: false,
query: query.to_string(),
}
}
}
pub fn normalize_str(string: &str) -> String { pub fn normalize_str(string: &str) -> String {
let mut string = string.to_lowercase(); let mut string = string.to_lowercase();
@ -130,167 +18,3 @@ pub fn normalize_str(string: &str) -> String {
string string
} }
fn split_best_frequency<'a>(
reader: &heed::RoTxn<MainT>,
word: &'a str,
postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len());
let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len());
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn generate_automatons(
reader: &heed::RoTxn<MainT>,
query: &str,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms,
) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
Some(synonym) => synonym,
None => fst::Set::default(),
};
let mut automaton_index = 0;
let mut automatons = Vec::new();
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
// We must not declare the original words to the query enhancer
// *but* we need to push them in the automatons list first
let mut original_automatons = Vec::new();
let mut original_words = query_words.iter().peekable();
while let Some(word) = original_words.next() {
let has_following_word = original_words.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
let automaton = if not_prefix_dfa {
Automaton::exact(automaton_index, 1, word)
} else {
Automaton::prefix_exact(automaton_index, 1, word)
};
automaton_index += 1;
original_automatons.push(automaton);
}
automatons.push(AutomatonGroup::normal(original_automatons));
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
while let Some((query_index, ngram_slice)) = ngrams.next() {
let query_range = query_index..query_index + n;
let ngram_nb_words = ngram_slice.len();
let ngram = ngram_slice.join(" ");
let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa =
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
// automaton of synonyms of the ngrams
let normalized = normalize_str(&ngram);
let lev = if not_prefix_dfa {
build_dfa(&normalized)
} else {
build_prefix_dfa(&normalized)
};
let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() {
// only trigger alternatives when the last word has been typed
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
let base = std::str::from_utf8(base).unwrap();
let base_nb_words = split_query_string(base).count();
if ngram_nb_words != base_nb_words {
continue;
}
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
let mut stream = synonyms.into_stream();
while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap();
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
let nb_synonym_words = synonyms_words.len();
let real_query_index = automaton_index;
enhancer_builder.declare(
query_range.clone(),
real_query_index,
&synonyms_words,
);
for synonym in synonyms_words {
let automaton = if nb_synonym_words == 1 {
Automaton::exact(automaton_index, n, synonym)
} else {
Automaton::non_exact(automaton_index, n, synonym)
};
automaton_index += 1;
automatons.push(AutomatonGroup::normal(vec![automaton]));
}
}
}
}
if n == 1 {
if let Some((left, right)) =
split_best_frequency(reader, &normalized, postings_lists_store)?
{
let a = Automaton::exact(automaton_index, 1, left);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
let b = Automaton::exact(automaton_index, 1, right);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
}
} else {
// automaton of concatenation of query words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
let automaton = Automaton::exact(automaton_index, n, &normalized);
automaton_index += 1;
automatons.push(AutomatonGroup::normal(vec![automaton]));
}
}
}
// order automatons, the most important first,
// we keep the original automatons at the front.
automatons[1..].sort_by_key(|group| {
let a = group.automatons.first().unwrap();
(
Reverse(a.is_exact),
a.ngram,
Reverse(group.automatons.len()),
)
});
Ok((automatons, enhancer_builder.build()))
}

View File

@ -58,6 +58,7 @@ where
type Origin = usize; type Origin = usize;
type RealLength = usize; type RealLength = usize;
#[derive(Debug)]
struct FakeIntervalTree { struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>, intervals: Vec<(Range<usize>, (Origin, RealLength))>,
} }
@ -142,67 +143,80 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
// we need to pad real query indices // we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len()); let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len(); let real_length = replacement.len();
self.real_to_origin self.real_to_origin.push((real_range, (range.start, real_length)));
.push((real_range, (range.start, real_length)));
} }
pub fn build(self) -> QueryEnhancer { pub fn build(self) -> QueryEnhancer {
QueryEnhancer { let interval_tree = FakeIntervalTree::new(self.real_to_origin);
origins: self.origins, let mut table = Vec::new();
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
for real in 0.. {
match replacement(&self.origins, &interval_tree, real) {
Some(range) => table.push(range),
None => break,
}
} }
QueryEnhancer { table }
} }
} }
/// Returns the query indices that represent this real query index.
fn replacement(
origins: &[usize],
real_to_origin: &FakeIntervalTree,
real: u32,
) -> Option<Range<u32>>
{
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) = real_to_origin.query(real)?;
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 {
new_origin = origin + i;
break;
}
}
let n = real - range.start;
let start = origins[origin];
let end = origins.get(new_origin + 1)?;
let remaining = (end - start) - n;
Some(Range {
start: (start + n) as u32,
end: (start + n + remaining) as u32,
})
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = origins[origin];
Some(Range {
start: (origin + n) as u32,
end: (origin + n + 1) as u32,
})
}
}
#[derive(Debug)]
pub struct QueryEnhancer { pub struct QueryEnhancer {
origins: Vec<usize>, table: Vec<Range<u32>>,
real_to_origin: FakeIntervalTree,
} }
impl QueryEnhancer { impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index. /// Returns the query indices that represent this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> { pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize; self.table[real as usize].clone()
// query the fake interval tree with the real query index
let (range, (origin, real_length)) = self
.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 {
new_origin = origin + i;
break;
}
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range {
start: (start + n) as u32,
end: (start + n + remaining) as u32,
}
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range {
start: (origin + n) as u32,
end: (origin + n + 1) as u32,
}
}
} }
} }

View File

@ -0,0 +1,717 @@
use std::ops::Deref;
use std::{cmp, fmt};
use std::borrow::Cow;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Duration, Instant};
use compact_arena::{SmallArena, Idx32, mk_arena};
use fst::{IntoStreamer, Streamer};
use hashbrown::HashMap;
use levenshtein_automata::DFA;
use log::debug;
use meilisearch_tokenizer::{is_cjk, split_query_string};
use meilisearch_types::DocIndex;
use sdset::{Set, SetBuf};
use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::NGRAMS;
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::automaton::normalize_str;
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
use crate::criterion::{Criteria, Context, ContextMut};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::RawDocument;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
pub fn bucket_sort<'c, FI>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
{
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
if filter.is_some() {
let distinct = |_| None;
let distinct_size = 1;
return bucket_sort_with_distinct(
reader,
query,
range,
filter,
distinct,
distinct_size,
criteria,
searchable_attrs,
main_store,
postings_lists_store,
documents_fields_counts_store,
synonyms_store,
);
}
let (mut automatons, mut query_enhancer) =
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
debug!("{:?}", query_enhancer);
let before_postings_lists_fetching = Instant::now();
mk_arena!(arena);
let mut bare_matches =
fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
debug!("bare matches ({}) retrieved in {:.02?}",
bare_matches.len(),
before_postings_lists_fetching.elapsed(),
);
let before_raw_documents_presort = Instant::now();
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
let before_raw_documents_building = Instant::now();
let mut prefiltered_documents = 0;
let mut raw_documents = Vec::new();
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
prefiltered_documents += 1;
if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
raw_documents.push(raw_document);
}
}
debug!("creating {} (original {}) candidates documents took {:.02?}",
raw_documents.len(),
prefiltered_documents,
before_raw_documents_building.elapsed(),
);
let mut groups = vec![raw_documents.as_mut_slice()];
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for mut group in tmp_groups {
let before_criterion_preparation = Instant::now();
let ctx = ContextMut {
reader,
postings_lists: &mut arena,
query_enhancer: &mut query_enhancer,
automatons: &mut automatons,
documents_fields_counts_store,
};
criterion.prepare(ctx, &mut group)?;
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let ctx = Context {
postings_lists: &arena,
query_enhancer: &query_enhancer,
automatons: &automatons,
};
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
debug!("{:?} produced a group of size {}", criterion.name(), group.len());
documents_seen += group.len();
groups.push(group);
// we have sort enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena, searchable_attrs.as_ref()));
Ok(iter.collect())
}
pub fn bucket_sort_with_distinct<'c, FI, FD>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
distinct: FD,
distinct_size: usize,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<u64>,
{
let (mut automatons, mut query_enhancer) =
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
let before_postings_lists_fetching = Instant::now();
mk_arena!(arena);
let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
debug!("bare matches ({}) retrieved in {:.02?}",
bare_matches.len(),
before_postings_lists_fetching.elapsed(),
);
let before_raw_documents_presort = Instant::now();
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
let before_raw_documents_building = Instant::now();
let mut prefiltered_documents = 0;
let mut raw_documents = Vec::new();
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
prefiltered_documents += 1;
if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
raw_documents.push(raw_document);
}
}
debug!("creating {} (original {}) candidates documents took {:.02?}",
raw_documents.len(),
prefiltered_documents,
before_raw_documents_building.elapsed(),
);
let mut groups = vec![raw_documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let mut filter_map = HashMap::new();
// these two variables informs on the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(distinct_size);
let mut distinct_raw_offset = 0;
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for mut group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
let ctx = ContextMut {
reader,
postings_lists: &mut arena,
query_enhancer: &mut query_enhancer,
automatons: &mut automatons,
documents_fields_counts_store,
};
let before_criterion_preparation = Instant::now();
criterion.prepare(ctx, &mut group)?;
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let ctx = Context {
postings_lists: &arena,
query_enhancer: &query_enhancer,
automatons: &automatons,
};
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let filter_accepted = match &filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
}
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end {
break;
}
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sort enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if buf_distinct.len() >= range.end {
continue 'criteria;
}
}
}
}
// once we classified the documents related to the current
// automatons we save that as the next valid result
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
let mut documents = Vec::with_capacity(range.len());
for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
let filter_accepted = match &filter {
Some(_) => filter_map.remove(&raw_document.id).unwrap(),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&raw_document.id).unwrap();
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
documents.push(Document::from_raw(raw_document, &automatons, &arena, searchable_attrs.as_ref()));
if documents.len() == range.len() {
break;
}
}
}
}
Ok(documents)
}
pub struct BareMatch<'tag> {
pub document_id: DocumentId,
pub query_index: u16,
pub distance: u8,
pub is_exact: bool,
pub postings_list: Idx32<'tag>,
}
impl fmt::Debug for BareMatch<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BareMatch")
.field("document_id", &self.document_id)
.field("query_index", &self.query_index)
.field("distance", &self.distance)
.field("is_exact", &self.is_exact)
.finish()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SimpleMatch {
pub query_index: u16,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Clone)]
pub enum PostingsListView<'txn> {
Original {
input: Rc<[u8]>,
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
offset: usize,
len: usize,
},
Rewritten {
input: Rc<[u8]>,
postings_list: SetBuf<DocIndex>,
},
}
impl fmt::Debug for PostingsListView<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("PostingsListView")
.field("input", &std::str::from_utf8(&self.input()).unwrap())
.field("postings_list", &self.as_ref())
.finish()
}
}
impl<'txn> PostingsListView<'txn> {
pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = postings_list.len();
PostingsListView::Original { input, postings_list, offset: 0, len }
}
pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
PostingsListView::Rewritten { input, postings_list }
}
pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
let input = match self {
PostingsListView::Original { input, .. } => input.clone(),
PostingsListView::Rewritten { input, .. } => input.clone(),
};
*self = PostingsListView::rewritten(input, postings_list);
}
pub fn len(&self) -> usize {
match self {
PostingsListView::Original { len, .. } => *len,
PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
}
}
pub fn input(&self) -> &[u8] {
match self {
PostingsListView::Original { ref input, .. } => input,
PostingsListView::Rewritten { ref input, .. } => input,
}
}
pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
match self {
PostingsListView::Original { input, postings_list, offset, len } => {
assert!(range_offset + range_len <= *len);
PostingsListView::Original {
input: input.clone(),
postings_list: postings_list.clone(),
offset: offset + range_offset,
len: range_len,
}
},
PostingsListView::Rewritten { .. } => {
panic!("Cannot create a range on a rewritten postings list view");
}
}
}
}
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
fn as_ref(&self) -> &Set<DocIndex> {
self
}
}
impl Deref for PostingsListView<'_> {
type Target = Set<DocIndex>;
fn deref(&self) -> &Set<DocIndex> {
match *self {
PostingsListView::Original { ref postings_list, offset, len, .. } => {
Set::new_unchecked(&postings_list[offset..offset + len])
},
PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
}
}
}
fn fetch_matches<'txn, 'tag>(
reader: &'txn heed::RoTxn<MainT>,
automatons: &[QueryWordAutomaton],
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
) -> MResult<Vec<BareMatch<'tag>>>
{
let before_words_fst = Instant::now();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
debug!("words fst took {:.02?}", before_words_fst.elapsed());
let mut total_postings_lists = Vec::new();
let mut dfa_time = Duration::default();
let mut stream_next_time = Duration::default();
let mut postings_lists_fetching_time = Duration::default();
for (query_index, automaton) in automatons.iter().enumerate() {
let before_dfa = Instant::now();
let dfa = automaton.dfa();
let QueryWordAutomaton { query, is_exact, .. } = automaton;
dfa_time += before_dfa.elapsed();
let mut number_of_words = 0;
let mut stream = words.search(&dfa).into_stream();
// while let Some(input) = stream.next() {
loop {
let before_stream_next = Instant::now();
let input = match stream.next() {
Some(input) => input,
None => break,
};
stream_next_time += before_stream_next.elapsed();
number_of_words += 1;
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
let before_postings_lists_fetching = Instant::now();
if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
let input = Rc::from(input);
let postings_list = Rc::new(postings_list);
let postings_list_view = PostingsListView::original(input, postings_list);
let mut offset = 0;
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
let document_id = group[0].document_id;
let bare_match = BareMatch {
document_id,
query_index: query_index as u16,
distance,
is_exact,
postings_list: posting_list_index,
};
total_postings_lists.push(bare_match);
offset += group.len();
}
}
postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
}
debug!("{:?} gives {} words", query, number_of_words);
}
debug!("stream next took {:.02?}", stream_next_time);
debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
debug!("dfa creation took {:.02?}", dfa_time);
Ok(total_postings_lists)
}
#[derive(Debug)]
pub struct QueryWordAutomaton {
pub query: String,
/// Is it a word that must be considered exact
/// or is it some derived word (i.e. a synonym)
pub is_exact: bool,
pub is_prefix: bool,
/// If it's a phrase query and what is
/// its index an the length of the phrase
pub phrase_query: Option<(u16, u16)>,
}
impl QueryWordAutomaton {
pub fn exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: false,
phrase_query: None,
}
}
pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: true,
phrase_query: None,
}
}
pub fn non_exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: false,
is_prefix: false,
phrase_query: None,
}
}
pub fn dfa(&self) -> DFA {
if self.phrase_query.is_some() {
build_exact_dfa(&self.query)
} else if self.is_prefix {
build_prefix_dfa(&self.query)
} else {
build_dfa(&self.query)
}
}
}
fn split_best_frequency<'a>(
reader: &heed::RoTxn<MainT>,
word: &'a str,
postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len());
let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len());
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn construct_automatons(
reader: &heed::RoTxn<MainT>,
query: &str,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms,
) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
Some(synonym) => synonym,
None => fst::Set::default(),
};
let mut automaton_index = 0;
let mut automatons = Vec::new();
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
// We must not declare the original words to the query enhancer
// *but* we need to push them in the automatons list first
let mut original_words = query_words.iter().peekable();
while let Some(word) = original_words.next() {
let has_following_word = original_words.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
let automaton = if not_prefix_dfa {
QueryWordAutomaton::exact(word)
} else {
QueryWordAutomaton::exact_prefix(word)
};
automaton_index += 1;
automatons.push(automaton);
}
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
while let Some((query_index, ngram_slice)) = ngrams.next() {
let query_range = query_index..query_index + n;
let ngram_nb_words = ngram_slice.len();
let ngram = ngram_slice.join(" ");
let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa =
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
// automaton of synonyms of the ngrams
let normalized = normalize_str(&ngram);
let lev = if not_prefix_dfa {
build_dfa(&normalized)
} else {
build_prefix_dfa(&normalized)
};
let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() {
// only trigger alternatives when the last word has been typed
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
let base = std::str::from_utf8(base).unwrap();
let base_nb_words = split_query_string(base).count();
if ngram_nb_words != base_nb_words {
continue;
}
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
let mut stream = synonyms.into_stream();
while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap();
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
let nb_synonym_words = synonyms_words.len();
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
for synonym in synonyms_words {
let automaton = if nb_synonym_words == 1 {
QueryWordAutomaton::exact(synonym)
} else {
QueryWordAutomaton::non_exact(synonym)
};
automaton_index += 1;
automatons.push(automaton);
}
}
}
}
if n == 1 {
// automatons for splitted words
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
let mut left_automaton = QueryWordAutomaton::exact(left);
left_automaton.phrase_query = Some((0, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(left_automaton);
let mut right_automaton = QueryWordAutomaton::exact(right);
right_automaton.phrase_query = Some((1, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
automaton_index += 1;
automatons.push(right_automaton);
}
} else {
// automaton of concatenation of query words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
let automaton = QueryWordAutomaton::exact(&normalized);
automaton_index += 1;
automatons.push(automaton);
}
}
}
Ok((automatons, enhancer_builder.build()))
}

View File

@ -0,0 +1,37 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::{RawDocument, MResult};
use crate::bucket_sort::SimpleMatch;
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
pub struct Attribute;
impl Criterion for Attribute {
fn name(&self) -> &str { "attribute" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_of_attribute(matches: &[SimpleMatch]) -> usize {
let mut sum_of_attribute = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_of_attribute += group[0].attribute as usize;
}
sum_of_attribute
}
let lhs = sum_of_attribute(&lhs.processed_matches);
let rhs = sum_of_attribute(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,16 +1,16 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use std::cmp::Ordering; use std::cmp::Ordering;
use crate::RawDocument;
use super::{Criterion, Context};
#[derive(Debug, Clone, Copy)]
pub struct DocumentId; pub struct DocumentId;
impl Criterion for DocumentId { impl Criterion for DocumentId {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { fn name(&self) -> &str { "stable document id" }
lhs.id.cmp(&rhs.id)
}
fn name(&self) -> &str { fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
"DocumentId" let lhs = &lhs.id;
let rhs = &rhs.id;
lhs.cmp(rhs)
} }
} }

View File

@ -1,132 +1,78 @@
use std::cmp::Ordering; use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::{HashMap, Entry};
use meilisearch_schema::SchemaAttr; use meilisearch_schema::SchemaAttr;
use sdset::Set;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::{RawDocument, MResult};
use crate::bucket_sort::BareMatch;
use super::{Criterion, Context, ContextMut};
use crate::criterion::Criterion; pub struct Exact;
use crate::RawDocument;
#[inline] impl Criterion for Exact {
fn number_exact_matches( fn name(&self) -> &str { "exact" }
query_index: &[u32],
attribute: &[u16],
is_exact: &[bool],
fields_counts: &Set<(SchemaAttr, u64)>,
) -> usize {
let mut count = 0;
let mut index = 0;
for group in query_index.linear_group() { fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
let len = group.len(); &self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
let store = ctx.documents_fields_counts_store;
let reader = ctx.reader;
let mut found_exact = false; 'documents: for doc in documents {
for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() { doc.bare_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
if *is_exact {
found_exact = true; // mark the document if we find a "one word field" that matches
let attr = &attribute[index + pos]; let mut fields_counts = HashMap::new();
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) { for group in doc.bare_matches.linear_group_by_key(|bm| bm.query_index) {
let (_, count) = fields_counts[pos]; for group in group.linear_group_by_key(|bm| bm.is_exact) {
if count == 1 { if !group[0].is_exact { break }
return usize::max_value();
for bm in group {
for di in ctx.postings_lists[bm.postings_list].as_ref() {
let attr = SchemaAttr(di.attribute);
let count = match fields_counts.entry(attr) {
Entry::Occupied(entry) => *entry.get(),
Entry::Vacant(entry) => {
let count = store.document_field_count(reader, doc.id, attr)?;
*entry.insert(count)
},
};
if count == Some(1) {
doc.contains_one_word_field = true;
continue 'documents
}
}
} }
} }
} }
} }
count += found_exact as usize; Ok(())
index += len;
} }
count fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
} #[inline]
fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
let mut sum_exact_query_words = 0;
#[derive(Debug, Clone, Copy)] for group in matches.linear_group_by_key(|bm| bm.query_index) {
pub struct Exact; sum_exact_query_words += group[0].is_exact as usize;
}
impl Criterion for Exact { sum_exact_query_words
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { }
let lhs = {
let query_index = lhs.query_index();
let is_exact = lhs.is_exact();
let attribute = lhs.attribute();
let fields_counts = &lhs.fields_counts;
number_exact_matches(query_index, attribute, is_exact, fields_counts) // does it contains a "one word field"
}; lhs.contains_one_word_field.cmp(&rhs.contains_one_word_field).reverse()
// if not, with document contains the more exact words
let rhs = { .then_with(|| {
let query_index = rhs.query_index(); let lhs = sum_exact_query_words(&lhs.bare_matches);
let is_exact = rhs.is_exact(); let rhs = sum_exact_query_words(&rhs.bare_matches);
let attribute = rhs.attribute(); lhs.cmp(&rhs).reverse()
let fields_counts = &rhs.fields_counts; })
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"Exact"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "souliereres rouge"
#[test]
fn easy_case() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[false];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "soulier"
//
// doc0: { 0. "soulier" }
// doc1: { 0. "soulier bleu et blanc" }
#[test]
fn basic() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
} }
} }

View File

@ -1,59 +1,75 @@
mod document_id; use std::cmp::{self, Ordering};
use compact_arena::SmallArena;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{store, RawDocument, MResult};
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
use crate::database::MainT;
mod typo;
mod words;
mod proximity;
mod attribute;
mod words_position;
mod exact; mod exact;
mod number_of_words; mod document_id;
mod sort_by_attr; mod sort_by_attr;
mod sum_of_typos;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod words_proximity;
use crate::RawDocument; pub use self::typo::Typo;
use std::cmp::Ordering; pub use self::words::Words;
pub use self::proximity::Proximity;
pub use self::{ pub use self::attribute::Attribute;
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords, pub use self::words_position::WordsPosition;
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos, pub use self::exact::Exact;
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, pub use self::document_id::DocumentId;
words_proximity::WordsProximity, pub use self::sort_by_attr::SortByAttr;
};
pub trait Criterion: Send + Sync {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
pub trait Criterion {
fn name(&self) -> &str; fn name(&self) -> &str;
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
_documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
Ok(())
}
fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> Ordering;
#[inline] #[inline]
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>(
self.evaluate(lhs, rhs) == Ordering::Equal &self,
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> bool
{
self.evaluate(ctx, lhs, rhs) == Ordering::Equal
} }
} }
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { pub reader: &'h heed::RoTxn<MainT>,
(**self).evaluate(lhs, rhs) pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
} pub query_enhancer: &'q mut QueryEnhancer,
pub automatons: &'a mut [QueryWordAutomaton],
fn name(&self) -> &str { pub documents_fields_counts_store: store::DocumentsFieldsCounts,
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
} }
impl<T: Criterion + ?Sized> Criterion for Box<T> { pub struct Context<'p, 'tag, 'txn, 'q, 'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
(**self).evaluate(lhs, rhs) pub query_enhancer: &'q QueryEnhancer,
} pub automatons: &'a [QueryWordAutomaton],
fn name(&self) -> &str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
} }
#[derive(Default)] #[derive(Default)]
@ -103,11 +119,11 @@ pub struct Criteria<'a> {
impl<'a> Default for Criteria<'a> { impl<'a> Default for Criteria<'a> {
fn default() -> Self { fn default() -> Self {
CriteriaBuilder::with_capacity(7) CriteriaBuilder::with_capacity(7)
.add(SumOfTypos) .add(Typo)
.add(NumberOfWords) .add(Words)
.add(WordsProximity) .add(Proximity)
.add(SumOfWordsAttribute) .add(Attribute)
.add(SumOfWordsPosition) .add(WordsPosition)
.add(Exact) .add(Exact)
.add(DocumentId) .add(DocumentId)
.build() .build()
@ -119,3 +135,162 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
&self.inner &self.inner
} }
} }
fn prepare_query_distances<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
query_enhancer: &QueryEnhancer,
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) {
for document in documents {
if !document.processed_distances.is_empty() { continue }
let mut processed = Vec::new();
for m in document.bare_matches.iter() {
if postings_lists[m.postings_list].is_empty() { continue }
let range = query_enhancer.replacement(m.query_index as u32);
let new_len = cmp::max(range.end as usize, processed.len());
processed.resize(new_len, None);
for index in range {
let index = index as usize;
processed[index] = match processed[index] {
Some(distance) if distance > m.distance => Some(m.distance),
Some(distance) => Some(distance),
None => Some(m.distance),
};
}
}
document.processed_distances = processed;
}
}
fn prepare_bare_matches<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
query_enhancer: &QueryEnhancer,
) {
for document in documents {
if !document.processed_matches.is_empty() { continue }
let mut processed = Vec::new();
for m in document.bare_matches.iter() {
let postings_list = &postings_lists[m.postings_list];
processed.reserve(postings_list.len());
for di in postings_list.as_ref() {
let simple_match = SimpleMatch {
query_index: m.query_index,
distance: m.distance,
attribute: di.attribute,
word_index: di.word_index,
is_exact: m.is_exact,
};
processed.push(simple_match);
}
}
let processed = multiword_rewrite_matches(&mut processed, query_enhancer);
document.processed_matches = processed.into_vec();
}
}
fn multiword_rewrite_matches(
matches: &mut [SimpleMatch],
query_enhancer: &QueryEnhancer,
) -> SetBuf<SimpleMatch>
{
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
let mut padded_matches = Vec::with_capacity(matches.len());
// let before_padding = Instant::now();
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
// for each match at the same position
// in this document attribute
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for match_ in same_word_index {
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
let mut found = false;
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
for nmatch_ in next_group {
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
let query_index = rep.next().unwrap() as u16;
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i)
{
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
biggest = biggest.max(i + 1);
}
}
padded_matches.push(padmatch);
found = true;
continue 'padding;
}
}
}
// if we do not find a corresponding padding in the
// next groups so stop here and pad what was found
break;
}
if !found {
// if no padding was found in the following matches
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
biggest = biggest.max(replacement_len - 1);
}
}
padding += biggest;
}
}
// debug!("padding matches took {:.02?}", before_padding.elapsed());
// With this check we can see that the loop above takes something
// like 43% of the search time even when no rewrite is needed.
// assert_eq!(before_matches, padded_matches);
SetBuf::from_dirty(padded_matches)
}

View File

@ -1,31 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
query_index.linear_group().count()
}
#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;
impl Criterion for NumberOfWords {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
number_of_query_words(query_index)
};
let rhs = {
let query_index = rhs.query_index();
number_of_query_words(query_index)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"NumberOfWords"
}
}

View File

@ -0,0 +1,68 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::bucket_sort::{SimpleMatch};
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
const MAX_DISTANCE: u16 = 8;
pub struct Proximity;
impl Criterion for Proximity {
fn name(&self) -> &str { "proximity" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
if lhs.attribute != rhs.attribute { MAX_DISTANCE }
else { index_proximity(lhs.word_index, rhs.word_index) }
}
fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
let mut min_prox = u16::max_value();
for a in lhs {
for b in rhs {
let prox = attribute_proximity(*a, *b);
min_prox = cmp::min(min_prox, prox);
}
}
min_prox
}
fn matches_proximity(matches: &[SimpleMatch],) -> u16 {
let mut proximity = 0;
let mut iter = matches.linear_group_by_key(|m| m.query_index);
// iterate over groups by windows of size 2
let mut last = iter.next();
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
proximity += min_proximity(lhs, rhs);
last = Some(rhs);
}
proximity
}
let lhs = matches_proximity(&lhs.processed_matches);
let rhs = matches_proximity(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,10 +1,9 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::error::Error; use std::error::Error;
use std::fmt; use std::fmt;
use crate::criterion::Criterion;
use crate::{RankedMap, RawDocument};
use meilisearch_schema::{Schema, SchemaAttr}; use meilisearch_schema::{Schema, SchemaAttr};
use crate::{RankedMap, RawDocument};
use super::{Criterion, Context};
/// An helper struct that permit to sort documents by /// An helper struct that permit to sort documents by
/// some of their stored attributes. /// some of their stored attributes.
@ -28,11 +27,11 @@ use meilisearch_schema::{Schema, SchemaAttr};
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?; /// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
/// ///
/// let builder = CriteriaBuilder::with_capacity(8) /// let builder = CriteriaBuilder::with_capacity(8)
/// .add(SumOfTypos) /// .add(Typo)
/// .add(NumberOfWords) /// .add(Words)
/// .add(WordsProximity) /// .add(Proximity)
/// .add(SumOfWordsAttribute) /// .add(Attribute)
/// .add(SumOfWordsPosition) /// .add(WordsPosition)
/// .add(Exact) /// .add(Exact)
/// .add(custom_ranking) /// .add(custom_ranking)
/// .add(DocumentId); /// .add(DocumentId);
@ -86,8 +85,12 @@ impl<'a> SortByAttr<'a> {
} }
} }
impl<'a> Criterion for SortByAttr<'a> { impl Criterion for SortByAttr<'_> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { fn name(&self) -> &str {
"sort by attribute"
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(lhs.id, self.attr); let lhs = self.ranked_map.get(lhs.id, self.attr);
let rhs = self.ranked_map.get(rhs.id, self.attr); let rhs = self.ranked_map.get(rhs.id, self.attr);
@ -105,10 +108,6 @@ impl<'a> Criterion for SortByAttr<'a> {
(None, None) => Ordering::Equal, (None, None) => Ordering::Equal,
} }
} }
fn name(&self) -> &str {
"SortByAttr"
}
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]

View File

@ -1,116 +0,0 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
let mut index = 0;
for group in query_index.linear_group() {
sum_typos += custom_log10(distance[index]);
number_words += 1;
index += group.len();
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;
impl Criterion for SumOfTypos {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
sum_matches_typos(query_index, distance)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
sum_matches_typos(query_index, distance)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"SumOfTypos"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test]
fn one_typo_reference() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0, 1];
let distance1 = &[1, 0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchette"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn no_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchztte"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn one_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 1];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
let mut sum_attributes = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_attributes += attribute[index] as usize;
index += group.len();
}
sum_attributes
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;
impl Criterion for SumOfWordsAttribute {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let attribute = lhs.attribute();
sum_matches_attributes(query_index, attribute)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
sum_matches_attributes(query_index, attribute)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"SumOfWordsAttribute"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
#[test]
fn title_vs_description() {
let query_index0 = &[0];
let attribute0 = &[0];
let query_index1 = &[0];
let attribute1 = &[1];
let doc0 = sum_matches_attributes(query_index0, attribute0);
let doc1 = sum_matches_attributes(query_index1, attribute1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
let mut sum_word_index = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_word_index += word_index[index] as usize;
index += group.len();
}
sum_word_index
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;
impl Criterion for SumOfWordsPosition {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let word_index = lhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let word_index = rhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"SumOfWordsPosition"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "Botte rouge et soulier noir"
#[test]
fn easy_case() {
let query_index0 = &[0];
let word_index0 = &[0];
let query_index1 = &[0];
let word_index1 = &[3];
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -0,0 +1,55 @@
use std::cmp::Ordering;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_query_distances};
pub struct Typo;
impl Criterion for Typo {
fn name(&self) -> &str { "typo" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn compute_typos(distances: &[Option<u8>]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
for distance in distances {
if let Some(distance) = distance {
sum_typos += custom_log10(*distance);
number_words += 1;
}
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
let lhs = compute_typos(&lhs.processed_distances);
let rhs = compute_typos(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}

View File

@ -0,0 +1,31 @@
use std::cmp::Ordering;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_query_distances};
pub struct Words;
impl Criterion for Words {
fn name(&self) -> &str { "words" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn number_of_query_words(distances: &[Option<u8>]) -> usize {
distances.iter().cloned().filter(Option::is_some).count()
}
let lhs = number_of_query_words(&lhs.processed_distances);
let rhs = number_of_query_words(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}

View File

@ -0,0 +1,37 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::bucket_sort::SimpleMatch;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
pub struct WordsPosition;
impl Criterion for WordsPosition {
fn name(&self) -> &str { "words position" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_words_position(matches: &[SimpleMatch]) -> usize {
let mut sum_words_position = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_words_position += group[0].word_index as usize;
}
sum_words_position
}
let lhs = sum_words_position(&lhs.processed_matches);
let rhs = sum_words_position(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,164 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::{self, Ordering};
const MAX_DISTANCE: u16 = 8;
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
(a.clone(), b.clone())
}
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
if lattr != rattr {
return MAX_DISTANCE;
}
index_proximity(lwi, rwi)
}
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
let mut min_prox = u16::max_value();
for a in lattr.iter().zip(lwi) {
for b in rattr.iter().zip(rwi) {
let a = clone_tuple(a);
let b = clone_tuple(b);
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
}
}
min_prox
}
fn matches_proximity(
query_index: &[u32],
distance: &[u8],
attribute: &[u16],
word_index: &[u16],
) -> u16 {
let mut query_index_groups = query_index.linear_group();
let mut proximity = 0;
let mut index = 0;
let get_attr_wi = |index: usize, group_len: usize| {
// retrieve the first distance group (with the lowest values)
let len = distance[index..index + group_len]
.linear_group()
.next()
.unwrap()
.len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
(rattr, rwi)
};
let mut last = query_index_groups.next().map(|group| {
let attr_wi = get_attr_wi(index, group.len());
index += group.len();
attr_wi
});
// iter by windows of size 2
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
let attr_wi = get_attr_wi(index, rhs.len());
proximity += min_proximity(lhs, attr_wi);
last = Some(attr_wi);
index += rhs.len();
}
proximity
}
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;
impl Criterion for WordsProximity {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
let attribute = lhs.attribute();
let word_index = lhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
let attribute = rhs.attribute();
let word_index = rhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"WordsProximity"
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn three_different_attributes() {
// "soup" "of the" "the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 0 }
// { id: 2, attr: 1, attr_index: 1 }
// { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 }
let query_index = &[0, 1, 2, 2, 3];
let distance = &[0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 2, 3];
let word_index = &[0, 0, 1, 0, 1];
// soup -> of = 8
// + of -> the = 1
// + the -> day = 8 (not 1)
assert_eq!(
matches_proximity(query_index, distance, attribute, word_index),
17
);
}
#[test]
fn two_different_attributes() {
// "soup day" "soup of the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 0, attr: 1, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 1 }
// { id: 2, attr: 1, attr_index: 2 }
// { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 }
let query_index = &[0, 0, 1, 2, 3, 3];
let distance = &[0, 0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 1, 0, 1];
let word_index = &[0, 0, 1, 2, 1, 3];
// soup -> of = 1
// + of -> the = 1
// + the -> day = 1
assert_eq!(
matches_proximity(query_index, distance, attribute, word_index),
3
);
}
}

View File

@ -3,7 +3,7 @@
extern crate assert_matches; extern crate assert_matches;
mod automaton; mod automaton;
pub mod criterion; mod bucket_sort;
mod database; mod database;
mod distinct_map; mod distinct_map;
mod error; mod error;
@ -12,11 +12,12 @@ mod number;
mod query_builder; mod query_builder;
mod ranked_map; mod ranked_map;
mod raw_document; mod raw_document;
pub mod raw_indexer;
mod reordered_attrs; mod reordered_attrs;
mod update;
pub mod criterion;
pub mod raw_indexer;
pub mod serde; pub mod serde;
pub mod store; pub mod store;
mod update;
pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT};
pub use self::error::{Error, MResult}; pub use self::error::{Error, MResult};
@ -27,61 +28,105 @@ pub use self::store::Index;
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight}; pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
#[doc(hidden)] use compact_arena::SmallArena;
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
pub struct TmpMatch { use crate::levenshtein::prefix_damerau_levenshtein;
pub query_index: u32, use crate::reordered_attrs::ReorderedAttrs;
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Document { pub struct Document {
pub id: DocumentId, pub id: DocumentId,
pub highlights: Vec<Highlight>, pub highlights: Vec<Highlight>,
#[cfg(test)] #[cfg(test)]
pub matches: Vec<TmpMatch>, pub matches: Vec<crate::bucket_sort::SimpleMatch>,
}
fn highlights_from_raw_document<'a, 'tag, 'txn>(
raw_document: &RawDocument<'a, 'tag>,
automatons: &[QueryWordAutomaton],
arena: &SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
) -> Vec<Highlight>
{
let mut highlights = Vec::new();
for bm in raw_document.bare_matches.iter() {
let postings_list = &arena[bm.postings_list];
let input = postings_list.input();
let query = &automatons[bm.query_index as usize].query;
for di in postings_list.iter() {
let covered_area = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
let attribute = searchable_attrs
.and_then(|sa| sa.reverse(di.attribute))
.unwrap_or(di.attribute);
let highlight = Highlight {
attribute: attribute,
char_index: di.char_index,
char_length: covered_area as u16,
};
highlights.push(highlight);
}
}
highlights
} }
impl Document { impl Document {
#[cfg(not(test))] #[cfg(not(test))]
fn from_raw(raw: RawDocument) -> Document { pub fn from_raw<'a, 'tag, 'txn>(
Document { raw_document: RawDocument<'a, 'tag>,
id: raw.id, automatons: &[QueryWordAutomaton],
highlights: raw.highlights, arena: &SmallArena<'tag, PostingsListView<'txn>>,
} searchable_attrs: Option<&ReorderedAttrs>,
) -> Document
{
let highlights = highlights_from_raw_document(
&raw_document,
automatons,
arena,
searchable_attrs,
);
Document { id: raw_document.id, highlights }
} }
#[cfg(test)] #[cfg(test)]
fn from_raw(raw: RawDocument) -> Document { pub fn from_raw<'a, 'tag, 'txn>(
let len = raw.query_index().len(); raw_document: RawDocument<'a, 'tag>,
let mut matches = Vec::with_capacity(len); automatons: &[QueryWordAutomaton],
arena: &SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
) -> Document
{
use crate::bucket_sort::SimpleMatch;
let query_index = raw.query_index(); let highlights = highlights_from_raw_document(
let distance = raw.distance(); &raw_document,
let attribute = raw.attribute(); automatons,
let word_index = raw.word_index(); arena,
let is_exact = raw.is_exact(); searchable_attrs,
);
for i in 0..len { let mut matches = Vec::new();
let match_ = TmpMatch { for sm in raw_document.processed_matches {
query_index: query_index[i], let attribute = searchable_attrs
distance: distance[i], .and_then(|sa| sa.reverse(sm.attribute))
attribute: attribute[i], .unwrap_or(sm.attribute);
word_index: word_index[i],
is_exact: is_exact[i], matches.push(SimpleMatch { attribute, ..sm });
};
matches.push(match_);
} }
matches.sort_unstable();
Document { Document { id: raw_document.id, highlights, matches }
id: raw.id,
matches,
highlights: raw.highlights,
}
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -1,398 +0,0 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};
/// Return `true` if the specified range can accept the given replacements words.
/// Returns `false` if the replacements words are already present in the original query
/// or if there is fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
// new york city subway
// -------- ^^^^
// / \
// [new york city]
//
//
// ## Ignored because smaller than the original
//
// new york city subway
// -------------
// \ /
// [new york]
//
//
// ## Accepted because bigger than the original
//
// NYC subway
// ---
// / \
// / \
// / \
// / \
// / \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there is fewer or equal replacement words
// than there is already in the replaced range
return false
}
// retrieve the part to rewrite but with the length
// of the replacement part
let original = query.iter().skip(range.start).take(words.len());
// check if the original query doesn't already contain
// the replacement words
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}
type Origin = usize;
type RealLength = usize;
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl FakeIntervalTree {
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
FakeIntervalTree { intervals }
}
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end { Equal } else { Less }
} else { Greater }
});
let n = match element { Ok(n) => n, Err(n) => n };
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
_otherwise => None,
}
}
}
pub struct QueryEnhancerBuilder<'a, S> {
query: &'a [S],
origins: Vec<usize>,
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origins query indices based on their positions
let origins: Vec<_> = (0..query.len() + 1).collect();
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
QueryEnhancerBuilder { query, origins, real_to_origin }
}
/// Update the final real to origin query indices mapping.
///
/// `range` is the original words range that this `replacement` words replace
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
let previous_padding = self.origins[range.end - 1];
let current_offset = (self.origins[range.end] - 1) - previous_padding;
let diff = offset.saturating_sub(current_offset);
self.origins[range.end] += diff;
for r in &mut self.origins[range.end + 1..] {
*r += diff;
}
}
// we need to store the real number and origins relations
// this way it will be possible to know by how many
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
}
}
}
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) =
self.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 { new_origin = origin + i; break }
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..2); // york
assert_eq!(enhancer.replacement(2), 2..3); // city
assert_eq!(enhancer.replacement(3), 3..4); // subway
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..3); // york
assert_eq!(enhancer.replacement(2), 3..4); // subway
assert_eq!(enhancer.replacement(3), 0..1); // new
assert_eq!(enhancer.replacement(4), 1..2); // york
assert_eq!(enhancer.replacement(5), 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NY
assert_eq!(enhancer.replacement(1), 3..5); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..3); // york
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
assert_eq!(enhancer.replacement(7), 0..3); // NYC
assert_eq!(enhancer.replacement(8), 0..1); // new
assert_eq!(enhancer.replacement(9), 1..2); // york
assert_eq!(enhancer.replacement(10), 2..3); // city
assert_eq!(enhancer.replacement(11), 3..4); // underground
assert_eq!(enhancer.replacement(12), 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NYC
assert_eq!(enhancer.replacement(1), 3..4); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..2); // york
assert_eq!(enhancer.replacement(4), 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..6); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // NYC
assert_eq!(enhancer.replacement(1), 1..3); // subway
assert_eq!(enhancer.replacement(2), 1..2); // underground
assert_eq!(enhancer.replacement(3), 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}
}

View File

@ -1,186 +1,111 @@
use std::fmt; use compact_arena::SmallArena;
use std::sync::Arc; use itertools::EitherOrBoth;
use meilisearch_schema::SchemaAttr;
use sdset::SetBuf; use sdset::SetBuf;
use slice_group_by::GroupBy; use crate::DocIndex;
use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
use crate::reordered_attrs::ReorderedAttrs;
use crate::{DocumentId, Highlight, TmpMatch}; pub struct RawDocument<'a, 'tag> {
pub id: crate::DocumentId,
#[derive(Clone)] pub bare_matches: &'a mut [BareMatch<'tag>],
pub struct RawDocument { pub processed_matches: Vec<SimpleMatch>,
pub id: DocumentId, /// The list of minimum `distance` found
pub matches: SharedMatches, pub processed_distances: Vec<Option<u8>>,
pub highlights: Vec<Highlight>, /// Does this document contains a field
pub fields_counts: SetBuf<(SchemaAttr, u64)>, /// with one word that is exactly matching
pub contains_one_word_field: bool,
} }
impl RawDocument { impl<'a, 'tag> RawDocument<'a, 'tag> {
pub fn query_index(&self) -> &[u32] { pub fn new<'txn>(
let r = self.matches.range; bare_matches: &'a mut [BareMatch<'tag>],
// it is safe because construction/modifications automatons: &[QueryWordAutomaton],
// can only be done in this module postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
unsafe { searchable_attrs: Option<&ReorderedAttrs>,
&self ) -> Option<RawDocument<'a, 'tag>>
.matches {
.matches if let Some(reordered_attrs) = searchable_attrs {
.query_index for bm in bare_matches.iter() {
.get_unchecked(r.start..r.end) let postings_list = &postings_lists[bm.postings_list];
}
}
pub fn distance(&self) -> &[u8] { let mut rewritten = Vec::new();
let r = self.matches.range; for di in postings_list.iter() {
// it is safe because construction/modifications if let Some(attribute) = reordered_attrs.get(di.attribute) {
// can only be done in this module rewritten.push(DocIndex { attribute, ..*di });
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } }
} }
pub fn attribute(&self) -> &[u16] { let new_postings = SetBuf::from_dirty(rewritten);
let r = self.matches.range; postings_lists[bm.postings_list].rewrite_with(new_postings);
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe {
&self
.matches
.matches
.word_index
.get_unchecked(r.start..r.end)
}
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
}
impl fmt::Debug for RawDocument {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("RawDocument {\r\n")?;
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"query_index",
self.query_index()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"distance",
self.distance()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"attribute",
self.attribute()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"word_index",
self.word_index()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"is_exact",
self.is_exact()
))?;
f.write_str("}")?;
Ok(())
}
}
pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>,
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
) -> Vec<RawDocument> {
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
let matches = matches.linear_group_by_key(|(id, _)| *id);
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
debug_assert_eq!(mgroup[0].0, fgroup[0].0);
let document_id = mgroup[0].0;
let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
let end = start + mgroup.len();
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
matches2.extend_from_slice(mgroup);
}
let matches = Arc::new(matches2);
docs_ranges
.into_iter()
.map(|(id, range, highlights, fields_counts)| {
let matches = SharedMatches {
range,
matches: matches.clone(),
};
RawDocument {
id,
matches,
highlights,
fields_counts,
} }
}
bare_matches.sort_unstable_by_key(|m| m.query_index);
let mut previous_word = None;
for i in 0..bare_matches.len() {
let a = &bare_matches[i];
let auta = &automatons[a.query_index as usize];
match auta.phrase_query {
Some((0, _)) => {
let b = match bare_matches.get(i + 1) {
Some(b) => b,
None => {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
continue;
}
};
if a.query_index + 1 != b.query_index {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
continue
}
let pla = &postings_lists[a.postings_list];
let plb = &postings_lists[b.postings_list];
let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
});
let mut newa = Vec::new();
let mut newb = Vec::new();
for eb in iter {
if let EitherOrBoth::Both(a, b) = eb {
newa.push(*a);
newb.push(*b);
}
}
if !newa.is_empty() {
previous_word = Some(a.query_index);
}
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
},
Some((1, _)) => {
if previous_word.take() != Some(a.query_index - 1) {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
}
},
Some((_, _)) => unreachable!(),
None => (),
}
}
if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
return None
}
Some(RawDocument {
id: bare_matches[0].document_id,
bare_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
contains_one_word_field: false,
}) })
.collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u16>,
is_exact: Vec<bool>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
}
} }
} }

View File

@ -1,27 +1,31 @@
use std::cmp;
#[derive(Default, Clone)] #[derive(Default, Clone)]
pub struct ReorderedAttrs { pub struct ReorderedAttrs {
count: usize,
reorders: Vec<Option<u16>>, reorders: Vec<Option<u16>>,
reverse: Vec<u16>,
} }
impl ReorderedAttrs { impl ReorderedAttrs {
pub fn new() -> ReorderedAttrs { pub fn new() -> ReorderedAttrs {
ReorderedAttrs { ReorderedAttrs { reorders: Vec::new(), reverse: Vec::new() }
count: 0,
reorders: Vec::new(),
}
} }
pub fn insert_attribute(&mut self, attribute: u16) { pub fn insert_attribute(&mut self, attribute: u16) {
self.reorders.resize(attribute as usize + 1, None); let new_len = cmp::max(attribute as usize + 1, self.reorders.len());
self.reorders[attribute as usize] = Some(self.count as u16); self.reorders.resize(new_len, None);
self.count += 1; self.reorders[attribute as usize] = Some(self.reverse.len() as u16);
self.reverse.push(attribute);
} }
pub fn get(&self, attribute: u16) -> Option<u16> { pub fn get(&self, attribute: u16) -> Option<u16> {
match self.reorders.get(attribute as usize) { match self.reorders.get(attribute as usize)? {
Some(Some(attribute)) => Some(*attribute), Some(attribute) => Some(*attribute),
_ => None, None => None,
} }
} }
pub fn reverse(&self, attribute: u16) -> Option<u16> {
self.reverse.get(attribute as usize).copied()
}
} }

View File

@ -325,7 +325,7 @@ where
txn, txn,
document_id, document_id,
attribute, attribute,
number_of_words as u64, number_of_words as u16,
)?; )?;
} }
} }

View File

@ -7,7 +7,7 @@ use meilisearch_schema::SchemaAttr;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts { pub struct DocumentsFieldsCounts {
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>, pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u16>>,
} }
impl DocumentsFieldsCounts { impl DocumentsFieldsCounts {
@ -16,7 +16,7 @@ impl DocumentsFieldsCounts {
writer: &mut heed::RwTxn<MainT>, writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
value: u64, value: u16,
) -> ZResult<()> { ) -> ZResult<()> {
let key = DocumentAttrKey::new(document_id, attribute); let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields_counts.put(writer, &key, &value) self.documents_fields_counts.put(writer, &key, &value)
@ -42,7 +42,7 @@ impl DocumentsFieldsCounts {
reader: &heed::RoTxn<MainT>, reader: &heed::RoTxn<MainT>,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
) -> ZResult<Option<u64>> { ) -> ZResult<Option<u16>> {
let key = DocumentAttrKey::new(document_id, attribute); let key = DocumentAttrKey::new(document_id, attribute);
match self.documents_fields_counts.get(reader, &key)? { match self.documents_fields_counts.get(reader, &key)? {
Some(count) => Ok(Some(count)), Some(count) => Ok(Some(count)),
@ -79,11 +79,11 @@ impl DocumentsFieldsCounts {
} }
pub struct DocumentFieldsCountsIter<'txn> { pub struct DocumentFieldsCountsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>, iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
} }
impl Iterator for DocumentFieldsCountsIter<'_> { impl Iterator for DocumentFieldsCountsIter<'_> {
type Item = ZResult<(SchemaAttr, u64)>; type Item = ZResult<(SchemaAttr, u16)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() { match self.iter.next() {
@ -99,7 +99,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
pub struct DocumentsIdsIter<'txn> { pub struct DocumentsIdsIter<'txn> {
last_seen_id: Option<DocumentId>, last_seen_id: Option<DocumentId>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>, iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
} }
impl Iterator for DocumentsIdsIter<'_> { impl Iterator for DocumentsIdsIter<'_> {
@ -123,11 +123,11 @@ impl Iterator for DocumentsIdsIter<'_> {
} }
pub struct AllDocumentsFieldsCountsIter<'txn> { pub struct AllDocumentsFieldsCountsIter<'txn> {
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>, iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
} }
impl Iterator for AllDocumentsFieldsCountsIter<'_> { impl Iterator for AllDocumentsFieldsCountsIter<'_> {
type Item = ZResult<(DocumentId, SchemaAttr, u64)>; type Item = ZResult<(DocumentId, SchemaAttr, u16)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() { match self.iter.next() {

View File

@ -310,11 +310,11 @@ impl<'a> SearchBuilder<'a> {
if let Some(ranking_rules_order) = ranking_order { if let Some(ranking_rules_order) = ranking_order {
for rule in ranking_rules_order { for rule in ranking_rules_order {
match rule.as_str() { match rule.as_str() {
"_sum_of_typos" => builder.push(SumOfTypos), "_typo" => builder.push(Typo),
"_number_of_words" => builder.push(NumberOfWords), "_words" => builder.push(Words),
"_word_proximity" => builder.push(WordsProximity), "_proximity" => builder.push(Proximity),
"_sum_of_words_attribute" => builder.push(SumOfWordsAttribute), "_attribute" => builder.push(Attribute),
"_sum_of_words_position" => builder.push(SumOfWordsPosition), "_words_position" => builder.push(WordsPosition),
"_exact" => builder.push(Exact), "_exact" => builder.push(Exact),
_ => { _ => {
let order = match ranking_rules.get(rule.as_str()) { let order = match ranking_rules.get(rule.as_str()) {
@ -340,11 +340,11 @@ impl<'a> SearchBuilder<'a> {
builder.push(DocumentId); builder.push(DocumentId);
return Ok(Some(builder.build())); return Ok(Some(builder.build()));
} else { } else {
builder.push(SumOfTypos); builder.push(Typo);
builder.push(NumberOfWords); builder.push(Words);
builder.push(WordsProximity); builder.push(Proximity);
builder.push(SumOfWordsAttribute); builder.push(Attribute);
builder.push(SumOfWordsPosition); builder.push(WordsPosition);
builder.push(Exact); builder.push(Exact);
for (rule, order) in ranking_rules.iter() { for (rule, order) in ranking_rules.iter() {
let custom_ranking = match order { let custom_ranking = match order {