diff --git a/Cargo.lock b/Cargo.lock index 2dedeb04a..750cdc30c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -257,6 +257,11 @@ dependencies = [ "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "compact_arena" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "const-random" version = "0.1.6" @@ -937,6 +942,7 @@ dependencies = [ "bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)", + "compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -946,6 +952,8 @@ dependencies = [ "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", + "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", "meilisearch-schema 0.8.4", @@ -954,7 +962,7 @@ dependencies = [ "once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)", "siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1684,7 +1692,7 @@ dependencies = [ [[package]] name = "sdset" -version = "0.3.3" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -2648,6 +2656,7 @@ dependencies = [ "checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c" "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ab08c5bed92075075d5db5149887a477b2dc0318c40882a0dfbd34315ac6141" "checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe" "checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59" "checksum cookie 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5" @@ -2798,7 +2807,7 @@ dependencies = [ "checksum same-file 
1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421" "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" "checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c" -"checksum sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b6d2447743d6c37b6d67af88d9c0f1fc92989e2d9745d9b2f3d305b906a90195" +"checksum sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5bfd7aab2bcae693c563b40fbbaf87d60c9b6f2a60d55ed69a9c761e3d4c63c9" "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" "checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0" diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index a268c6605..3b19369f8 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -10,12 +10,14 @@ arc-swap = "0.4.3" bincode = "1.1.4" byteorder = "1.3.2" chrono = { version = "0.4.9", features = ["serde"] } +compact_arena = "0.4.0" crossbeam-channel = "0.4.0" deunicode = "1.0.0" env_logger = "0.7.0" fst = { version = "0.3.5", default-features = false } hashbrown = { version = "0.6.0", features = ["serde"] } heed = "0.6.1" +itertools = "0.8.2" # kill me please levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } log = "0.4.8" meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" } @@ -23,7 +25,7 @@ meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.8.4" } meilisearch-types = { path = "../meilisearch-types", version = "0.8.4" } once_cell = "1.2.0" ordered-float = { version = "1.0.2", features = ["serde"] } -sdset = "0.3.3" +sdset = "0.3.6" serde = { version = "1.0.101", features = ["derive"] } serde_json = "1.0.41" siphasher = "0.3.1" @@ -35,6 +37,7 @@ assert_matches = "1.3" criterion = "0.3" csv = "1.0.7" indexmap = { version = "1.2.0", features = ["serde-1"] } +jemallocator = "0.3.2" rustyline = { version = "5.0.0", default-features = false } structopt = "0.3.2" tempfile = "3.1.0" diff --git a/meilisearch-core/examples/from_file.rs b/meilisearch-core/examples/from_file.rs index dff8d1b2a..c0b50362c 100644 --- a/meilisearch-core/examples/from_file.rs +++ b/meilisearch-core/examples/from_file.rs @@ -1,5 +1,5 @@ -use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::HashSet; +use std::collections::btree_map::{BTreeMap, Entry}; use std::error::Error; use std::io::{Read, Write}; use std::iter::FromIterator; @@ -15,6 +15,10 @@ use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use meilisearch_core::{Database, Highlight, ProcessedUpdateResult}; use meilisearch_schema::SchemaAttr; +// #[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + #[derive(Debug, StructOpt)] struct IndexCommand { /// The destination where the database must be created. 
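Note on the allocator change above: the example registers jemalloc unconditionally, and the commented-out #[cfg(target_os = "linux")] line suggests gating it per platform was considered. A minimal sketch of what that gating could look like (hypothetical, not part of this patch; it reuses the same jemallocator dependency):

#[cfg(target_os = "linux")] // hypothetical platform gate, not in this diff
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

When the cfg does not match, the static is compiled out and the binary simply falls back to the system allocator.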
diff --git a/meilisearch-core/src/automaton/dfa.rs b/meilisearch-core/src/automaton/dfa.rs index 6258da424..da1a6eb39 100644 --- a/meilisearch-core/src/automaton/dfa.rs +++ b/meilisearch-core/src/automaton/dfa.rs @@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA { pub fn build_dfa(query: &str) -> DFA { build_dfa_with_setting(query, PrefixSetting::NoPrefix) } + +pub fn build_exact_dfa(query: &str) -> DFA { + let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true)); + builder.build_dfa(query) +} diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index a803eee8e..ef9bf5324 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -1,125 +1,13 @@ mod dfa; mod query_enhancer; -use std::cmp::Reverse; -use std::{cmp, vec}; +use meilisearch_tokenizer::is_cjk; -use fst::{IntoStreamer, Streamer}; -use levenshtein_automata::DFA; -use meilisearch_tokenizer::{is_cjk, split_query_string}; - -use crate::database::MainT; -use crate::error::MResult; -use crate::store; - -use self::dfa::{build_dfa, build_prefix_dfa}; +pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; pub use self::query_enhancer::QueryEnhancer; -use self::query_enhancer::QueryEnhancerBuilder; +pub use self::query_enhancer::QueryEnhancerBuilder; -const NGRAMS: usize = 3; - -pub struct AutomatonProducer { - automatons: Vec<AutomatonGroup>, -} - -impl AutomatonProducer { - pub fn new( - reader: &heed::RoTxn<MainT>, - query: &str, - main_store: store::Main, - postings_list_store: store::PostingsLists, - synonyms_store: store::Synonyms, - ) -> MResult<(AutomatonProducer, QueryEnhancer)> { - let (automatons, query_enhancer) = generate_automatons( - reader, - query, - main_store, - postings_list_store, - synonyms_store, - )?; - - Ok((AutomatonProducer { automatons }, query_enhancer)) - } - - pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> { - self.automatons.into_iter() - } -} - -#[derive(Debug)] -pub struct AutomatonGroup { - pub is_phrase_query: bool, - pub automatons: Vec<Automaton>, -} - -impl AutomatonGroup { - fn normal(automatons: Vec<Automaton>) -> AutomatonGroup { - AutomatonGroup { - is_phrase_query: false, - automatons, - } - } - - fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup { - AutomatonGroup { - is_phrase_query: true, - automatons, - } - } -} - -#[derive(Debug)] -pub struct Automaton { - pub index: usize, - pub ngram: usize, - pub query_len: usize, - pub is_exact: bool, - pub is_prefix: bool, - pub query: String, -} - -impl Automaton { - pub fn dfa(&self) -> DFA { - if self.is_prefix { - build_prefix_dfa(&self.query) - } else { - build_dfa(&self.query) - } - } - - fn exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: true, - is_prefix: false, - query: query.to_string(), - } - } - - fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: true, - is_prefix: true, - query: query.to_string(), - } - } - - fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: false, - is_prefix: false, - query: query.to_string(), - } - } -} +pub const NGRAMS: usize = 3; pub fn normalize_str(string: &str) -> String { let mut string = string.to_lowercase(); @@ -130,167 +18,3 @@ pub fn normalize_str(string: &str) -> String { string } - -fn split_best_frequency<'a>( - reader: &heed::RoTxn<MainT>, - word: &'a str, - postings_lists_store:
store::PostingsLists, -) -> MResult<Option<(&'a str, &'a str)>> { - let chars = word.char_indices().skip(1); - let mut best = None; - - for (i, _) in chars { - let (left, right) = word.split_at(i); - - let left_freq = postings_lists_store - .postings_list(reader, left.as_ref())? - .map_or(0, |i| i.len()); - - let right_freq = postings_lists_store - .postings_list(reader, right.as_ref())? - .map_or(0, |i| i.len()); - - let min_freq = cmp::min(left_freq, right_freq); - if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { - best = Some((min_freq, left, right)); - } - } - - Ok(best.map(|(_, l, r)| (l, r))) -} - -fn generate_automatons( - reader: &heed::RoTxn<MainT>, - query: &str, - main_store: store::Main, - postings_lists_store: store::PostingsLists, - synonym_store: store::Synonyms, -) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> { - let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); - let synonyms = match main_store.synonyms_fst(reader)? { - Some(synonym) => synonym, - None => fst::Set::default(), - }; - - let mut automaton_index = 0; - let mut automatons = Vec::new(); - let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); - - // We must not declare the original words to the query enhancer - // *but* we need to push them in the automatons list first - let mut original_automatons = Vec::new(); - let mut original_words = query_words.iter().peekable(); - while let Some(word) = original_words.next() { - let has_following_word = original_words.peek().is_some(); - let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); - - let automaton = if not_prefix_dfa { - Automaton::exact(automaton_index, 1, word) - } else { - Automaton::prefix_exact(automaton_index, 1, word) - }; - automaton_index += 1; - original_automatons.push(automaton); - } - - automatons.push(AutomatonGroup::normal(original_automatons)); - - for n in 1..=NGRAMS { - let mut ngrams = query_words.windows(n).enumerate().peekable(); - while let Some((query_index, ngram_slice)) = ngrams.next() { - let query_range = query_index..query_index + n; - let ngram_nb_words = ngram_slice.len(); - let ngram = ngram_slice.join(" "); - - let has_following_word = ngrams.peek().is_some(); - let not_prefix_dfa = - has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); - - // automaton of synonyms of the ngrams - let normalized = normalize_str(&ngram); - let lev = if not_prefix_dfa { - build_dfa(&normalized) - } else { - build_prefix_dfa(&normalized) - }; - - let mut stream = synonyms.search(&lev).into_stream(); - while let Some(base) = stream.next() { - // only trigger alternatives when the last word has been typed - // i.e. "new " do not but "new yo" triggers alternatives to "new york" - let base = std::str::from_utf8(base).unwrap(); - let base_nb_words = split_query_string(base).count(); - if ngram_nb_words != base_nb_words { - continue; - } - - if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())?
{ - let mut stream = synonyms.into_stream(); - while let Some(synonyms) = stream.next() { - let synonyms = std::str::from_utf8(synonyms).unwrap(); - let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); - let nb_synonym_words = synonyms_words.len(); - - let real_query_index = automaton_index; - enhancer_builder.declare( - query_range.clone(), - real_query_index, - &synonyms_words, - ); - - for synonym in synonyms_words { - let automaton = if nb_synonym_words == 1 { - Automaton::exact(automaton_index, n, synonym) - } else { - Automaton::non_exact(automaton_index, n, synonym) - }; - automaton_index += 1; - automatons.push(AutomatonGroup::normal(vec![automaton])); - } - } - } - } - - if n == 1 { - if let Some((left, right)) = - split_best_frequency(reader, &normalized, postings_lists_store)? - { - let a = Automaton::exact(automaton_index, 1, left); - enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); - automaton_index += 1; - - let b = Automaton::exact(automaton_index, 1, right); - enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); - automaton_index += 1; - - automatons.push(AutomatonGroup::phrase_query(vec![a, b])); - } - } else { - // automaton of concatenation of query words - let concat = ngram_slice.concat(); - let normalized = normalize_str(&concat); - - let real_query_index = automaton_index; - enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); - - let automaton = Automaton::exact(automaton_index, n, &normalized); - automaton_index += 1; - automatons.push(AutomatonGroup::normal(vec![automaton])); - } - } - } - - // order automatons, the most important first, - // we keep the original automatons at the front. - automatons[1..].sort_by_key(|group| { - let a = group.automatons.first().unwrap(); - ( - Reverse(a.is_exact), - a.ngram, - Reverse(group.automatons.len()), - ) - }); - - Ok((automatons, enhancer_builder.build())) -} diff --git a/meilisearch-core/src/automaton/query_enhancer.rs b/meilisearch-core/src/automaton/query_enhancer.rs index 2194f3ff1..4b7582dd5 100644 --- a/meilisearch-core/src/automaton/query_enhancer.rs +++ b/meilisearch-core/src/automaton/query_enhancer.rs @@ -58,6 +58,7 @@ where type Origin = usize; type RealLength = usize; +#[derive(Debug)] struct FakeIntervalTree { intervals: Vec<(Range<usize>, (Origin, RealLength))>, } @@ -142,67 +143,80 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> { // we need to pad real query indices let real_range = real..real + replacement.len().max(range.len()); let real_length = replacement.len(); - self.real_to_origin - .push((real_range, (range.start, real_length))); + self.real_to_origin.push((real_range, (range.start, real_length))); } pub fn build(self) -> QueryEnhancer { - QueryEnhancer { - origins: self.origins, - real_to_origin: FakeIntervalTree::new(self.real_to_origin), + let interval_tree = FakeIntervalTree::new(self.real_to_origin); + let mut table = Vec::new(); + + for real in 0.. { + match replacement(&self.origins, &interval_tree, real) { + Some(range) => table.push(range), + None => break, + } } + + QueryEnhancer { table } } } +/// Returns the query indices that represent this real query index.
+fn replacement( + origins: &[usize], + real_to_origin: &FakeIntervalTree, + real: u32, +) -> Option<Range<u32>> +{ + let real = real as usize; + + // query the fake interval tree with the real query index + let (range, (origin, real_length)) = real_to_origin.query(real)?; + + // if `real` is the end bound of the range + if (range.start + real_length - 1) == real { + let mut count = range.len(); + let mut new_origin = origin; + for (i, slice) in origins[new_origin..].windows(2).enumerate() { + let len = slice[1] - slice[0]; + count = count.saturating_sub(len); + if count == 0 { + new_origin = origin + i; + break; + } + } + + let n = real - range.start; + let start = origins[origin]; + let end = origins.get(new_origin + 1)?; + let remaining = (end - start) - n; + + Some(Range { + start: (start + n) as u32, + end: (start + n + remaining) as u32, + }) + } else { + // just return the origin along with + // the real position of the word + let n = real as usize - range.start; + let origin = origins[origin]; + + Some(Range { + start: (origin + n) as u32, + end: (origin + n + 1) as u32, + }) + } +} + +#[derive(Debug)] pub struct QueryEnhancer { - origins: Vec<usize>, - real_to_origin: FakeIntervalTree, + table: Vec<Range<u32>>, } impl QueryEnhancer { - /// Returns the query indices to use to replace this real query index. + /// Returns the query indices that represent this real query index. pub fn replacement(&self, real: u32) -> Range<u32> { - let real = real as usize; - - // query the fake interval tree with the real query index - let (range, (origin, real_length)) = self - .real_to_origin - .query(real) - .expect("real has never been declared"); - - // if `real` is the end bound of the range - if (range.start + real_length - 1) == real { - let mut count = range.len(); - let mut new_origin = origin; - for (i, slice) in self.origins[new_origin..].windows(2).enumerate() { - let len = slice[1] - slice[0]; - count = count.saturating_sub(len); - if count == 0 { - new_origin = origin + i; - break; - } - } - - let n = real - range.start; - let start = self.origins[origin]; - let end = self.origins[new_origin + 1]; - let remaining = (end - start) - n; - - Range { - start: (start + n) as u32, - end: (start + n + remaining) as u32, - } - } else { - // just return the origin along with - // the real position of the word - let n = real as usize - range.start; - let origin = self.origins[origin]; - - Range { - start: (origin + n) as u32, - end: (origin + n + 1) as u32, - } - } + self.table[real as usize].clone() } } diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs new file mode 100644 index 000000000..5a819962e --- /dev/null +++ b/meilisearch-core/src/bucket_sort.rs @@ -0,0 +1,717 @@ +use std::ops::Deref; +use std::{cmp, fmt}; +use std::borrow::Cow; +use std::mem; +use std::ops::Range; +use std::rc::Rc; +use std::time::{Duration, Instant}; + +use compact_arena::{SmallArena, Idx32, mk_arena}; +use fst::{IntoStreamer, Streamer}; +use hashbrown::HashMap; +use levenshtein_automata::DFA; +use log::debug; +use meilisearch_tokenizer::{is_cjk, split_query_string}; +use meilisearch_types::DocIndex; +use sdset::{Set, SetBuf}; +use slice_group_by::{GroupBy, GroupByMut}; + +use crate::automaton::NGRAMS; +use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; +use crate::automaton::normalize_str; +use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; + +use crate::criterion::{Criteria, Context, ContextMut}; +use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; +use
crate::raw_document::RawDocument; +use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; +use crate::{store, Document, DocumentId, MResult}; + +pub fn bucket_sort<'c, FI>( + reader: &heed::RoTxn<MainT>, + query: &str, + range: Range<usize>, + filter: Option<FI>, + criteria: Criteria<'c>, + searchable_attrs: Option<ReorderedAttrs>, + main_store: store::Main, + postings_lists_store: store::PostingsLists, + documents_fields_counts_store: store::DocumentsFieldsCounts, + synonyms_store: store::Synonyms, +) -> MResult<Vec<Document>> +where + FI: Fn(DocumentId) -> bool, +{ + // We delegate the filter work to the distinct query builder, + // specifying a distinct rule that has no effect. + if filter.is_some() { + let distinct = |_| None; + let distinct_size = 1; + return bucket_sort_with_distinct( + reader, + query, + range, + filter, + distinct, + distinct_size, + criteria, + searchable_attrs, + main_store, + postings_lists_store, + documents_fields_counts_store, + synonyms_store, + ); + } + + let (mut automatons, mut query_enhancer) = + construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; + + debug!("{:?}", query_enhancer); + + let before_postings_lists_fetching = Instant::now(); + mk_arena!(arena); + let mut bare_matches = + fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?; + debug!("bare matches ({}) retrieved in {:.02?}", + bare_matches.len(), + before_postings_lists_fetching.elapsed(), + ); + + let before_raw_documents_presort = Instant::now(); + bare_matches.sort_unstable_by_key(|sm| sm.document_id); + debug!("sort by document ids took {:.02?}", before_raw_documents_presort.elapsed()); + + let before_raw_documents_building = Instant::now(); + let mut prefiltered_documents = 0; + let mut raw_documents = Vec::new(); + for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { + prefiltered_documents += 1; + if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) { + raw_documents.push(raw_document); + } + } + debug!("creating {} (original {}) candidate documents took {:.02?}", + raw_documents.len(), + prefiltered_documents, + before_raw_documents_building.elapsed(), + ); + + let mut groups = vec![raw_documents.as_mut_slice()]; + + 'criteria: for criterion in criteria.as_ref() { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut documents_seen = 0; + + for mut group in tmp_groups { + let before_criterion_preparation = Instant::now(); + + let ctx = ContextMut { + reader, + postings_lists: &mut arena, + query_enhancer: &mut query_enhancer, + automatons: &mut automatons, + documents_fields_counts_store, + }; + + criterion.prepare(ctx, &mut group)?; + debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); + + let ctx = Context { + postings_lists: &arena, + query_enhancer: &query_enhancer, + automatons: &automatons, + }; + + let before_criterion_sort = Instant::now(); + group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b)); + debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); + + for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) { + debug!("{:?} produced a group of size {}", criterion.name(), group.len()); + + documents_seen += group.len(); + groups.push(group); + + // we have sorted enough documents if the last sorted document is after + // the end of the requested range; we can continue to the next criterion + if documents_seen >= range.end { + continue 'criteria;
+ } + } + } + + let iter = raw_documents.into_iter().skip(range.start).take(range.len()); + let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena, searchable_attrs.as_ref())); + + Ok(iter.collect()) +} + +pub fn bucket_sort_with_distinct<'c, FI, FD>( + reader: &heed::RoTxn<MainT>, + query: &str, + range: Range<usize>, + filter: Option<FI>, + distinct: FD, + distinct_size: usize, + criteria: Criteria<'c>, + searchable_attrs: Option<ReorderedAttrs>, + main_store: store::Main, + postings_lists_store: store::PostingsLists, + documents_fields_counts_store: store::DocumentsFieldsCounts, + synonyms_store: store::Synonyms, +) -> MResult<Vec<Document>> +where + FI: Fn(DocumentId) -> bool, + FD: Fn(DocumentId) -> Option<String>, +{ + let (mut automatons, mut query_enhancer) = + construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; + + let before_postings_lists_fetching = Instant::now(); + mk_arena!(arena); + let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?; + debug!("bare matches ({}) retrieved in {:.02?}", + bare_matches.len(), + before_postings_lists_fetching.elapsed(), + ); + + let before_raw_documents_presort = Instant::now(); + bare_matches.sort_unstable_by_key(|sm| sm.document_id); + debug!("sort by document ids took {:.02?}", before_raw_documents_presort.elapsed()); + + let before_raw_documents_building = Instant::now(); + let mut prefiltered_documents = 0; + let mut raw_documents = Vec::new(); + for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { + prefiltered_documents += 1; + if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) { + raw_documents.push(raw_document); + } + } + debug!("creating {} (original {}) candidate documents took {:.02?}", + raw_documents.len(), + prefiltered_documents, + before_raw_documents_building.elapsed(), + ); + + let mut groups = vec![raw_documents.as_mut_slice()]; + let mut key_cache = HashMap::new(); + + let mut filter_map = HashMap::new(); + // these two variables inform us of the current distinct map and + // of the raw offset of the start of the group where the + // range.start bound is located according to the distinct function + let mut distinct_map = DistinctMap::new(distinct_size); + let mut distinct_raw_offset = 0; + + 'criteria: for criterion in criteria.as_ref() { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); + let mut documents_seen = 0; + + for mut group in tmp_groups { + // if this group does not overlap with the requested range, + // push it without sorting and splitting it + if documents_seen + group.len() < distinct_raw_offset { + documents_seen += group.len(); + groups.push(group); + continue; + } + + let ctx = ContextMut { + reader, + postings_lists: &mut arena, + query_enhancer: &mut query_enhancer, + automatons: &mut automatons, + documents_fields_counts_store, + }; + + let before_criterion_preparation = Instant::now(); + criterion.prepare(ctx, &mut group)?; + debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); + + let ctx = Context { + postings_lists: &arena, + query_enhancer: &query_enhancer, + automatons: &automatons, + }; + + let before_criterion_sort = Instant::now(); + group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b)); + debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); + + for group in group.binary_group_by_mut(|a, b|
criterion.eq(&ctx, a, b)) { + // we must compute the real distinct len of this sub-group + for document in group.iter() { + let filter_accepted = match &filter { + Some(filter) => { + let entry = filter_map.entry(document.id); + *entry.or_insert_with(|| (filter)(document.id)) + } + None => true, + }; + + if filter_accepted { + let entry = key_cache.entry(document.id); + let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new)); + + match key.clone() { + Some(key) => buf_distinct.register(key), + None => buf_distinct.register_without_key(), + }; + } + + // the requested range end is reached: stop computing distinct + if buf_distinct.len() >= range.end { + break; + } + } + + documents_seen += group.len(); + groups.push(group); + + // if this sub-group does not overlap with the requested range + // we must update the distinct map and its start index + if buf_distinct.len() < range.start { + buf_distinct.transfert_to_internal(); + distinct_raw_offset = documents_seen; + } + + // we have sorted enough documents if the last sorted document is after + // the end of the requested range; we can continue to the next criterion + if buf_distinct.len() >= range.end { + continue 'criteria; + } + } + } + } + + // once we have classified the documents related to the current + // automatons, we save that as the next valid result + let mut seen = BufferedDistinctMap::new(&mut distinct_map); + + let mut documents = Vec::with_capacity(range.len()); + for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) { + let filter_accepted = match &filter { + Some(_) => filter_map.remove(&raw_document.id).unwrap(), + None => true, + }; + + if filter_accepted { + let key = key_cache.remove(&raw_document.id).unwrap(); + let distinct_accepted = match key { + Some(key) => seen.register(key), + None => seen.register_without_key(), + }; + + if distinct_accepted && seen.len() > range.start { + documents.push(Document::from_raw(raw_document, &automatons, &arena, searchable_attrs.as_ref())); + if documents.len() == range.len() { + break; + } + } + } + } + + Ok(documents) +} + +pub struct BareMatch<'tag> { + pub document_id: DocumentId, + pub query_index: u16, + pub distance: u8, + pub is_exact: bool, + pub postings_list: Idx32<'tag>, +} + +impl fmt::Debug for BareMatch<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("BareMatch") + .field("document_id", &self.document_id) + .field("query_index", &self.query_index) + .field("distance", &self.distance) + .field("is_exact", &self.is_exact) + .finish() + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct SimpleMatch { + pub query_index: u16, + pub distance: u8, + pub attribute: u16, + pub word_index: u16, + pub is_exact: bool, +} + +#[derive(Clone)] +pub enum PostingsListView<'txn> { + Original { + input: Rc<[u8]>, + postings_list: Rc<Cow<'txn, Set<DocIndex>>>, + offset: usize, + len: usize, + }, + Rewritten { + input: Rc<[u8]>, + postings_list: SetBuf<DocIndex>, + }, +} + +impl fmt::Debug for PostingsListView<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PostingsListView") + .field("input", &std::str::from_utf8(&self.input()).unwrap()) + .field("postings_list", &self.as_ref()) + .finish() + } +} + +impl<'txn> PostingsListView<'txn> { + pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> { + let len = postings_list.len(); + PostingsListView::Original { input, postings_list, offset: 0, len } + } + + pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) ->
PostingsListView<'txn> { + PostingsListView::Rewritten { input, postings_list } + } + + pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) { + let input = match self { + PostingsListView::Original { input, .. } => input.clone(), + PostingsListView::Rewritten { input, .. } => input.clone(), + }; + *self = PostingsListView::rewritten(input, postings_list); + } + + pub fn len(&self) -> usize { + match self { + PostingsListView::Original { len, .. } => *len, + PostingsListView::Rewritten { postings_list, .. } => postings_list.len(), + } + } + + pub fn input(&self) -> &[u8] { + match self { + PostingsListView::Original { ref input, .. } => input, + PostingsListView::Rewritten { ref input, .. } => input, + } + } + + pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> { + match self { + PostingsListView::Original { input, postings_list, offset, len } => { + assert!(range_offset + range_len <= *len); + PostingsListView::Original { + input: input.clone(), + postings_list: postings_list.clone(), + offset: offset + range_offset, + len: range_len, + } + }, + PostingsListView::Rewritten { .. } => { + panic!("Cannot create a range on a rewritten postings list view"); + } + } + } +} + +impl AsRef<Set<DocIndex>> for PostingsListView<'_> { + fn as_ref(&self) -> &Set<DocIndex> { + self + } +} + +impl Deref for PostingsListView<'_> { + type Target = Set<DocIndex>; + + fn deref(&self) -> &Set<DocIndex> { + match *self { + PostingsListView::Original { ref postings_list, offset, len, .. } => { + Set::new_unchecked(&postings_list[offset..offset + len]) + }, + PostingsListView::Rewritten { ref postings_list, .. } => postings_list, + } + } +} + +fn fetch_matches<'txn, 'tag>( + reader: &'txn heed::RoTxn<MainT>, + automatons: &[QueryWordAutomaton], + arena: &mut SmallArena<'tag, PostingsListView<'txn>>, + main_store: store::Main, + postings_lists_store: store::PostingsLists, +) -> MResult<Vec<BareMatch<'tag>>> +{ + let before_words_fst = Instant::now(); + let words = match main_store.words_fst(reader)? { + Some(words) => words, + None => return Ok(Vec::new()), + }; + debug!("words fst took {:.02?}", before_words_fst.elapsed()); + + let mut total_postings_lists = Vec::new(); + + let mut dfa_time = Duration::default(); + let mut stream_next_time = Duration::default(); + let mut postings_lists_fetching_time = Duration::default(); + + for (query_index, automaton) in automatons.iter().enumerate() { + let before_dfa = Instant::now(); + let dfa = automaton.dfa(); + let QueryWordAutomaton { query, is_exact, .. } = automaton; + dfa_time += before_dfa.elapsed(); + + let mut number_of_words = 0; + let mut stream = words.search(&dfa).into_stream(); + + // while let Some(input) = stream.next() { + loop { + let before_stream_next = Instant::now(); + let input = match stream.next() { + Some(input) => input, + None => break, + }; + stream_next_time += before_stream_next.elapsed(); + + number_of_words += 1; + + let distance = dfa.eval(input).to_u8(); + let is_exact = *is_exact && distance == 0 && input.len() == query.len(); + + let before_postings_lists_fetching = Instant::now(); + if let Some(postings_list) = postings_lists_store.postings_list(reader, input)?
{ + + let input = Rc::from(input); + let postings_list = Rc::new(postings_list); + let postings_list_view = PostingsListView::original(input, postings_list); + + let mut offset = 0; + for group in postings_list_view.linear_group_by_key(|di| di.document_id) { + + let posting_list_index = arena.add(postings_list_view.range(offset, group.len())); + let document_id = group[0].document_id; + let bare_match = BareMatch { + document_id, + query_index: query_index as u16, + distance, + is_exact, + postings_list: posting_list_index, + }; + + total_postings_lists.push(bare_match); + offset += group.len(); + } + } + postings_lists_fetching_time += before_postings_lists_fetching.elapsed(); + } + + debug!("{:?} gives {} words", query, number_of_words); + } + + debug!("stream next took {:.02?}", stream_next_time); + debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time); + debug!("dfa creation took {:.02?}", dfa_time); + + Ok(total_postings_lists) +} + +#[derive(Debug)] +pub struct QueryWordAutomaton { + pub query: String, + /// Is it a word that must be considered exact + /// or is it some derived word (i.e. a synonym) + pub is_exact: bool, + pub is_prefix: bool, + /// If it's a phrase query, what is + /// its index and the length of the phrase + pub phrase_query: Option<(u16, u16)>, +} + +impl QueryWordAutomaton { + pub fn exact(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { + query: query.to_string(), + is_exact: true, + is_prefix: false, + phrase_query: None, + } + } + + pub fn exact_prefix(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { + query: query.to_string(), + is_exact: true, + is_prefix: true, + phrase_query: None, + } + } + + pub fn non_exact(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { + query: query.to_string(), + is_exact: false, + is_prefix: false, + phrase_query: None, + } + } + + pub fn dfa(&self) -> DFA { + if self.phrase_query.is_some() { + build_exact_dfa(&self.query) + } else if self.is_prefix { + build_prefix_dfa(&self.query) + } else { + build_dfa(&self.query) + } + } +} + +fn split_best_frequency<'a>( + reader: &heed::RoTxn<MainT>, + word: &'a str, + postings_lists_store: store::PostingsLists, +) -> MResult<Option<(&'a str, &'a str)>> { + let chars = word.char_indices().skip(1); + let mut best = None; + + for (i, _) in chars { + let (left, right) = word.split_at(i); + + let left_freq = postings_lists_store + .postings_list(reader, left.as_ref())? + .map_or(0, |i| i.len()); + + let right_freq = postings_lists_store + .postings_list(reader, right.as_ref())? + .map_or(0, |i| i.len()); + + let min_freq = cmp::min(left_freq, right_freq); + if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { + best = Some((min_freq, left, right)); + } + } + + Ok(best.map(|(_, l, r)| (l, r))) +} + +fn construct_automatons( + reader: &heed::RoTxn<MainT>, + query: &str, + main_store: store::Main, + postings_lists_store: store::PostingsLists, + synonym_store: store::Synonyms, +) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> { + let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); + let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); + let synonyms = match main_store.synonyms_fst(reader)?
{ + Some(synonym) => synonym, + None => fst::Set::default(), + }; + + let mut automaton_index = 0; + let mut automatons = Vec::new(); + let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); + + // We must not declare the original words to the query enhancer + // *but* we need to push them in the automatons list first + let mut original_words = query_words.iter().peekable(); + while let Some(word) = original_words.next() { + let has_following_word = original_words.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); + + let automaton = if not_prefix_dfa { + QueryWordAutomaton::exact(word) + } else { + QueryWordAutomaton::exact_prefix(word) + }; + automaton_index += 1; + automatons.push(automaton); + } + + for n in 1..=NGRAMS { + let mut ngrams = query_words.windows(n).enumerate().peekable(); + while let Some((query_index, ngram_slice)) = ngrams.next() { + let query_range = query_index..query_index + n; + let ngram_nb_words = ngram_slice.len(); + let ngram = ngram_slice.join(" "); + + let has_following_word = ngrams.peek().is_some(); + let not_prefix_dfa = + has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); + + // automaton of synonyms of the ngrams + let normalized = normalize_str(&ngram); + let lev = if not_prefix_dfa { + build_dfa(&normalized) + } else { + build_prefix_dfa(&normalized) + }; + + let mut stream = synonyms.search(&lev).into_stream(); + while let Some(base) = stream.next() { + // only trigger alternatives when the last word has been typed + // i.e. "new " does not, but "new yo" triggers alternatives to "new york" + let base = std::str::from_utf8(base).unwrap(); + let base_nb_words = split_query_string(base).count(); + if ngram_nb_words != base_nb_words { + continue; + } + + if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? { + let mut stream = synonyms.into_stream(); + while let Some(synonyms) = stream.next() { + let synonyms = std::str::from_utf8(synonyms).unwrap(); + let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); + let nb_synonym_words = synonyms_words.len(); + + let real_query_index = automaton_index; + enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words); + + for synonym in synonyms_words { + let automaton = if nb_synonym_words == 1 { + QueryWordAutomaton::exact(synonym) + } else { + QueryWordAutomaton::non_exact(synonym) + }; + automaton_index += 1; + automatons.push(automaton); + } + } + } + } + + if n == 1 { + // automatons for split words + if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)?
{ + let mut left_automaton = QueryWordAutomaton::exact(left); + left_automaton.phrase_query = Some((0, 2)); + enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); + automaton_index += 1; + automatons.push(left_automaton); + + let mut right_automaton = QueryWordAutomaton::exact(right); + right_automaton.phrase_query = Some((1, 2)); + enhancer_builder.declare(query_range.clone(), automaton_index, &[right]); + automaton_index += 1; + automatons.push(right_automaton); + } + } else { + // automaton of concatenation of query words + let concat = ngram_slice.concat(); + let normalized = normalize_str(&concat); + + let real_query_index = automaton_index; + enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); + + let automaton = QueryWordAutomaton::exact(&normalized); + automaton_index += 1; + automatons.push(automaton); + } + } + } + + Ok((automatons, enhancer_builder.build())) +} diff --git a/meilisearch-core/src/criterion/attribute.rs b/meilisearch-core/src/criterion/attribute.rs new file mode 100644 index 000000000..cf9efb41b --- /dev/null +++ b/meilisearch-core/src/criterion/attribute.rs @@ -0,0 +1,37 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::{RawDocument, MResult}; +use crate::bucket_sort::SimpleMatch; +use super::{Criterion, Context, ContextMut, prepare_bare_matches}; + +pub struct Attribute; + +impl Criterion for Attribute { + fn name(&self) -> &str { "attribute" } + + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); + Ok(()) + } + + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + #[inline] + fn sum_of_attribute(matches: &[SimpleMatch]) -> usize { + let mut sum_of_attribute = 0; + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_of_attribute += group[0].attribute as usize; + } + sum_of_attribute + } + + let lhs = sum_of_attribute(&lhs.processed_matches); + let rhs = sum_of_attribute(&rhs.processed_matches); + + lhs.cmp(&rhs) + } +} diff --git a/meilisearch-core/src/criterion/document_id.rs b/meilisearch-core/src/criterion/document_id.rs index e4a402d26..2795423f2 100644 --- a/meilisearch-core/src/criterion/document_id.rs +++ b/meilisearch-core/src/criterion/document_id.rs @@ -1,16 +1,16 @@ -use crate::criterion::Criterion; -use crate::RawDocument; use std::cmp::Ordering; +use crate::RawDocument; +use super::{Criterion, Context}; -#[derive(Debug, Clone, Copy)] pub struct DocumentId; impl Criterion for DocumentId { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - lhs.id.cmp(&rhs.id) - } + fn name(&self) -> &str { "stable document id" } - fn name(&self) -> &str { - "DocumentId" + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = &lhs.id; + let rhs = &rhs.id; + + lhs.cmp(rhs) } } diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index e9ae1b5dc..5425d2cc9 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -1,132 +1,78 @@ -use std::cmp::Ordering; - +use std::cmp::{Ordering, Reverse}; +use std::collections::hash_map::{HashMap, Entry}; use meilisearch_schema::SchemaAttr; -use sdset::Set; use slice_group_by::GroupBy; +use crate::{RawDocument, MResult}; +use crate::bucket_sort::BareMatch; +use 
super::{Criterion, Context, ContextMut}; -use crate::criterion::Criterion; -use crate::RawDocument; +pub struct Exact; -#[inline] -fn number_exact_matches( - query_index: &[u32], - attribute: &[u16], - is_exact: &[bool], - fields_counts: &Set<(SchemaAttr, u64)>, -) -> usize { - let mut count = 0; - let mut index = 0; +impl Criterion for Exact { + fn name(&self) -> &str { "exact" } - for group in query_index.linear_group() { - let len = group.len(); + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { + let store = ctx.documents_fields_counts_store; + let reader = ctx.reader; - let mut found_exact = false; - for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() { - if *is_exact { - found_exact = true; - let attr = &attribute[index + pos]; - if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) { - let (_, count) = fields_counts[pos]; - if count == 1 { - return usize::max_value(); + 'documents: for doc in documents { + doc.bare_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); + + // mark the document if we find a "one word field" that matches + let mut fields_counts = HashMap::new(); + for group in doc.bare_matches.linear_group_by_key(|bm| bm.query_index) { + for group in group.linear_group_by_key(|bm| bm.is_exact) { + if !group[0].is_exact { break } + + for bm in group { + for di in ctx.postings_lists[bm.postings_list].as_ref() { + + let attr = SchemaAttr(di.attribute); + let count = match fields_counts.entry(attr) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let count = store.document_field_count(reader, doc.id, attr)?; + *entry.insert(count) + }, + }; + + if count == Some(1) { + doc.contains_one_word_field = true; + continue 'documents + } + } } } } } - count += found_exact as usize; - index += len; + Ok(()) } - count -} + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + #[inline] + fn sum_exact_query_words(matches: &[BareMatch]) -> usize { + let mut sum_exact_query_words = 0; -#[derive(Debug, Clone, Copy)] -pub struct Exact; + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_exact_query_words += group[0].is_exact as usize; + } -impl Criterion for Exact { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let is_exact = lhs.is_exact(); - let attribute = lhs.attribute(); - let fields_counts = &lhs.fields_counts; + sum_exact_query_words + } - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - let rhs = { - let query_index = rhs.query_index(); - let is_exact = rhs.is_exact(); - let attribute = rhs.attribute(); - let fields_counts = &rhs.fields_counts; - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - lhs.cmp(&rhs).reverse() - } - - fn name(&self) -> &str { - "Exact" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: "Soulier bleu" - // doc1: "souliereres rouge" - #[test] - fn easy_case() { - let doc0 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - let doc1 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[false]; - let fields_counts = 
Set::new(&[(SchemaAttr(0), 2)]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "soulier" - // - // doc0: { 0. "soulier" } - // doc1: { 0. "soulier bleu et blanc" } - #[test] - fn basic() { - let doc0 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - let doc1 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + // does it contain a "one word field"? + lhs.contains_one_word_field.cmp(&rhs.contains_one_word_field).reverse() + // if not, compare which document contains the more exact words + .then_with(|| { + let lhs = sum_exact_query_words(&lhs.bare_matches); + let rhs = sum_exact_query_words(&rhs.bare_matches); + lhs.cmp(&rhs).reverse() + }) } } diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs index e94b1b2c7..8d6c8b1f6 100644 --- a/meilisearch-core/src/criterion/mod.rs +++ b/meilisearch-core/src/criterion/mod.rs @@ -1,59 +1,75 @@ -mod document_id; +use std::cmp::{self, Ordering}; + +use compact_arena::SmallArena; +use sdset::SetBuf; +use slice_group_by::GroupBy; + +use crate::{store, RawDocument, MResult}; +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; +use crate::database::MainT; + +mod typo; +mod words; +mod proximity; +mod attribute; +mod words_position; mod exact; -mod number_of_words; +mod document_id; mod sort_by_attr; -mod sum_of_typos; -mod sum_of_words_attribute; -mod sum_of_words_position; -mod words_proximity; -use crate::RawDocument; -use std::cmp::Ordering; - -pub use self::{ - document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords, - sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos, - sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, - words_proximity::WordsProximity, -}; - -pub trait Criterion: Send + Sync { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; +pub use self::typo::Typo; +pub use self::words::Words; +pub use self::proximity::Proximity; +pub use self::attribute::Attribute; +pub use self::words_position::WordsPosition; +pub use self::exact::Exact; +pub use self::document_id::DocumentId; +pub use self::sort_by_attr::SortByAttr; +pub trait Criterion { fn name(&self) -> &str; + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + _ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + _documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { + Ok(()) + } + + fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, + lhs: &RawDocument<'r, 'tag>, + rhs: &RawDocument<'r, 'tag>, + ) -> Ordering; + #[inline] - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - self.evaluate(lhs, rhs) == Ordering::Equal + fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, + lhs: &RawDocument<'r, 'tag>, + rhs: &RawDocument<'r, 'tag>, + ) -> bool + { + self.evaluate(ctx, lhs, rhs) == Ordering::Equal } } -impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { - fn evaluate(&self, lhs: &RawDocument, rhs:
&RawDocument) -> Ordering { - (**self).evaluate(lhs, rhs) - } - - fn name(&self) -> &str { - (**self).name() - } - - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - (**self).eq(lhs, rhs) - } +pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a> { + pub reader: &'h heed::RoTxn<MainT>, + pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>, + pub query_enhancer: &'q mut QueryEnhancer, + pub automatons: &'a mut [QueryWordAutomaton], + pub documents_fields_counts_store: store::DocumentsFieldsCounts, } -impl<T: Criterion + ?Sized + Send + Sync> Criterion for Box<T> { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - (**self).evaluate(lhs, rhs) - } - - fn name(&self) -> &str { - (**self).name() - } - - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - (**self).eq(lhs, rhs) - } +pub struct Context<'p, 'tag, 'txn, 'q, 'a> { + pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>, + pub query_enhancer: &'q QueryEnhancer, + pub automatons: &'a [QueryWordAutomaton], } #[derive(Default)] @@ -103,11 +119,11 @@ pub struct Criteria<'a> { impl<'a> Default for Criteria<'a> { fn default() -> Self { CriteriaBuilder::with_capacity(7) - .add(SumOfTypos) - .add(NumberOfWords) - .add(WordsProximity) - .add(SumOfWordsAttribute) - .add(SumOfWordsPosition) + .add(Typo) + .add(Words) + .add(Proximity) + .add(Attribute) + .add(WordsPosition) .add(Exact) .add(DocumentId) .build() @@ -119,3 +135,162 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> { &self.inner } } + +fn prepare_query_distances<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], + query_enhancer: &QueryEnhancer, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, +) { + for document in documents { + if !document.processed_distances.is_empty() { continue } + + let mut processed = Vec::new(); + for m in document.bare_matches.iter() { + if postings_lists[m.postings_list].is_empty() { continue } + + let range = query_enhancer.replacement(m.query_index as u32); + let new_len = cmp::max(range.end as usize, processed.len()); + processed.resize(new_len, None); + + for index in range { + let index = index as usize; + processed[index] = match processed[index] { + Some(distance) if distance > m.distance => Some(m.distance), + Some(distance) => Some(distance), + None => Some(m.distance), + }; + } + } + + document.processed_distances = processed; + } +} + +fn prepare_bare_matches<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, +) { + for document in documents { + if !document.processed_matches.is_empty() { continue } + + let mut processed = Vec::new(); + for m in document.bare_matches.iter() { + let postings_list = &postings_lists[m.postings_list]; + processed.reserve(postings_list.len()); + for di in postings_list.as_ref() { + let simple_match = SimpleMatch { + query_index: m.query_index, + distance: m.distance, + attribute: di.attribute, + word_index: di.word_index, + is_exact: m.is_exact, + }; + processed.push(simple_match); + } + } + + let processed = multiword_rewrite_matches(&mut processed, query_enhancer); + document.processed_matches = processed.into_vec(); + } +} + +fn multiword_rewrite_matches( + matches: &mut [SimpleMatch], + query_enhancer: &QueryEnhancer, +) -> SetBuf<SimpleMatch> +{ + matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); + + let mut padded_matches = Vec::with_capacity(matches.len()); + + // let before_padding = Instant::now(); + // for each attribute of each document for
same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { + // padding will only be applied + // to word indices in the same attribute + let mut padding = 0; + let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); + + // for each match at the same position + // in this document attribute + while let Some(same_word_index) = iter.next() { + // find the biggest padding + let mut biggest = 0; + for match_ in same_word_index { + let mut replacement = query_enhancer.replacement(match_.query_index as u32); + let replacement_len = replacement.len(); + let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); + + if let Some(query_index) = replacement.next() { + let word_index = match_.word_index + padding as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + } + + let mut found = false; + + // look ahead and if there already is a match + // corresponding to this padding word, abort the padding + 'padding: for (x, next_group) in nexts.enumerate() { + for (i, query_index) in replacement.clone().enumerate().skip(x) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; + + for nmatch_ in next_group { + let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); + let query_index = rep.next().unwrap() as u16; + if query_index == padmatch.query_index { + if !found { + // if we find a corresponding padding for the + // first time we must push preceding paddings + for (i, query_index) in replacement.clone().enumerate().take(i) + { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + biggest = biggest.max(i + 1); + } + } + + padded_matches.push(padmatch); + found = true; + continue 'padding; + } + } + } + + // if we do not find a corresponding padding in the + // next groups, stop here and pad what was found + break; + } + + if !found { + // if no padding was found in the following matches + // we must insert the entire padding + for (i, query_index) in replacement.enumerate() { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + } + + biggest = biggest.max(replacement_len - 1); + } + } + + padding += biggest; + } + } + + // debug!("padding matches took {:.02?}", before_padding.elapsed()); + + // With this check we can see that the loop above takes something + // like 43% of the search time even when no rewrite is needed.
+ + // debug!("padding matches took {:.02?}", before_padding.elapsed()); + + // With this check we can see that the loop above takes something + // like 43% of the search time even when no rewrite is needed. + // assert_eq!(before_matches, padded_matches); + + SetBuf::from_dirty(padded_matches) +} diff --git a/meilisearch-core/src/criterion/number_of_words.rs b/meilisearch-core/src/criterion/number_of_words.rs deleted file mode 100644 index 6c1218e2f..000000000 --- a/meilisearch-core/src/criterion/number_of_words.rs +++ /dev/null @@ -1,31 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn number_of_query_words(query_index: &[u32]) -> usize { - query_index.linear_group().count() } - -#[derive(Debug, Clone, Copy)] -pub struct NumberOfWords; - -impl Criterion for NumberOfWords { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - number_of_query_words(query_index) - }; - let rhs = { - let query_index = rhs.query_index(); - number_of_query_words(query_index) - }; - - lhs.cmp(&rhs).reverse() - } - - fn name(&self) -> &str { - "NumberOfWords" - } -} diff --git a/meilisearch-core/src/criterion/proximity.rs b/meilisearch-core/src/criterion/proximity.rs new file mode 100644 index 000000000..2f3698bae --- /dev/null +++ b/meilisearch-core/src/criterion/proximity.rs @@ -0,0 +1,68 @@ +use std::cmp::{self, Ordering}; +use slice_group_by::GroupBy; +use crate::bucket_sort::{SimpleMatch}; +use crate::{RawDocument, MResult}; +use super::{Criterion, Context, ContextMut, prepare_bare_matches}; + +const MAX_DISTANCE: u16 = 8; + +pub struct Proximity; + +impl Criterion for Proximity { + fn name(&self) -> &str { "proximity" } + + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); + Ok(()) + } + + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + fn index_proximity(lhs: u16, rhs: u16) -> u16 { + if lhs < rhs { + cmp::min(rhs - lhs, MAX_DISTANCE) + } else { + cmp::min(lhs - rhs, MAX_DISTANCE) + 1 + } + } + + fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { + if lhs.attribute != rhs.attribute { MAX_DISTANCE } + else { index_proximity(lhs.word_index, rhs.word_index) } + } + + fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { + let mut min_prox = u16::max_value(); + for a in lhs { + for b in rhs { + let prox = attribute_proximity(*a, *b); + min_prox = cmp::min(min_prox, prox); + } + } + min_prox + } + + fn matches_proximity(matches: &[SimpleMatch],) -> u16 { + let mut proximity = 0; + let mut iter = matches.linear_group_by_key(|m| m.query_index); + + // iterate over groups by windows of size 2 + let mut last = iter.next(); + while let (Some(lhs), Some(rhs)) = (last, iter.next()) { + proximity += min_proximity(lhs, rhs); + last = Some(rhs); + } + + proximity + } + + let lhs = matches_proximity(&lhs.processed_matches); + let rhs = matches_proximity(&rhs.processed_matches); + + lhs.cmp(&rhs) + } +} diff --git a/meilisearch-core/src/criterion/sort_by_attr.rs b/meilisearch-core/src/criterion/sort_by_attr.rs index 89595e5a5..3fd801550 100644 --- a/meilisearch-core/src/criterion/sort_by_attr.rs +++ b/meilisearch-core/src/criterion/sort_by_attr.rs @@ -1,10 +1,9 @@ use std::cmp::Ordering; use std::error::Error; use std::fmt; - -use crate::criterion::Criterion; -use crate::{RankedMap, RawDocument}; use meilisearch_schema::{Schema, SchemaAttr}; +use crate::{RankedMap, RawDocument}; +use super::{Criterion,
Context}; /// An helper struct that permit to sort documents by /// some of their stored attributes. @@ -28,11 +27,11 @@ use meilisearch_schema::{Schema, SchemaAttr}; /// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?; /// /// let builder = CriteriaBuilder::with_capacity(8) -/// .add(SumOfTypos) -/// .add(NumberOfWords) -/// .add(WordsProximity) -/// .add(SumOfWordsAttribute) -/// .add(SumOfWordsPosition) +/// .add(Typo) +/// .add(Words) +/// .add(Proximity) +/// .add(Attribute) +/// .add(WordsPosition) /// .add(Exact) /// .add(custom_ranking) /// .add(DocumentId); @@ -86,8 +85,12 @@ impl<'a> SortByAttr<'a> { } } -impl<'a> Criterion for SortByAttr<'a> { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { +impl Criterion for SortByAttr<'_> { + fn name(&self) -> &str { + "sort by attribute" + } + + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { let lhs = self.ranked_map.get(lhs.id, self.attr); let rhs = self.ranked_map.get(rhs.id, self.attr); @@ -105,10 +108,6 @@ impl<'a> Criterion for SortByAttr<'a> { (None, None) => Ordering::Equal, } } - - fn name(&self) -> &str { - "SortByAttr" - } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] diff --git a/meilisearch-core/src/criterion/sum_of_typos.rs b/meilisearch-core/src/criterion/sum_of_typos.rs deleted file mode 100644 index 5cad73b42..000000000 --- a/meilisearch-core/src/criterion/sum_of_typos.rs +++ /dev/null @@ -1,116 +0,0 @@ -use std::cmp::Ordering; - -use slice_group_by::GroupBy; - -use crate::criterion::Criterion; -use crate::RawDocument; - -// This function is a wrong logarithmic 10 function. -// It is safe to panic on input number higher than 3, -// the number of typos is never bigger than that. -#[inline] -fn custom_log10(n: u8) -> f32 { - match n { - 0 => 0.0, // log(1) - 1 => 0.30102, // log(2) - 2 => 0.47712, // log(3) - 3 => 0.60205, // log(4) - _ => panic!("invalid number"), - } -} - -#[inline] -fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { - let mut number_words: usize = 0; - let mut sum_typos = 0.0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_typos += custom_log10(distance[index]); - number_words += 1; - index += group.len(); - } - - (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfTypos; - -impl Criterion for SumOfTypos { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let distance = lhs.distance(); - sum_matches_typos(query_index, distance) - }; - - let rhs = { - let query_index = rhs.query_index(); - let distance = rhs.distance(); - sum_matches_typos(query_index, distance) - }; - - lhs.cmp(&rhs).reverse() - } - - fn name(&self) -> &str { - "SumOfTypos" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "Geox CEO" - // - // doc0: "Geox SpA: CEO and Executive" - // doc1: "Mt. 
Gox CEO Resigns From Bitcoin Foundation" - #[test] - fn one_typo_reference() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 0]; - - let query_index1 = &[0, 1]; - let distance1 = &[1, 0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "bouton manchette" - // - // doc0: "bouton manchette" - // doc1: "bouton" - #[test] - fn no_typo() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 0]; - - let query_index1 = &[0]; - let distance1 = &[0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "bouton manchztte" - // - // doc0: "bouton manchette" - // doc1: "bouton" - #[test] - fn one_typo() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 1]; - - let query_index1 = &[0]; - let distance1 = &[0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/sum_of_words_attribute.rs b/meilisearch-core/src/criterion/sum_of_words_attribute.rs deleted file mode 100644 index 472d771b7..000000000 --- a/meilisearch-core/src/criterion/sum_of_words_attribute.rs +++ /dev/null @@ -1,64 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { - let mut sum_attributes = 0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_attributes += attribute[index] as usize; - index += group.len(); - } - - sum_attributes -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfWordsAttribute; - -impl Criterion for SumOfWordsAttribute { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let attribute = lhs.attribute(); - sum_matches_attributes(query_index, attribute) - }; - - let rhs = { - let query_index = rhs.query_index(); - let attribute = rhs.attribute(); - sum_matches_attributes(query_index, attribute) - }; - - lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "SumOfWordsAttribute" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: { 0. "Soulier bleu", 1. "bla bla bla" } - // doc1: { 0. "Botte rouge", 1. 
"Soulier en cuir" } - #[test] - fn title_vs_description() { - let query_index0 = &[0]; - let attribute0 = &[0]; - - let query_index1 = &[0]; - let attribute1 = &[1]; - - let doc0 = sum_matches_attributes(query_index0, attribute0); - let doc1 = sum_matches_attributes(query_index1, attribute1); - assert_eq!(doc0.cmp(&doc1), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/sum_of_words_position.rs b/meilisearch-core/src/criterion/sum_of_words_position.rs deleted file mode 100644 index 70b8843dc..000000000 --- a/meilisearch-core/src/criterion/sum_of_words_position.rs +++ /dev/null @@ -1,64 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { - let mut sum_word_index = 0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_word_index += word_index[index] as usize; - index += group.len(); - } - - sum_word_index -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfWordsPosition; - -impl Criterion for SumOfWordsPosition { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let word_index = lhs.word_index(); - sum_matches_attribute_index(query_index, word_index) - }; - - let rhs = { - let query_index = rhs.query_index(); - let word_index = rhs.word_index(); - sum_matches_attribute_index(query_index, word_index) - }; - - lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "SumOfWordsPosition" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: "Soulier bleu" - // doc1: "Botte rouge et soulier noir" - #[test] - fn easy_case() { - let query_index0 = &[0]; - let word_index0 = &[0]; - - let query_index1 = &[0]; - let word_index1 = &[3]; - - let doc0 = sum_matches_attribute_index(query_index0, word_index0); - let doc1 = sum_matches_attribute_index(query_index1, word_index1); - assert_eq!(doc0.cmp(&doc1), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/typo.rs b/meilisearch-core/src/criterion/typo.rs new file mode 100644 index 000000000..2b43c50a9 --- /dev/null +++ b/meilisearch-core/src/criterion/typo.rs @@ -0,0 +1,55 @@ +use std::cmp::Ordering; +use crate::{RawDocument, MResult}; +use super::{Criterion, Context, ContextMut, prepare_query_distances}; + +pub struct Typo; + +impl Criterion for Typo { + fn name(&self) -> &str { "typo" } + + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { + prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists); + Ok(()) + } + + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + // This function is a wrong logarithmic 10 function. + // It is safe to panic on input number higher than 3, + // the number of typos is never bigger than that. 
+ // This function is a wrong log10 function. + // It is safe to panic on input numbers higher than 3, + // as the number of typos is never bigger than that. + #[inline] + fn custom_log10(n: u8) -> f32 { + match n { + 0 => 0.0, // log(1) + 1 => 0.30102, // log(2) + 2 => 0.47712, // log(3) + 3 => 0.60205, // log(4) + _ => panic!("invalid number"), + } + } + + #[inline] + fn compute_typos(distances: &[Option<u8>]) -> usize { + let mut number_words: usize = 0; + let mut sum_typos = 0.0; + + for distance in distances { + if let Some(distance) = distance { + sum_typos += custom_log10(*distance); + number_words += 1; + } + } + + (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize + } + + let lhs = compute_typos(&lhs.processed_distances); + let rhs = compute_typos(&rhs.processed_distances); + + lhs.cmp(&rhs).reverse() + } +} diff --git a/meilisearch-core/src/criterion/words.rs b/meilisearch-core/src/criterion/words.rs new file mode 100644 index 000000000..cfe7c9664 --- /dev/null +++ b/meilisearch-core/src/criterion/words.rs @@ -0,0 +1,31 @@ +use std::cmp::Ordering; +use crate::{RawDocument, MResult}; +use super::{Criterion, Context, ContextMut, prepare_query_distances}; + +pub struct Words; + +impl Criterion for Words { + fn name(&self) -> &str { "words" } + + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { + prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists); + Ok(()) + } + + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + #[inline] + fn number_of_query_words(distances: &[Option<u8>]) -> usize { + distances.iter().cloned().filter(Option::is_some).count() + } + + let lhs = number_of_query_words(&lhs.processed_distances); + let rhs = number_of_query_words(&rhs.processed_distances); + + lhs.cmp(&rhs).reverse() + } +} diff --git a/meilisearch-core/src/criterion/words_position.rs b/meilisearch-core/src/criterion/words_position.rs new file mode 100644 index 000000000..387f0d635 --- /dev/null +++ b/meilisearch-core/src/criterion/words_position.rs @@ -0,0 +1,37 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::bucket_sort::SimpleMatch; +use crate::{RawDocument, MResult}; +use super::{Criterion, Context, ContextMut, prepare_bare_matches}; + +pub struct WordsPosition; + +impl Criterion for WordsPosition { + fn name(&self) -> &str { "words position" } + + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); + Ok(()) + } + + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + #[inline] + fn sum_words_position(matches: &[SimpleMatch]) -> usize { + let mut sum_words_position = 0; + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_words_position += group[0].word_index as usize; + } + sum_words_position + } + + let lhs = sum_words_position(&lhs.processed_matches); + let rhs = sum_words_position(&rhs.processed_matches); + + lhs.cmp(&rhs) + } +} diff --git a/meilisearch-core/src/criterion/words_proximity.rs b/meilisearch-core/src/criterion/words_proximity.rs deleted file mode 100644 index 579bc7b8c..000000000 --- a/meilisearch-core/src/criterion/words_proximity.rs +++ /dev/null @@ -1,164 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::{self, Ordering}; - -const MAX_DISTANCE: u16 = 8; - -#[inline] -fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) { - (a.clone(),
b.clone()) -} - -fn index_proximity(lhs: u16, rhs: u16) -> u16 { - if lhs < rhs { - cmp::min(rhs - lhs, MAX_DISTANCE) - } else { - cmp::min(lhs - rhs, MAX_DISTANCE) + 1 - } -} - -fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 { - if lattr != rattr { - return MAX_DISTANCE; - } - index_proximity(lwi, rwi) -} - -fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 { - let mut min_prox = u16::max_value(); - - for a in lattr.iter().zip(lwi) { - for b in rattr.iter().zip(rwi) { - let a = clone_tuple(a); - let b = clone_tuple(b); - min_prox = cmp::min(min_prox, attribute_proximity(a, b)); - } - } - - min_prox -} - -fn matches_proximity( - query_index: &[u32], - distance: &[u8], - attribute: &[u16], - word_index: &[u16], -) -> u16 { - let mut query_index_groups = query_index.linear_group(); - let mut proximity = 0; - let mut index = 0; - - let get_attr_wi = |index: usize, group_len: usize| { - // retrieve the first distance group (with the lowest values) - let len = distance[index..index + group_len] - .linear_group() - .next() - .unwrap() - .len(); - - let rattr = &attribute[index..index + len]; - let rwi = &word_index[index..index + len]; - - (rattr, rwi) - }; - - let mut last = query_index_groups.next().map(|group| { - let attr_wi = get_attr_wi(index, group.len()); - index += group.len(); - attr_wi - }); - - // iter by windows of size 2 - while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) { - let attr_wi = get_attr_wi(index, rhs.len()); - proximity += min_proximity(lhs, attr_wi); - last = Some(attr_wi); - index += rhs.len(); - } - - proximity -} - -#[derive(Debug, Clone, Copy)] -pub struct WordsProximity; - -impl Criterion for WordsProximity { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let distance = lhs.distance(); - let attribute = lhs.attribute(); - let word_index = lhs.word_index(); - matches_proximity(query_index, distance, attribute, word_index) - }; - - let rhs = { - let query_index = rhs.query_index(); - let distance = rhs.distance(); - let attribute = rhs.attribute(); - let word_index = rhs.word_index(); - matches_proximity(query_index, distance, attribute, word_index) - }; - - lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "WordsProximity" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn three_different_attributes() { - // "soup" "of the" "the day" - // - // { id: 0, attr: 0, attr_index: 0 } - // { id: 1, attr: 1, attr_index: 0 } - // { id: 2, attr: 1, attr_index: 1 } - // { id: 2, attr: 2, attr_index: 0 } - // { id: 3, attr: 3, attr_index: 1 } - - let query_index = &[0, 1, 2, 2, 3]; - let distance = &[0, 0, 0, 0, 0]; - let attribute = &[0, 1, 1, 2, 3]; - let word_index = &[0, 0, 1, 0, 1]; - - // soup -> of = 8 - // + of -> the = 1 - // + the -> day = 8 (not 1) - assert_eq!( - matches_proximity(query_index, distance, attribute, word_index), - 17 - ); - } - - #[test] - fn two_different_attributes() { - // "soup day" "soup of the day" - // - // { id: 0, attr: 0, attr_index: 0 } - // { id: 0, attr: 1, attr_index: 0 } - // { id: 1, attr: 1, attr_index: 1 } - // { id: 2, attr: 1, attr_index: 2 } - // { id: 3, attr: 0, attr_index: 1 } - // { id: 3, attr: 1, attr_index: 3 } - - let query_index = &[0, 0, 1, 2, 3, 3]; - let distance = &[0, 0, 0, 0, 0, 0]; - let attribute = &[0, 1, 1, 1, 0, 1]; - let word_index = &[0, 0, 1, 2, 1, 3]; - - // soup -> of = 1 - // + of -> the = 1 - // + the -> day = 1 - 
assert_eq!( - matches_proximity(query_index, distance, attribute, word_index), - 3 - ); - } -} diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index e9ba84a41..ea36abd42 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -3,7 +3,7 @@ extern crate assert_matches; mod automaton; -pub mod criterion; +mod bucket_sort; mod database; mod distinct_map; mod error; @@ -12,11 +12,12 @@ mod number; mod query_builder; mod ranked_map; mod raw_document; -pub mod raw_indexer; mod reordered_attrs; +mod update; +pub mod criterion; +pub mod raw_indexer; pub mod serde; pub mod store; -mod update; pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; pub use self::error::{Error, MResult}; @@ -27,61 +28,105 @@ pub use self::store::Index; pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; pub use meilisearch_types::{DocIndex, DocumentId, Highlight}; -#[doc(hidden)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct TmpMatch { - pub query_index: u32, - pub distance: u8, - pub attribute: u16, - pub word_index: u16, - pub is_exact: bool, -} +use compact_arena::SmallArena; +use crate::bucket_sort::{QueryWordAutomaton, PostingsListView}; +use crate::levenshtein::prefix_damerau_levenshtein; +use crate::reordered_attrs::ReorderedAttrs; -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct Document { pub id: DocumentId, pub highlights: Vec<Highlight>, #[cfg(test)] - pub matches: Vec<TmpMatch>, + pub matches: Vec<crate::bucket_sort::SimpleMatch>, +} + +fn highlights_from_raw_document<'a, 'tag, 'txn>( + raw_document: &RawDocument<'a, 'tag>, + automatons: &[QueryWordAutomaton], + arena: &SmallArena<'tag, PostingsListView<'txn>>, + searchable_attrs: Option<&ReorderedAttrs>, +) -> Vec<Highlight> +{ + let mut highlights = Vec::new(); + + for bm in raw_document.bare_matches.iter() { + let postings_list = &arena[bm.postings_list]; + let input = postings_list.input(); + let query = &automatons[bm.query_index as usize].query; + + for di in postings_list.iter() { + let covered_area = if query.len() > input.len() { + input.len() + } else { + prefix_damerau_levenshtein(query.as_bytes(), input).1 + }; + + let attribute = searchable_attrs + .and_then(|sa| sa.reverse(di.attribute)) + .unwrap_or(di.attribute); + + let highlight = Highlight { + attribute: attribute, + char_index: di.char_index, + char_length: covered_area as u16, + }; + + highlights.push(highlight); + } + } + + highlights } impl Document { #[cfg(not(test))] - fn from_raw(raw: RawDocument) -> Document { - Document { - id: raw.id, - highlights: raw.highlights, - } + pub fn from_raw<'a, 'tag, 'txn>( + raw_document: RawDocument<'a, 'tag>, + automatons: &[QueryWordAutomaton], + arena: &SmallArena<'tag, PostingsListView<'txn>>, + searchable_attrs: Option<&ReorderedAttrs>, + ) -> Document + { + let highlights = highlights_from_raw_document( + &raw_document, + automatons, + arena, + searchable_attrs, + ); + + Document { id: raw_document.id, highlights } } #[cfg(test)] - fn from_raw(raw: RawDocument) -> Document { - let len = raw.query_index().len(); - let mut matches = Vec::with_capacity(len); + pub fn from_raw<'a, 'tag, 'txn>( + raw_document: RawDocument<'a, 'tag>, + automatons: &[QueryWordAutomaton], + arena: &SmallArena<'tag, PostingsListView<'txn>>, + searchable_attrs: Option<&ReorderedAttrs>, + ) -> Document + { + use crate::bucket_sort::SimpleMatch; - let query_index = raw.query_index(); - let distance = raw.distance(); - let
attribute = raw.attribute(); - let word_index = raw.word_index(); - let is_exact = raw.is_exact(); + let highlights = highlights_from_raw_document( + &raw_document, + automatons, + arena, + searchable_attrs, + ); - for i in 0..len { - let match_ = TmpMatch { - query_index: query_index[i], - distance: distance[i], - attribute: attribute[i], - word_index: word_index[i], - is_exact: is_exact[i], - }; - matches.push(match_); + let mut matches = Vec::new(); + for sm in raw_document.processed_matches { + let attribute = searchable_attrs + .and_then(|sa| sa.reverse(sm.attribute)) + .unwrap_or(sm.attribute); + + matches.push(SimpleMatch { attribute, ..sm }); } + matches.sort_unstable(); - Document { - id: raw.id, - matches, - highlights: raw.highlights, - } + Document { id: raw_document.id, highlights, matches } } } diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 132dda557..e46858241 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -1,20 +1,9 @@ -use hashbrown::HashMap; -use std::convert::TryFrom; use std::ops::Range; -use std::rc::Rc; -use std::time::{Duration, Instant}; -use std::{cmp, mem}; - -use fst::{IntoStreamer, Streamer}; -use sdset::SetBuf; -use slice_group_by::{GroupBy, GroupByMut}; +use std::time::Duration; use crate::database::MainT; -use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer}; -use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; -use crate::levenshtein::prefix_damerau_levenshtein; -use crate::raw_document::{raw_documents_from, RawDocument}; -use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch}; +use crate::bucket_sort::{bucket_sort, bucket_sort_with_distinct}; +use crate::{criterion::Criteria, Document, DocumentId}; use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; pub struct QueryBuilder<'c, 'f, 'd> { @@ -29,249 +18,6 @@ pub struct QueryBuilder<'c, 'f, 'd> { synonyms_store: store::Synonyms, } -fn multiword_rewrite_matches( - mut matches: Vec<(DocumentId, TmpMatch)>, - query_enhancer: &QueryEnhancer, -) -> SetBuf<(DocumentId, TmpMatch)> { - let mut padded_matches = Vec::with_capacity(matches.len()); - - // we sort the matches by word index to make them rewritable - matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); - - // for each attribute of each document - for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { - // padding will only be applied - // to word indices in the same attribute - let mut padding = 0; - let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); - - // for each match at the same position - // in this document attribute - while let Some(same_word_index) = iter.next() { - // find the biggest padding - let mut biggest = 0; - for (id, match_) in same_word_index { - let mut replacement = query_enhancer.replacement(match_.query_index); - let replacement_len = replacement.len(); - let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); - - if let Some(query_index) = replacement.next() { - let word_index = match_.word_index + padding as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - } - - let mut found = false; - - // look ahead and if there already is a match - // corresponding to this padding word, abort the padding - 'padding: for (x, next_group) in nexts.enumerate() { - for (i, query_index) in 
replacement.clone().enumerate().skip(x) { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let padmatch = TmpMatch { - query_index, - word_index, - ..*match_ - }; - - for (_, nmatch_) in next_group { - let mut rep = query_enhancer.replacement(nmatch_.query_index); - let query_index = rep.next().unwrap(); - if query_index == padmatch.query_index { - if !found { - // if we find a corresponding padding for the - // first time we must push preceding paddings - for (i, query_index) in replacement.clone().enumerate().take(i) - { - let word_index = - match_.word_index + padding as u16 + (i + 1) as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - biggest = biggest.max(i + 1); - } - } - - padded_matches.push((*id, padmatch)); - found = true; - continue 'padding; - } - } - } - - // if we do not find a corresponding padding in the - // next groups so stop here and pad what was found - break; - } - - if !found { - // if no padding was found in the following matches - // we must insert the entire padding - for (i, query_index) in replacement.enumerate() { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - } - - biggest = biggest.max(replacement_len - 1); - } - } - - padding += biggest; - } - } - - for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { - document_matches.sort_unstable(); - } - - SetBuf::new_unchecked(padded_matches) -} - -fn fetch_raw_documents( - reader: &heed::RoTxn<MainT>, - automatons_groups: &[AutomatonGroup], - query_enhancer: &QueryEnhancer, - searchables: Option<&ReorderedAttrs>, - main_store: store::Main, - postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, -) -> MResult<Vec<RawDocument>> { - let mut matches = Vec::new(); - let mut highlights = Vec::new(); - - for group in automatons_groups { - let AutomatonGroup { - is_phrase_query, - automatons, - } = group; - let phrase_query_len = automatons.len(); - - let mut tmp_matches = Vec::new(); - for (id, automaton) in automatons.into_iter().enumerate() { - let Automaton { - index, - is_exact, - query_len, - query, - .. - } = automaton; - let dfa = automaton.dfa(); - - let words = match main_store.words_fst(reader)? { - Some(words) => words, - None => return Ok(Vec::new()), - }; - - let mut stream = words.search(&dfa).into_stream(); - while let Some(input) = stream.next() { - let distance = dfa.eval(input).to_u8(); - let is_exact = *is_exact && distance == 0 && input.len() == *query_len; - - let covered_area = if *query_len > input.len() { - input.len() - } else { - prefix_damerau_levenshtein(query.as_bytes(), input).1 - }; - - let doc_indexes = match postings_lists_store.postings_list(reader, input)?
{ - Some(doc_indexes) => doc_indexes, - None => continue, - }; - - tmp_matches.reserve(doc_indexes.len()); - - for di in doc_indexes.as_ref() { - let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); - if let Some(attribute) = attribute { - let match_ = TmpMatch { - query_index: *index as u32, - distance, - attribute, - word_index: di.word_index, - is_exact, - }; - - let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value()); - let covered_area = cmp::min(covered_area, di.char_length); - - let highlight = Highlight { - attribute: di.attribute, - char_index: di.char_index, - char_length: covered_area, - }; - - tmp_matches.push((di.document_id, id, match_, highlight)); - } - } - } - } - - if *is_phrase_query { - tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index)); - for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) { - for window in group.windows(2) { - let (ida, ia, ma, ha) = window[0]; - let (idb, ib, mb, hb) = window[1]; - - debug_assert_eq!(ida, idb); - - // if matches must follow and actually follows themselves - if ia + 1 == ib && ma.word_index + 1 == mb.word_index { - // TODO we must make it work for phrase query longer than 2 - // if the second match is the last phrase query word - if ib + 1 == phrase_query_len { - // insert first match - matches.push((ida, ma)); - highlights.push((ida, ha)); - - // insert second match - matches.push((idb, mb)); - highlights.push((idb, hb)); - } - } - } - } - } else { - for (id, _, match_, highlight) in tmp_matches { - matches.push((id, match_)); - highlights.push((id, highlight)); - } - } - } - - let matches = multiword_rewrite_matches(matches, &query_enhancer); - let highlights = { - highlights.sort_unstable_by_key(|(id, _)| *id); - SetBuf::new_unchecked(highlights) - }; - - let fields_counts = { - let mut fields_counts = Vec::new(); - for group in matches.linear_group_by_key(|(id, ..)| *id) { - let id = group[0].0; - for result in documents_fields_counts_store.document_fields_counts(reader, id)? 
{ - let (attr, count) = result?; - fields_counts.push((id, attr, count)); - } - } - SetBuf::new(fields_counts).unwrap() - }; - - Ok(raw_documents_from(matches, highlights, fields_counts)) -} - impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { pub fn new( main: store::Main, @@ -307,9 +53,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { synonyms_store: synonyms, } } -} -impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { pub fn with_filter<F>(&mut self, function: F) where F: Fn(DocumentId) -> bool + 'f, @@ -329,9 +73,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { } pub fn add_searchable_attribute(&mut self, attribute: u16) { - let reorders = self - .searchable_attrs - .get_or_insert_with(ReorderedAttrs::new); + let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new); reorders.insert_attribute(attribute); } @@ -342,14 +84,13 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { range: Range<usize>, ) -> MResult<Vec<Document>> { match self.distinct { - Some((distinct, distinct_size)) => raw_query_with_distinct( + Some((distinct, distinct_size)) => bucket_sort_with_distinct( reader, query, range, self.filter, distinct, distinct_size, - self.timeout, self.criteria, self.searchable_attrs, self.main_store, @@ -357,12 +98,11 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { self.documents_fields_counts_store, self.synonyms_store, ), - None => raw_query( + None => bucket_sort( reader, query, range, self.filter, - self.timeout, self.criteria, self.searchable_attrs, self.main_store, @@ -374,320 +114,6 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { } } -fn raw_query<'c, FI>( - reader: &heed::RoTxn<MainT>, - - query: &str, - range: Range<usize>, - - filter: Option<FI>, - timeout: Option<Duration>, - - criteria: Criteria<'c>, - searchable_attrs: Option<ReorderedAttrs>, - - main_store: store::Main, - postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, - synonyms_store: store::Synonyms, -) -> MResult<Vec<Document>> -where - FI: Fn(DocumentId) -> bool, -{ - // We delegate the filter work to the distinct query builder, - // specifying a distinct rule that has no effect.
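Concretely, the rule with no effect is the |_| None closure in the removed block below: a distinct function that never yields a key makes the distinct map register every document without deduplication, so only the filter influences the result. By contrast, a distinct rule that actually groups documents could look like the following sketch (the closure body is invented for illustration; the key type follows the Option<String> bound on FD further down):

    // keep at most one document per parity class of the document id:
    // let distinct = |DocumentId(id)| Some((id % 2).to_string());
    // let distinct_size = 1;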
- if filter.is_some() { - let distinct = |_| None; - let distinct_size = 1; - return raw_query_with_distinct( - reader, - query, - range, - filter, - distinct, - distinct_size, - timeout, - criteria, - searchable_attrs, - main_store, - postings_lists_store, - documents_fields_counts_store, - synonyms_store, - ); - } - - let start_processing = Instant::now(); - let mut raw_documents_processed = Vec::with_capacity(range.len()); - - let (automaton_producer, query_enhancer) = AutomatonProducer::new( - reader, - query, - main_store, - postings_lists_store, - synonyms_store, - )?; - - let automaton_producer = automaton_producer.into_iter(); - let mut automatons = Vec::new(); - - // aggregate automatons groups by groups after time - for auts in automaton_producer { - automatons.push(auts); - - // we must retrieve the documents associated - // with the current automatons - let mut raw_documents = fetch_raw_documents( - reader, - &automatons, - &query_enhancer, - searchable_attrs.as_ref(), - main_store, - postings_lists_store, - documents_fields_counts_store, - )?; - - // stop processing when time is running out - if let Some(timeout) = timeout { - if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout { - break; - } - } - - let mut groups = vec![raw_documents.as_mut_slice()]; - - 'criteria: for criterion in criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut documents_seen = 0; - - for group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < range.start { - documents_seen += group.len(); - groups.push(group); - continue; - } - - group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); - - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { - documents_seen += group.len(); - groups.push(group); - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if documents_seen >= range.end { - continue 'criteria; - } - } - } - } - - // once we classified the documents related to the current - // automatons we save that as the next valid result - let iter = raw_documents - .into_iter() - .skip(range.start) - .take(range.len()); - raw_documents_processed.clear(); - raw_documents_processed.extend(iter); - - // stop processing when time is running out - if let Some(timeout) = timeout { - if start_processing.elapsed() > timeout { - break; - } - } - } - - // make real documents now that we know - // those must be returned - let documents = raw_documents_processed - .into_iter() - .map(Document::from_raw) - .collect(); - - Ok(documents) -} - -fn raw_query_with_distinct<'c, FI, FD>( - reader: &heed::RoTxn<MainT>, - - query: &str, - range: Range<usize>, - - filter: Option<FI>, - - distinct: FD, - distinct_size: usize, - timeout: Option<Duration>, - - criteria: Criteria<'c>, - searchable_attrs: Option<ReorderedAttrs>, - - main_store: store::Main, - postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, - synonyms_store: store::Synonyms, -) -> MResult<Vec<Document>> -where - FI: Fn(DocumentId) -> bool, - FD: Fn(DocumentId) -> Option<String>, -{ - let start_processing = Instant::now(); - let mut raw_documents_processed = Vec::new(); - - let (automaton_producer, query_enhancer) = AutomatonProducer::new( - reader, - query, - main_store, - postings_lists_store, - synonyms_store, - )?; - - let automaton_producer = automaton_producer.into_iter(); - let mut
automatons = Vec::new(); - - // aggregate automatons groups by groups after time - for auts in automaton_producer { - automatons.push(auts); - - // we must retrieve the documents associated - // with the current automatons - let mut raw_documents = fetch_raw_documents( - reader, - &automatons, - &query_enhancer, - searchable_attrs.as_ref(), - main_store, - postings_lists_store, - documents_fields_counts_store, - )?; - - // stop processing when time is running out - if let Some(timeout) = timeout { - if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout { - break; - } - } - - let mut groups = vec![raw_documents.as_mut_slice()]; - let mut key_cache = HashMap::new(); - - let mut filter_map = HashMap::new(); - // these two variables informs on the current distinct map and - // on the raw offset of the start of the group where the - // range.start bound is located according to the distinct function - let mut distinct_map = DistinctMap::new(distinct_size); - let mut distinct_raw_offset = 0; - - 'criteria: for criterion in criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); - let mut documents_seen = 0; - - for group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < distinct_raw_offset { - documents_seen += group.len(); - groups.push(group); - continue; - } - - group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); - - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { - // we must compute the real distinguished len of this sub-group - for document in group.iter() { - let filter_accepted = match &filter { - Some(filter) => { - let entry = filter_map.entry(document.id); - *entry.or_insert_with(|| (filter)(document.id)) - } - None => true, - }; - - if filter_accepted { - let entry = key_cache.entry(document.id); - let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new)); - - match key.clone() { - Some(key) => buf_distinct.register(key), - None => buf_distinct.register_without_key(), - }; - } - - // the requested range end is reached: stop computing distinct - if buf_distinct.len() >= range.end { - break; - } - } - - documents_seen += group.len(); - groups.push(group); - - // if this sub-group does not overlap with the requested range - // we must update the distinct map and its start index - if buf_distinct.len() < range.start { - buf_distinct.transfert_to_internal(); - distinct_raw_offset = documents_seen; - } - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if buf_distinct.len() >= range.end { - continue 'criteria; - } - } - } - } - - // once we classified the documents related to the current - // automatons we save that as the next valid result - let mut seen = BufferedDistinctMap::new(&mut distinct_map); - raw_documents_processed.clear(); - - for document in raw_documents.into_iter().skip(distinct_raw_offset) { - let filter_accepted = match &filter { - Some(_) => filter_map.remove(&document.id).unwrap(), - None => true, - }; - - if filter_accepted { - let key = key_cache.remove(&document.id).unwrap(); - let distinct_accepted = match key { - Some(key) => seen.register(key), - None => seen.register_without_key(), - }; - - if distinct_accepted && seen.len() > range.start { - raw_documents_processed.push(document); - if 
raw_documents_processed.len() == range.len() { - break; - } - } - } - } - - // stop processing when time is running out - if let Some(timeout) = timeout { - if start_processing.elapsed() > timeout { - break; - } - } - } - - // make real documents now that we know - // those must be returned - let documents = raw_documents_processed - .into_iter() - .map(Document::from_raw) - .collect(); - - Ok(documents) -} - #[cfg(test)] mod tests { use super::*; @@ -700,10 +126,11 @@ mod tests { use sdset::SetBuf; use tempfile::TempDir; + use crate::DocIndex; use crate::automaton::normalize_str; + use crate::bucket_sort::SimpleMatch; use crate::database::Database; use crate::store::Index; - use crate::DocIndex; fn set_from_stream<'f, I, S>(stream: I) -> Set where @@ -754,6 +181,16 @@ mod tests { } } + const fn doc_attr_index(document_id: u64, attribute: u16, word_index: u16) -> DocIndex { + DocIndex { + document_id: DocumentId(document_id), + attribute, + word_index, + char_index: 0, + char_length: 0, + } + } + pub struct TempDatabase { database: Database, index: Index, @@ -815,7 +252,7 @@ mod tests { let mut words_fst = BTreeSet::new(); let mut postings_lists = HashMap::new(); - let mut fields_counts = HashMap::<_, u64>::new(); + let mut fields_counts = HashMap::<_, u16>::new(); for (word, indexes) in iter { let word = word.to_lowercase().into_bytes(); @@ -857,11 +294,7 @@ mod tests { writer.commit().unwrap(); - TempDatabase { - database, - index, - _tempdir: tempdir, - } + TempDatabase { database, index, _tempdir: tempdir } } } @@ -882,9 +315,9 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -905,7 +338,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -916,7 +349,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -938,7 +371,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. 
})); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -949,7 +382,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -982,7 +415,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -993,7 +426,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1020,17 +453,17 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 3, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1041,17 +474,17 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 3, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. 
})); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1062,17 +495,17 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 3, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1108,21 +541,18 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY ± new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY ± york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY ± city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY ± new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY ± york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY ± city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. 
})); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -1133,21 +563,18 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC ± new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC ± york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC ± city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC ± new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC ± york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC ± city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -1178,20 +605,20 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY ± york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // NY ± new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // NY ± york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. 
})); // NY ± new assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // new = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // new = NY assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 1, .. })); // york = NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // new = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new = NY assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1202,14 +629,14 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // york assert_matches!(matches.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 1, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1236,13 +663,13 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. 
})); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1251,18 +678,18 @@ mod tests { let results = builder.query(&reader, "new york subway", 0..20).unwrap(); let mut iter = results.into_iter(); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway - assert_matches!(matches.next(), None); - }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1301,22 +728,19 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -1326,23 +750,20 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC - // because one-word to one-word ^^^^ - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC + // because one-word to one-word ^^^^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. 
})); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); } @@ -1382,24 +803,21 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. 
})); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -1410,24 +828,21 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC // because one-word to one-word ^^^^ - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. 
})); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -1471,33 +886,33 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train - assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train - assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. 
})); // NYC = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1510,39 +925,33 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train - assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. 
})); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train - assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. 
})); // subway = underground - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1569,16 +978,16 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1604,32 +1013,32 @@ mod tests { let results = builder.query(&reader, "NY subway ", 0..20).unwrap(); let mut iter = results.into_iter(); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. 
})); // subway - assert_matches!(matches.next(), None); - }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. 
})); // story + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1658,13 +1067,13 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground - assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train - assert_matches!(matches.next(), Some(TmpMatch { query_index: 6, word_index: 6, is_exact: true, .. })); // cool + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 6, word_index: 6, is_exact: true, .. })); // cool assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1689,13 +1098,13 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1706,13 +1115,13 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. 
}) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1723,13 +1132,12 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // téléphone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone | telephone assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1751,10 +1159,50 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 0, distance: 1, .. })); // phone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); // phone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn exact_field_count_one_word() { + let store = TempDatabase::from_iter(vec![ + ("searchengine", &[doc_index(0, 0)][..]), + ("searchengine", &[doc_index(1, 0)][..]), + ("blue", &[doc_index(1, 1)][..]), + ("searchangine", &[doc_index(2, 0)][..]), + ("searchengine", &[doc_index(3, 0)][..]), + ]); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "searchengine", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(3), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. 
}) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 1, .. })); // searchengine assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1779,8 +1227,8 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1811,14 +1259,83 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn searchable_attributes() { + let store = TempDatabase::from_iter(vec![ + ("search", &[doc_attr_index(0, 0, 0)][..]), + ("engine", &[doc_attr_index(0, 0, 1)][..]), + + ("search", &[doc_attr_index(1, 1, 0)][..]), + ("engine", &[doc_attr_index(1, 1, 1)][..]), + ]); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "search engine", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. 
}) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + // reorder the searchable attributes + let mut builder = store.query_builder(); + builder.add_searchable_attribute(1); + builder.add_searchable_attribute(0); + + let results = builder.query(&reader, "search engine", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + // remove a searchable attribute + let mut builder = store.query_builder(); + builder.add_searchable_attribute(1); + + let results = builder.query(&reader, "search engine", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); diff --git a/meilisearch-core/src/query_enhancer.rs b/meilisearch-core/src/query_enhancer.rs deleted file mode 100644 index 165c1b094..000000000 --- a/meilisearch-core/src/query_enhancer.rs +++ /dev/null @@ -1,398 +0,0 @@ -use std::ops::Range; -use std::cmp::Ordering::{Less, Greater, Equal}; - -/// Return `true` if the specified range can accept the given replacements words. -/// Returns `false` if the replacements words are already present in the original query -/// or if there is fewer replacement words than the range to replace.
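[Editor's note] The acceptance rules documented above, and diagrammed in the deleted comments just below, reduce to one predicate: a replacement is accepted only when it is strictly longer than the range it rewrites and is not already spelled out verbatim at that position. A minimal standalone sketch of that rule, using plain &str slices instead of the deleted helper's AsRef<str> bounds (illustrative only, not part of the patch):

use std::ops::Range;

// Accept `replacement` for `query[range]` only if it is strictly longer
// than the replaced range and not already present at that position.
fn accepts(query: &[&str], range: Range<usize>, replacement: &[&str]) -> bool {
    replacement.len() > range.len()
        && query.iter().skip(range.start).take(replacement.len()).ne(replacement.iter())
}

fn main() {
    let query = ["new", "york", "city", "subway"];
    assert!(!accepts(&query, 0..2, &["new", "york", "city"])); // ignored: already present
    assert!(!accepts(&query, 0..3, &["new", "york"])); // ignored: smaller than the original
    assert!(accepts(&["NYC", "subway"], 0..1, &["new", "york", "city"])); // accepted: bigger
}

The three asserts mirror, in order, the three diagrammed cases that follow.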
-// -// -// ## Ignored because already present in original -// -// new york city subway -// -------- ^^^^ -// / \ -// [new york city] -// -// -// ## Ignored because smaller than the original -// -// new york city subway -// ------------- -// \ / -// [new york] -// -// -// ## Accepted because bigger than the original -// -// NYC subway -// --- -// / \ -// / \ -// / \ -// / \ -// / \ -// [new york city] -// -fn rewrite_range_with(query: &[S], range: Range, words: &[T]) -> bool -where S: AsRef, - T: AsRef, -{ - if words.len() <= range.len() { - // there is fewer or equal replacement words - // than there is already in the replaced range - return false - } - - // retrieve the part to rewrite but with the length - // of the replacement part - let original = query.iter().skip(range.start).take(words.len()); - - // check if the original query doesn't already contain - // the replacement words - !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref)) -} - -type Origin = usize; -type RealLength = usize; - -struct FakeIntervalTree { - intervals: Vec<(Range, (Origin, RealLength))>, -} - -impl FakeIntervalTree { - fn new(mut intervals: Vec<(Range, (Origin, RealLength))>) -> FakeIntervalTree { - intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end)); - FakeIntervalTree { intervals } - } - - fn query(&self, point: usize) -> Option<(Range, (Origin, RealLength))> { - let element = self.intervals.binary_search_by(|(r, _)| { - if point >= r.start { - if point < r.end { Equal } else { Less } - } else { Greater } - }); - - let n = match element { Ok(n) => n, Err(n) => n }; - - match self.intervals.get(n) { - Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)), - _otherwise => None, - } - } -} - -pub struct QueryEnhancerBuilder<'a, S> { - query: &'a [S], - origins: Vec, - real_to_origin: Vec<(Range, (Origin, RealLength))>, -} - -impl> QueryEnhancerBuilder<'_, S> { - pub fn new(query: &[S]) -> QueryEnhancerBuilder { - // we initialize origins query indices based on their positions - let origins: Vec<_> = (0..query.len() + 1).collect(); - let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect(); - - QueryEnhancerBuilder { query, origins, real_to_origin } - } - - /// Update the final real to origin query indices mapping. - /// - /// `range` is the original words range that this `replacement` words replace - /// and `real` is the first real query index of these replacement words. - pub fn declare(&mut self, range: Range, real: usize, replacement: &[T]) - where T: AsRef, - { - // check if the range of original words - // can be rewritten with the replacement words - if rewrite_range_with(self.query, range.clone(), replacement) { - - // this range can be replaced so we need to - // modify the origins accordingly - let offset = replacement.len() - range.len(); - - let previous_padding = self.origins[range.end - 1]; - let current_offset = (self.origins[range.end] - 1) - previous_padding; - let diff = offset.saturating_sub(current_offset); - self.origins[range.end] += diff; - - for r in &mut self.origins[range.end + 1..] 
-pub struct QueryEnhancerBuilder<'a, S> { - query: &'a [S], - origins: Vec<usize>, - real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>, } - -impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> { - pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> { - // we initialize origins query indices based on their positions - let origins: Vec<_> = (0..query.len() + 1).collect(); - let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect(); - - QueryEnhancerBuilder { query, origins, real_to_origin } - } - - /// Update the final real to origin query indices mapping. - /// - /// `range` is the original words range that this `replacement` words replace - /// and `real` is the first real query index of these replacement words. - pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T]) - where T: AsRef<str>, - { - // check if the range of original words - // can be rewritten with the replacement words - if rewrite_range_with(self.query, range.clone(), replacement) { - - // this range can be replaced so we need to - // modify the origins accordingly - let offset = replacement.len() - range.len(); - - let previous_padding = self.origins[range.end - 1]; - let current_offset = (self.origins[range.end] - 1) - previous_padding; - let diff = offset.saturating_sub(current_offset); - self.origins[range.end] += diff; - - for r in &mut self.origins[range.end + 1..] { - *r += diff; - } - } - - // we need to store the real number and origins relations - // this way it will be possible to know by how many - // we need to pad real query indices - let real_range = real..real + replacement.len().max(range.len()); - let real_length = replacement.len(); - self.real_to_origin.push((real_range, (range.start, real_length))); - } - - pub fn build(self) -> QueryEnhancer { - QueryEnhancer { - origins: self.origins, - real_to_origin: FakeIntervalTree::new(self.real_to_origin), - } - } -} - -pub struct QueryEnhancer { - origins: Vec<usize>, - real_to_origin: FakeIntervalTree, } - -impl QueryEnhancer { - /// Returns the query indices to use to replace this real query index. - pub fn replacement(&self, real: u32) -> Range<u32> { - let real = real as usize; - - // query the fake interval tree with the real query index - let (range, (origin, real_length)) = - self.real_to_origin - .query(real) - .expect("real has never been declared"); - - // if `real` is the end bound of the range - if (range.start + real_length - 1) == real { - let mut count = range.len(); - let mut new_origin = origin; - for (i, slice) in self.origins[new_origin..].windows(2).enumerate() { - let len = slice[1] - slice[0]; - count = count.saturating_sub(len); - if count == 0 { new_origin = origin + i; break } - } - - let n = real - range.start; - let start = self.origins[origin]; - let end = self.origins[new_origin + 1]; - let remaining = (end - start) - n; - - Range { start: (start + n) as u32, end: (start + n + remaining) as u32 } - - } else { - // just return the origin along with - // the real position of the word - let n = real as usize - range.start; - let origin = self.origins[origin]; - - Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn original_unmodified() { - let query = ["new", "york", "city", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // new york = new york city - builder.declare(0..2, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // new - assert_eq!(enhancer.replacement(1), 1..2); // york - assert_eq!(enhancer.replacement(2), 2..3); // city - assert_eq!(enhancer.replacement(3), 3..4); // subway - assert_eq!(enhancer.replacement(4), 0..1); // new - assert_eq!(enhancer.replacement(5), 1..2); // york - assert_eq!(enhancer.replacement(6), 2..3); // city - } - - #[test] - fn simple_growing() { - let query = ["new", "york", "subway"]; - // 0 1 2 - let mut builder = QueryEnhancerBuilder::new(&query); - - // new york = new york city - builder.declare(0..2, 3, &["new", "york", "city"]); - // ^ 3 4 5 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // new - assert_eq!(enhancer.replacement(1), 1..3); // york - assert_eq!(enhancer.replacement(2), 3..4); // subway - assert_eq!(enhancer.replacement(3), 0..1); // new - assert_eq!(enhancer.replacement(4), 1..2); // york - assert_eq!(enhancer.replacement(5), 2..3); // city - } - - #[test] - fn same_place_growings() { - let query = ["NY", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NY = new york - builder.declare(0..1, 2, &["new", "york"]); - // ^ 2 3 - - // NY = new york city - builder.declare(0..1, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // NY = NYC - builder.declare(0..1, 7, &["NYC"]); - // ^ 7 - - // NY = new york city - builder.declare(0..1, 8, &["new", "york", "city"]); - //
^ 8 9 10 - - // subway = underground train - builder.declare(1..2, 11, &["underground", "train"]); - // ^ 11 12 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..3); // NY - assert_eq!(enhancer.replacement(1), 3..5); // subway - assert_eq!(enhancer.replacement(2), 0..1); // new - assert_eq!(enhancer.replacement(3), 1..3); // york - assert_eq!(enhancer.replacement(4), 0..1); // new - assert_eq!(enhancer.replacement(5), 1..2); // york - assert_eq!(enhancer.replacement(6), 2..3); // city - assert_eq!(enhancer.replacement(7), 0..3); // NYC - assert_eq!(enhancer.replacement(8), 0..1); // new - assert_eq!(enhancer.replacement(9), 1..2); // york - assert_eq!(enhancer.replacement(10), 2..3); // city - assert_eq!(enhancer.replacement(11), 3..4); // underground - assert_eq!(enhancer.replacement(12), 4..5); // train - } - - #[test] - fn bigger_growing() { - let query = ["NYC", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(0..1, 2, &["new", "york", "city"]); - // ^ 2 3 4 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..3); // NYC - assert_eq!(enhancer.replacement(1), 3..4); // subway - assert_eq!(enhancer.replacement(2), 0..1); // new - assert_eq!(enhancer.replacement(3), 1..2); // york - assert_eq!(enhancer.replacement(4), 2..3); // city - } - - #[test] - fn middle_query_growing() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..6); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - } - - #[test] - fn end_query_growing() { - let query = ["NYC", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(1..2, 2, &["underground", "train"]); - // ^ 2 3 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // NYC - assert_eq!(enhancer.replacement(1), 1..3); // subway - assert_eq!(enhancer.replacement(2), 1..2); // underground - assert_eq!(enhancer.replacement(3), 2..3); // train - } - - #[test] - fn multiple_growings() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // subway = underground train - builder.declare(3..4, 7, &["underground", "train"]); - // ^ 7 8 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..7); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - assert_eq!(enhancer.replacement(7), 5..6); // underground - assert_eq!(enhancer.replacement(8), 6..7); // train - } - - #[test] - fn multiple_probable_growings() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let 
mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // subway = underground train - builder.declare(3..4, 7, &["underground", "train"]); - // ^ 7 8 - - // great awesome = good - builder.declare(0..2, 9, &["good"]); - // ^ 9 - - // awesome NYC = NY - builder.declare(1..3, 10, &["NY"]); - // ^^ 10 - - // NYC subway = metro - builder.declare(2..4, 11, &["metro"]); - // ^^ 11 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..7); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - assert_eq!(enhancer.replacement(7), 5..6); // underground - assert_eq!(enhancer.replacement(8), 6..7); // train - assert_eq!(enhancer.replacement(9), 0..2); // good - assert_eq!(enhancer.replacement(10), 1..5); // NY - assert_eq!(enhancer.replacement(11), 2..5); // metro - } -} diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index 291e532be..f047de8e8 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -1,186 +1,111 @@ -use std::fmt; -use std::sync::Arc; - -use meilisearch_schema::SchemaAttr; +use compact_arena::SmallArena; +use itertools::EitherOrBoth; use sdset::SetBuf; -use slice_group_by::GroupBy; +use crate::DocIndex; +use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView}; +use crate::reordered_attrs::ReorderedAttrs; -use crate::{DocumentId, Highlight, TmpMatch}; - -#[derive(Clone)] -pub struct RawDocument { - pub id: DocumentId, - pub matches: SharedMatches, - pub highlights: Vec<Highlight>, - pub fields_counts: SetBuf<(SchemaAttr, u64)>, +pub struct RawDocument<'a, 'tag> { + pub id: crate::DocumentId, + pub bare_matches: &'a mut [BareMatch<'tag>], + pub processed_matches: Vec<SimpleMatch>, + /// The list of minimum `distance` found + pub processed_distances: Vec<Option<u8>>, + /// Does this document contain a field + /// with one word that is exactly matching + pub contains_one_word_field: bool, } -impl RawDocument { - pub fn query_index(&self) -> &[u32] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { - &self - .matches - .matches - .query_index - .get_unchecked(r.start..r.end) - } - } +impl<'a, 'tag> RawDocument<'a, 'tag> { + pub fn new<'txn>( + bare_matches: &'a mut [BareMatch<'tag>], + automatons: &[QueryWordAutomaton], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + searchable_attrs: Option<&ReorderedAttrs>, + ) -> Option<RawDocument<'a, 'tag>> + { + if let Some(reordered_attrs) = searchable_attrs { + for bm in bare_matches.iter() { + let postings_list = &postings_lists[bm.postings_list]; - pub fn distance(&self) -> &[u8] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } - } + let mut rewritten = Vec::new(); + for di in postings_list.iter() { + if let Some(attribute) = reordered_attrs.get(di.attribute) { + rewritten.push(DocIndex { attribute, ..*di }); + } + }
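[Editor's note] The added loop above (its write-back continues a few hunks below with SetBuf::from_dirty and rewrite_with) re-targets every DocIndex of a postings list so attribute ids follow the user-chosen searchable-attributes order, and silently drops indices whose attribute is not searchable. A reduced sketch of that remapping, with a hypothetical two-field DocIndex standing in for the real struct (which carries more fields) and a hand-rolled ReorderedAttrs:

// Hypothetical minimal types, for illustration only.
#[derive(Debug, Clone, Copy, PartialEq)]
struct DocIndex { attribute: u16, word_index: u16 }

struct ReorderedAttrs(Vec<Option<u16>>); // old attribute id -> new rank, None = not searchable

impl ReorderedAttrs {
    fn get(&self, attribute: u16) -> Option<u16> {
        self.0.get(attribute as usize).copied().flatten()
    }
}

fn main() {
    // attribute 1 is ranked first, attribute 0 second, attribute 2 removed
    let reordered = ReorderedAttrs(vec![Some(1), Some(0), None]);
    let postings = [
        DocIndex { attribute: 0, word_index: 2 },
        DocIndex { attribute: 1, word_index: 5 },
        DocIndex { attribute: 2, word_index: 0 },
    ];
    let rewritten: Vec<_> = postings
        .iter()
        .filter_map(|di| reordered.get(di.attribute).map(|attribute| DocIndex { attribute, ..*di }))
        .collect();
    assert_eq!(rewritten, [
        DocIndex { attribute: 1, word_index: 2 },
        DocIndex { attribute: 0, word_index: 5 },
    ]);
}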
+        if let Some(reordered_attrs) = searchable_attrs {
+            for bm in bare_matches.iter() {
+                let postings_list = &postings_lists[bm.postings_list];
 
-    pub fn distance(&self) -> &[u8] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
-    }
+                let mut rewritten = Vec::new();
+                for di in postings_list.iter() {
+                    if let Some(attribute) = reordered_attrs.get(di.attribute) {
+                        rewritten.push(DocIndex { attribute, ..*di });
+                    }
+                }
 
-    pub fn attribute(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn word_index(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe {
-            &self
-                .matches
-                .matches
-                .word_index
-                .get_unchecked(r.start..r.end)
-        }
-    }
-
-    pub fn is_exact(&self) -> &[bool] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
-    }
-}
-
-impl fmt::Debug for RawDocument {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.write_str("RawDocument {\r\n")?;
-        f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "query_index",
-            self.query_index()
-        ))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "distance",
-            self.distance()
-        ))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "attribute",
-            self.attribute()
-        ))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "word_index",
-            self.word_index()
-        ))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "is_exact",
-            self.is_exact()
-        ))?;
-        f.write_str("}")?;
-        Ok(())
-    }
-}
-
-pub fn raw_documents_from(
-    matches: SetBuf<(DocumentId, TmpMatch)>,
-    highlights: SetBuf<(DocumentId, Highlight)>,
-    fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
-) -> Vec<RawDocument> {
-    let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
-    let mut matches2 = Matches::with_capacity(matches.len());
-
-    let matches = matches.linear_group_by_key(|(id, _)| *id);
-    let highlights = highlights.linear_group_by_key(|(id, _)| *id);
-    let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
-
-    for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
-        debug_assert_eq!(mgroup[0].0, hgroup[0].0);
-        debug_assert_eq!(mgroup[0].0, fgroup[0].0);
-
-        let document_id = mgroup[0].0;
-        let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
-        let end = start + mgroup.len();
-        let highlights = hgroup.iter().map(|(_, h)| *h).collect();
-        let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
-
-        docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
-        matches2.extend_from_slice(mgroup);
-    }
-
-    let matches = Arc::new(matches2);
-    docs_ranges
-        .into_iter()
-        .map(|(id, range, highlights, fields_counts)| {
-            let matches = SharedMatches {
-                range,
-                matches: matches.clone(),
-            };
-            RawDocument {
-                id,
-                matches,
-                highlights,
-                fields_counts,
+                let new_postings = SetBuf::from_dirty(rewritten);
+                postings_lists[bm.postings_list].rewrite_with(new_postings);
             }
+        }
+
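+        // Sort the bare matches by query index so that the two words of a
+        // phrase query end up next to each other; the loop below intersects
+        // their postings lists and keeps only the positions where the second
+        // word directly follows the first one.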
+        bare_matches.sort_unstable_by_key(|m| m.query_index);
+
+        let mut previous_word = None;
+        for i in 0..bare_matches.len() {
+            let a = &bare_matches[i];
+            let auta = &automatons[a.query_index as usize];
+
+            match auta.phrase_query {
+                Some((0, _)) => {
+                    let b = match bare_matches.get(i + 1) {
+                        Some(b) => b,
+                        None => {
+                            postings_lists[a.postings_list].rewrite_with(SetBuf::default());
+                            continue;
+                        }
+                    };
+
+                    if a.query_index + 1 != b.query_index {
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::default());
+                        continue
+                    }
+
+                    let pla = &postings_lists[a.postings_list];
+                    let plb = &postings_lists[b.postings_list];
+
+                    let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
+                        a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
+                    });
+
+                    let mut newa = Vec::new();
+                    let mut newb = Vec::new();
+
+                    for eb in iter {
+                        if let EitherOrBoth::Both(a, b) = eb {
+                            newa.push(*a);
+                            newb.push(*b);
+                        }
+                    }
+
+                    if !newa.is_empty() {
+                        previous_word = Some(a.query_index);
+                    }
+
+                    postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
+                    postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
+                },
+                Some((1, _)) => {
+                    if previous_word.take() != Some(a.query_index - 1) {
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::default());
+                    }
+                },
+                Some((_, _)) => unreachable!(),
+                None => (),
+            }
+        }
+
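+        // The phrase-query filtering above may have emptied every postings
+        // list of this document: in that case it is not a match anymore.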
+        if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
+            return None
+        }
+
+        Some(RawDocument {
+            id: bare_matches[0].document_id,
+            bare_matches,
+            processed_matches: Vec::new(),
+            processed_distances: Vec::new(),
+            contains_one_word_field: false,
         })
-        .collect()
-}
-
-#[derive(Debug, Copy, Clone)]
-struct Range {
-    start: usize,
-    end: usize,
-}
-
-#[derive(Clone)]
-pub struct SharedMatches {
-    range: Range,
-    matches: Arc<Matches>,
-}
-
-#[derive(Clone)]
-struct Matches {
-    query_index: Vec<u32>,
-    distance: Vec<u8>,
-    attribute: Vec<u16>,
-    word_index: Vec<u16>,
-    is_exact: Vec<bool>,
-}
-
-impl Matches {
-    fn with_capacity(cap: usize) -> Matches {
-        Matches {
-            query_index: Vec::with_capacity(cap),
-            distance: Vec::with_capacity(cap),
-            attribute: Vec::with_capacity(cap),
-            word_index: Vec::with_capacity(cap),
-            is_exact: Vec::with_capacity(cap),
-        }
-    }
-
-    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
-        for (_, match_) in matches {
-            self.query_index.push(match_.query_index);
-            self.distance.push(match_.distance);
-            self.attribute.push(match_.attribute);
-            self.word_index.push(match_.word_index);
-            self.is_exact.push(match_.is_exact);
-        }
     }
 }
diff --git a/meilisearch-core/src/reordered_attrs.rs b/meilisearch-core/src/reordered_attrs.rs
index b2f9f1d6c..590cac7b2 100644
--- a/meilisearch-core/src/reordered_attrs.rs
+++ b/meilisearch-core/src/reordered_attrs.rs
@@ -1,27 +1,31 @@
+use std::cmp;
+
 #[derive(Default, Clone)]
 pub struct ReorderedAttrs {
-    count: usize,
     reorders: Vec<Option<u16>>,
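+    /// Maps a reordered attribute position back to the original attribute.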
+ "_words_position" => builder.push(WordsPosition), "_exact" => builder.push(Exact), _ => { let order = match ranking_rules.get(rule.as_str()) { @@ -340,11 +340,11 @@ impl<'a> SearchBuilder<'a> { builder.push(DocumentId); return Ok(Some(builder.build())); } else { - builder.push(SumOfTypos); - builder.push(NumberOfWords); - builder.push(WordsProximity); - builder.push(SumOfWordsAttribute); - builder.push(SumOfWordsPosition); + builder.push(Typo); + builder.push(Words); + builder.push(Proximity); + builder.push(Attribute); + builder.push(WordsPosition); builder.push(Exact); for (rule, order) in ranking_rules.iter() { let custom_ranking = match order {