diff --git a/Cargo.lock b/Cargo.lock
index 2dedeb04a..8034a4add 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -257,6 +257,11 @@ dependencies = [
  "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
+[[package]]
+name = "compact_arena"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
 [[package]]
 name = "const-random"
 version = "0.1.6"
@@ -937,6 +942,7 @@ dependencies = [
  "bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)",
+ "compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -946,6 +952,7 @@ dependencies = [
  "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
  "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
  "meilisearch-schema 0.8.4",
@@ -2648,6 +2655,7 @@ dependencies = [
 "checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c"
 "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
+"checksum compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ab08c5bed92075075d5db5149887a477b2dc0318c40882a0dfbd34315ac6141"
 "checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe"
 "checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59"
 "checksum cookie 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5"
diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml
index a268c6605..62da7cfb8 100644
--- a/meilisearch-core/Cargo.toml
+++ b/meilisearch-core/Cargo.toml
@@ -10,6 +10,7 @@ arc-swap = "0.4.3"
 bincode = "1.1.4"
 byteorder = "1.3.2"
 chrono = { version = "0.4.9", features = ["serde"] }
+compact_arena = "0.4.0"
 crossbeam-channel = "0.4.0"
 deunicode = "1.0.0"
 env_logger = "0.7.0"
@@ -35,6 +36,7 @@ assert_matches = "1.3"
 criterion = "0.3"
 csv = "1.0.7"
 indexmap = { version = "1.2.0", features = ["serde-1"] }
+jemallocator = "0.3.2"
 rustyline = { version = "5.0.0", default-features = false }
 structopt = "0.3.2"
 tempfile = "3.1.0"
diff --git a/meilisearch-core/examples/from_file.rs b/meilisearch-core/examples/from_file.rs
index dff8d1b2a..c0b50362c 100644
--- a/meilisearch-core/examples/from_file.rs
+++ b/meilisearch-core/examples/from_file.rs
@@ -1,5 +1,5 @@
-use std::collections::btree_map::{BTreeMap, Entry};
 use std::collections::HashSet;
+use std::collections::btree_map::{BTreeMap, Entry};
 use std::error::Error;
 use std::io::{Read, Write};
 use std::iter::FromIterator;
@@ -15,6 +15,10 @@ use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
 use meilisearch_core::{Database, Highlight, ProcessedUpdateResult};
 use meilisearch_schema::SchemaAttr;
 
+// #[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
 #[derive(Debug, StructOpt)]
 struct IndexCommand {
     /// The destination where the database must be created.
diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs
index 3fd86c73d..406d72ce2 100644
--- a/meilisearch-core/src/automaton/mod.rs
+++ b/meilisearch-core/src/automaton/mod.rs
@@ -13,11 +13,11 @@ use crate::database::MainT;
 use crate::error::MResult;
 use crate::store;
 
-use self::dfa::{build_dfa, build_prefix_dfa};
+pub use self::dfa::{build_dfa, build_prefix_dfa};
 pub use self::query_enhancer::QueryEnhancer;
-use self::query_enhancer::QueryEnhancerBuilder;
+pub use self::query_enhancer::QueryEnhancerBuilder;
 
-const NGRAMS: usize = 3;
+pub const NGRAMS: usize = 3;
 
 pub struct AutomatonProducer {
     automatons: Vec<AutomatonGroup>,
 }
@@ -145,7 +145,7 @@ pub fn normalize_str(string: &str) -> String {
     string
 }
 
-fn split_best_frequency<'a>(
+pub fn split_best_frequency<'a>(
     reader: &heed::RoTxn<MainT>,
     word: &'a str,
     postings_lists_store: store::PostingsLists,
diff --git a/meilisearch-core/src/automaton/query_enhancer.rs b/meilisearch-core/src/automaton/query_enhancer.rs
index 3b88b1157..5f2ac53cf 100644
--- a/meilisearch-core/src/automaton/query_enhancer.rs
+++ b/meilisearch-core/src/automaton/query_enhancer.rs
@@ -143,8 +143,7 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
         // we need to pad real query indices
         let real_range = real..real + replacement.len().max(range.len());
         let real_length = replacement.len();
-        self.real_to_origin
-            .push((real_range, (range.start, real_length)));
+        self.real_to_origin.push((real_range, (range.start, real_length)));
     }
 
     pub fn build(self) -> QueryEnhancer {
@@ -162,7 +161,7 @@ pub struct QueryEnhancer {
 }
 
 impl QueryEnhancer {
-    /// Returns the query indices to use to replace this real query index.
+    /// Returns the query indices that represent this real query index.
     pub fn replacement(&self, real: u32) -> Range<u32> {
         let real = real as usize;
diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
new file mode 100644
index 000000000..0fb1fed3b
--- /dev/null
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -0,0 +1,467 @@
+use std::ops::Deref;
+use std::borrow::Cow;
+use std::cmp::Ordering;
+use std::collections::HashSet;
+use std::io::Write;
+use std::mem;
+use std::ops::Range;
+use std::rc::Rc;
+use std::time::{Duration, Instant};
+
+use compact_arena::{SmallArena, Idx32, mk_arena};
+use fst::{IntoStreamer, Streamer};
+use levenshtein_automata::DFA;
+use log::debug;
+use meilisearch_tokenizer::{is_cjk, split_query_string};
+use meilisearch_types::{DocIndex, Highlight};
+use sdset::Set;
+use slice_group_by::{GroupBy, GroupByMut};
+
+use crate::automaton::NGRAMS;
+use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
+use crate::automaton::{build_dfa, build_prefix_dfa};
+use crate::automaton::{normalize_str, split_best_frequency};
+
+use crate::criterion2::*;
+use crate::levenshtein::prefix_damerau_levenshtein;
+use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
+use crate::{store, Document, DocumentId, MResult};
+
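+// The new search entry point, replacing raw_query:
+//  1. construct_automatons2 builds one DFA per query word, plus derived
+//     automatons (synonyms, two-word splits, ngram concatenations) and a
+//     QueryEnhancer that maps those derived automatons back to the
+//     original query indices;
+//  2. fetch_matches streams the words FST through every DFA and emits one
+//     BareMatch per (document, matched word), the postings lists being
+//     kept out of band in a compact_arena;
+//  3. bare matches are grouped by document id into RawDocuments;
+//  4. each criterion refines the groups left equal by the previous one,
+//     and sorting stops as soon as range.end documents are fully ordered.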
+pub fn bucket_sort<'c>(
+    reader: &heed::RoTxn<MainT>,
+    query: &str,
+    range: Range<usize>,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    synonyms_store: store::Synonyms,
+) -> MResult<Vec<Document>>
+{
+    // let automatons = construct_automatons(query);
+    let (automatons, query_enhancer) =
+        construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
+
+    let before_postings_lists_fetching = Instant::now();
+    mk_arena!(arena);
+    let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
+    debug!("bare matches ({}) retrieved in {:.02?}",
+        bare_matches.len(),
+        before_postings_lists_fetching.elapsed(),
+    );
+
+    let before_raw_documents_presort = Instant::now();
+    bare_matches.sort_unstable_by_key(|sm| sm.document_id);
+    debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
+
+    let before_raw_documents_building = Instant::now();
+    let mut raw_documents = Vec::new();
+    for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
+        raw_documents.push(RawDocument { raw_matches, processed_matches: None });
+    }
+    debug!("creating {} candidate documents took {:.02?}",
+        raw_documents.len(),
+        before_raw_documents_building.elapsed(),
+    );
+
+    dbg!(mem::size_of::<BareMatch>());
+    dbg!(mem::size_of::<SimpleMatch>());
+
+    let mut groups = vec![raw_documents.as_mut_slice()];
+
+    let criteria = [
+        Box::new(Typo) as Box<dyn Criterion>,
+        Box::new(Words),
+        Box::new(Proximity),
+        Box::new(Attribute),
+        Box::new(WordsPosition),
+        Box::new(Exact),
+        Box::new(StableDocId),
+    ];
+
+    'criteria: for criterion in &criteria {
+        let tmp_groups = mem::replace(&mut groups, Vec::new());
+        let mut documents_seen = 0;
+
+        for mut group in tmp_groups {
+            let before_criterion_preparation = Instant::now();
+            criterion.prepare(&mut group, &mut arena, &query_enhancer);
+            debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
+
+            let before_criterion_sort = Instant::now();
+            group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena));
+            debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
+
+            for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) {
+                debug!("{:?} produced a group of size {}", criterion.name(), group.len());
+
+                documents_seen += group.len();
+                groups.push(group);
+
+                // we have sorted enough documents once the last one is
+                // beyond the end of the requested range, so we can move
+                // on to the next criterion
+                if documents_seen >= range.end {
+                    continue 'criteria;
+                }
+            }
+        }
+    }
+
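+    // Only documents of the requested range are materialized. Every doc
+    // index of a match's postings list becomes a Highlight whose covered
+    // area, capped at the indexed word length, is the number of bytes of
+    // the matched word covered by the query word (the second value
+    // returned by prefix_damerau_levenshtein).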
+    let iter = raw_documents.into_iter().skip(range.start).take(range.len());
+    let iter = iter.map(|d| {
+        let highlights = d.raw_matches.iter().flat_map(|sm| {
+            let postings_list = &arena[sm.postings_list];
+            let input = postings_list.input();
+            let query = &automatons[sm.query_index as usize].query;
+            postings_list.iter().map(move |m| {
+                let covered_area = if query.len() > input.len() {
+                    input.len()
+                } else {
+                    prefix_damerau_levenshtein(query.as_bytes(), input).1
+                };
+                Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 }
+            })
+        }).collect();
+
+        Document {
+            id: d.raw_matches[0].document_id,
+            highlights,
+            #[cfg(test)] matches: Vec::new(),
+        }
+    });
+
+    Ok(iter.collect())
+}
+
+pub struct RawDocument<'a, 'tag> {
+    pub raw_matches: &'a mut [BareMatch<'tag>],
+    pub processed_matches: Option<Vec<SimpleMatch>>,
+}
+
+pub struct BareMatch<'tag> {
+    pub document_id: DocumentId,
+    pub query_index: u16,
+    pub distance: u8,
+    pub is_exact: bool,
+    pub postings_list: Idx32<'tag>,
+}
+
+// TODO remove that
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct SimpleMatch {
+    pub query_index: u16,
+    pub distance: u8,
+    pub attribute: u16,
+    pub word_index: u16,
+    pub is_exact: bool,
+}
+
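+// A PostingsListView is a cheap, clonable view over a slice of a postings
+// list: the matched word and its Set<DocIndex> are shared behind Rcs, so
+// range() only bumps reference counts and adjusts (offset, len) instead of
+// copying any doc index.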
+#[derive(Clone)]
+pub struct PostingsListView<'txn> {
+    input: Rc<[u8]>,
+    postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
+    offset: usize,
+    len: usize,
+}
+
+impl<'txn> PostingsListView<'txn> {
+    pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
+        let len = postings_list.len();
+        PostingsListView { input, postings_list, offset: 0, len }
+    }
+
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    pub fn input(&self) -> &[u8] {
+        &self.input
+    }
+
+    pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> {
+        assert!(offset + len <= self.len);
+        PostingsListView {
+            input: self.input.clone(),
+            postings_list: self.postings_list.clone(),
+            offset: self.offset + offset,
+            len,
+        }
+    }
+}
+
+impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
+    fn as_ref(&self) -> &Set<DocIndex> {
+        Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
+    }
+}
+
+impl Deref for PostingsListView<'_> {
+    type Target = Set<DocIndex>;
+
+    fn deref(&self) -> &Set<DocIndex> {
+        Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
+    }
+}
+
+fn fetch_matches<'txn, 'tag>(
+    reader: &'txn heed::RoTxn<MainT>,
+    automatons: &[QueryWordAutomaton],
+    arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+) -> MResult<Vec<BareMatch<'tag>>>
+{
+    let before_words_fst = Instant::now();
+    let words = match main_store.words_fst(reader)? {
+        Some(words) => words,
+        None => return Ok(Vec::new()),
+    };
+    debug!("words fst took {:.02?}", before_words_fst.elapsed());
+
+    let mut total_postings_lists = Vec::new();
+
+    let mut dfa_time = Duration::default();
+    let mut stream_next_time = Duration::default();
+    let mut postings_lists_fetching_time = Duration::default();
+
+    for (query_index, automaton) in automatons.iter().enumerate() {
+        let before_dfa = Instant::now();
+        let dfa = automaton.dfa();
+        let QueryWordAutomaton { index, query, is_exact, is_prefix } = automaton;
+        dfa_time += before_dfa.elapsed();
+
+        let mut number_of_words = 0;
+
+        let before_fst_search = Instant::now();
+        let mut stream = words.search(&dfa).into_stream();
+        debug!("fst search took {:.02?}", before_fst_search.elapsed());
+
+        // while let Some(input) = stream.next() {
+        loop {
+            let before_stream_next = Instant::now();
+            let input = match stream.next() {
+                Some(input) => input,
+                None => break,
+            };
+            stream_next_time += before_stream_next.elapsed();
+
+            number_of_words += 1;
+
+            let distance = dfa.eval(input).to_u8();
+            let is_exact = *is_exact && distance == 0 && input.len() == query.len();
+
+            let before_postings_lists_fetching = Instant::now();
+            if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
+
+                let input = Rc::from(input);
+                let postings_list = Rc::new(postings_list);
+                let postings_list_view = PostingsListView::new(input, postings_list);
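+                // a postings list is sorted by document id, so a single
+                // linear pass splits it into per-document ranges; each
+                // range lives in the arena and is referenced from its
+                // BareMatch through a lightweight Idx32 handle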
+                let mut offset = 0;
+                for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
+
+                    let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
+                    let document_id = group[0].document_id;
+                    let stuffed = BareMatch {
+                        document_id,
+                        query_index: query_index as u16,
+                        distance,
+                        is_exact,
+                        postings_list: posting_list_index,
+                    };
+
+                    total_postings_lists.push(stuffed);
+                    offset += group.len();
+                }
+            }
+            postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
+        }
+
+        debug!("{:?} gives {} words", query, number_of_words);
+    }
+
+    debug!("stream next took {:.02?}", stream_next_time);
+    debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
+    debug!("dfa creation took {:.02?}", dfa_time);
+
+    Ok(total_postings_lists)
+}
+
+#[derive(Debug)]
+pub struct QueryWordAutomaton {
+    index: usize,
+    query: String,
+    /// Is it a word that must be considered exact
+    /// or is it some derived word (i.e. a synonym)?
+    is_exact: bool,
+    is_prefix: bool,
+}
+
+impl QueryWordAutomaton {
+    pub fn exact(query: &str, index: usize) -> QueryWordAutomaton {
+        QueryWordAutomaton { index, query: query.to_string(), is_exact: true, is_prefix: false }
+    }
+
+    pub fn exact_prefix(query: &str, index: usize) -> QueryWordAutomaton {
+        QueryWordAutomaton { index, query: query.to_string(), is_exact: true, is_prefix: true }
+    }
+
+    pub fn non_exact(query: &str, index: usize) -> QueryWordAutomaton {
+        QueryWordAutomaton { index, query: query.to_string(), is_exact: false, is_prefix: false }
+    }
+
+    pub fn dfa(&self) -> DFA {
+        if self.is_prefix {
+            build_prefix_dfa(&self.query)
+        } else {
+            build_dfa(&self.query)
+        }
+    }
+}
+
+// fn construct_automatons(query: &str) -> Vec<QueryWordAutomaton> {
+//     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
+//     let mut original_words = split_query_string(query).map(str::to_lowercase).peekable();
+//     let mut automatons = Vec::new();
+
+//     while let Some(word) = original_words.next() {
+//         let has_following_word = original_words.peek().is_some();
+//         let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
+
+//         let automaton = if not_prefix_dfa {
+//             QueryWordAutomaton::exact(word)
+//         } else {
+//             QueryWordAutomaton::exact_prefix(word)
+//         };
+
+//         automatons.push(automaton);
+//     }
+
+//     automatons
+// }
+
+fn construct_automatons2(
+    reader: &heed::RoTxn<MainT>,
+    query: &str,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    synonym_store: store::Synonyms,
+) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
+    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
+    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
+    let synonyms = match main_store.synonyms_fst(reader)? {
+        Some(synonym) => synonym,
+        None => fst::Set::default(),
+    };
+
+    let mut automaton_index = 0;
+    let mut automatons = Vec::new();
+    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
+
+    // We must not declare the original words to the query enhancer
+    // *but* we need to push them in the automatons list first
+    let mut original_words = query_words.iter().peekable();
+    while let Some(word) = original_words.next() {
+        let has_following_word = original_words.peek().is_some();
+        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
+
+        let automaton = if not_prefix_dfa {
+            QueryWordAutomaton::exact(word, automaton_index)
+        } else {
+            QueryWordAutomaton::exact_prefix(word, automaton_index)
+        };
+        automaton_index += 1;
+        automatons.push(automaton);
+    }
+
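+    // Derived automatons: for every ngram of 1 to NGRAMS query words we add
+    // the alternatives found in the synonyms store; single words may also be
+    // split into two indexed words, and multi-word ngrams are also tried as
+    // their concatenation (e.g. "new york" -> "newyork"). Each alternative
+    // is declared to the QueryEnhancer so that its matches can be mapped
+    // back to the original query indices.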
"new " do not but "new yo" triggers alternatives to "new york" + let base = std::str::from_utf8(base).unwrap(); + let base_nb_words = split_query_string(base).count(); + if ngram_nb_words != base_nb_words { + continue; + } + + if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? { + let mut stream = synonyms.into_stream(); + while let Some(synonyms) = stream.next() { + let synonyms = std::str::from_utf8(synonyms).unwrap(); + let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); + let nb_synonym_words = synonyms_words.len(); + + let real_query_index = automaton_index; + enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words); + + for synonym in synonyms_words { + let automaton = if nb_synonym_words == 1 { + QueryWordAutomaton::exact(synonym, automaton_index) + } else { + QueryWordAutomaton::non_exact(synonym, automaton_index) + }; + automaton_index += 1; + automatons.push(automaton); + } + } + } + } + + if n == 1 { + if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { + let left_automaton = QueryWordAutomaton::exact(left, automaton_index); + enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); + automaton_index += 1; + automatons.push(left_automaton); + + let right_automaton = QueryWordAutomaton::exact(right, automaton_index); + enhancer_builder.declare(query_range.clone(), automaton_index, &[right]); + automaton_index += 1; + automatons.push(right_automaton); + + } + } else { + // automaton of concatenation of query words + let concat = ngram_slice.concat(); + let normalized = normalize_str(&concat); + + let real_query_index = automaton_index; + enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); + + let automaton = QueryWordAutomaton::exact(&normalized, automaton_index); + automaton_index += 1; + automatons.push(automaton); + } + } + } + + // // order automatons, the most important first, + // // we keep the original automatons at the front. 
+    // // order automatons, the most important first,
+    // // we keep the original automatons at the front.
+    // automatons[1..].sort_by_key(|group| {
+    //     let a = group.automatons.first().unwrap();
+    //     (
+    //         Reverse(a.is_exact),
+    //         a.ngram,
+    //         Reverse(group.automatons.len()),
+    //     )
+    // });
+
+    Ok((automatons, enhancer_builder.build()))
+}
diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs
new file mode 100644
index 000000000..469f936fa
--- /dev/null
+++ b/meilisearch-core/src/criterion2.rs
@@ -0,0 +1,479 @@
+use std::cmp::{self, Ordering, Reverse};
+use std::borrow::Cow;
+use std::sync::atomic::{self, AtomicUsize};
+
+use slice_group_by::{GroupBy, GroupByMut};
+use compact_arena::SmallArena;
+use sdset::{Set, SetBuf};
+
+use crate::{DocIndex, DocumentId};
+use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView};
+use crate::automaton::QueryEnhancer;
+
+type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>;
+
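+// The contract mirrors the main loop of bucket_sort: prepare() is called
+// once per group to sort or precompute the per-document data a criterion
+// needs, evaluate() compares two documents, and eq() (evaluate() == Equal)
+// is what splits a sorted group into the smaller buckets handed over to
+// the next criterion.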
+pub trait Criterion {
+    fn name(&self) -> &str;
+
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+        query_enhancer: &QueryEnhancer,
+    );
+
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> Ordering;
+
+    #[inline]
+    fn eq<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> bool
+    {
+        self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal
+    }
+}
+
+pub struct Typo;
+
+impl Criterion for Typo {
+    fn name(&self) -> &str { "typo" }
+
+    fn prepare(
+        &self,
+        documents: &mut [RawDocument],
+        postings_lists: &mut PostingsListsArena,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        for document in documents {
+            document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, bm.distance));
+        }
+    }
+
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &PostingsListsArena,
+    ) -> Ordering
+    {
+        // This is a rough, hand-rolled base-10 logarithm of n + 1.
+        // It is safe to panic on inputs higher than 3:
+        // the number of typos is never bigger than that.
+        #[inline]
+        fn custom_log10(n: u8) -> f32 {
+            match n {
+                0 => 0.0,     // log(1)
+                1 => 0.30102, // log(2)
+                2 => 0.47712, // log(3)
+                3 => 0.60205, // log(4)
+                _ => panic!("invalid number"),
+            }
+        }
+
+        #[inline]
+        fn compute_typos(matches: &[BareMatch]) -> usize {
+            let mut number_words: usize = 0;
+            let mut sum_typos = 0.0;
+
+            for group in matches.linear_group_by_key(|bm| bm.query_index) {
+                sum_typos += custom_log10(group[0].distance);
+                number_words += 1;
+            }
+
+            (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
+        }
+
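+        // e.g. a document matching both words of a two-word query, one of
+        // them with a single typo, scores
+        // (2.0 / (0.30102 + 1.0) * 1000.0) as usize = 1537, while matching
+        // both words exactly scores 2000: more matched words and fewer
+        // typos both raise the score, hence the reversed ordering below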
+        let lhs = compute_typos(&lhs.raw_matches);
+        let rhs = compute_typos(&rhs.raw_matches);
+
+        lhs.cmp(&rhs).reverse()
+    }
+}
+
+pub struct Words;
+
+impl Criterion for Words {
+    fn name(&self) -> &str { "words" }
+
+    fn prepare(
+        &self,
+        documents: &mut [RawDocument],
+        postings_lists: &mut PostingsListsArena,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        for document in documents {
+            document.raw_matches.sort_unstable_by_key(|bm| bm.query_index);
+        }
+    }
+
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &PostingsListsArena,
+    ) -> Ordering
+    {
+        #[inline]
+        fn number_of_query_words(matches: &[BareMatch]) -> usize {
+            matches.linear_group_by_key(|bm| bm.query_index).count()
+        }
+
+        let lhs = number_of_query_words(&lhs.raw_matches);
+        let rhs = number_of_query_words(&rhs.raw_matches);
+
+        lhs.cmp(&rhs).reverse()
+    }
+}
+
+fn process_raw_matches<'a, 'tag, 'txn>(
+    documents: &mut [RawDocument<'a, 'tag>],
+    postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+    query_enhancer: &QueryEnhancer,
+) {
+    for document in documents {
+        if document.processed_matches.is_some() { continue }
+
+        let mut processed = Vec::new();
+        let document_id = document.raw_matches[0].document_id;
+
+        for m in document.raw_matches.iter() {
+            let postings_list = &postings_lists[m.postings_list];
+            processed.reserve(postings_list.len());
+            for di in postings_list.as_ref() {
+                let simple_match = SimpleMatch {
+                    query_index: m.query_index,
+                    distance: m.distance,
+                    attribute: di.attribute,
+                    word_index: di.word_index,
+                    is_exact: m.is_exact,
+                };
+                processed.push(simple_match);
+            }
+        }
+
+        let processed = multiword_rewrite_matches(&mut processed, query_enhancer);
+        document.processed_matches = Some(processed.into_vec());
+    }
+}
+
+pub struct Proximity;
+
+impl Criterion for Proximity {
+    fn name(&self) -> &str { "proximity" }
+
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        process_raw_matches(documents, postings_lists, query_enhancer);
+    }
+
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> Ordering
+    {
+        const MAX_DISTANCE: u16 = 8;
+
+        fn index_proximity(lhs: u16, rhs: u16) -> u16 {
+            if lhs < rhs {
+                cmp::min(rhs - lhs, MAX_DISTANCE)
+            } else {
+                cmp::min(lhs - rhs, MAX_DISTANCE) + 1
+            }
+        }
+
+        fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
+            if lhs.attribute != rhs.attribute { MAX_DISTANCE }
+            else { index_proximity(lhs.word_index, rhs.word_index) }
+        }
+
+        fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
+            let mut min_prox = u16::max_value();
+            for a in lhs {
+                for b in rhs {
+                    let prox = attribute_proximity(*a, *b);
+                    min_prox = cmp::min(min_prox, prox);
+                }
+            }
+            min_prox
+        }
+
+        fn matches_proximity(matches: &[SimpleMatch]) -> u16 {
+            let mut proximity = 0;
+            let mut iter = matches.linear_group_by_key(|m| m.query_index);
+
+            // iterate over groups by windows of size 2
+            let mut last = iter.next();
+            while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
+                proximity += min_proximity(lhs, rhs);
+                last = Some(rhs);
+            }
+
+            proximity
+        }
+
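+        // e.g. "new" at word index 2 and "york" at word index 3 in the same
+        // attribute cost index_proximity(2, 3) = 1; in the reverse order
+        // they cost 2, and matches in different attributes always cost
+        // MAX_DISTANCE (8); consecutive query word groups are compared
+        // pairwise and summed, the lowest total proximity ranking first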
+        let lhs = matches_proximity(&lhs.processed_matches.as_ref().unwrap());
+        let rhs = matches_proximity(&rhs.processed_matches.as_ref().unwrap());
+
+        lhs.cmp(&rhs)
+    }
+}
+
+pub struct Attribute;
+
+impl Criterion for Attribute {
+    fn name(&self) -> &str { "attribute" }
+
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        process_raw_matches(documents, postings_lists, query_enhancer);
+    }
+
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> Ordering
+    {
+        #[inline]
+        fn sum_attribute(matches: &[SimpleMatch]) -> usize {
+            let mut sum_attribute = 0;
+            for group in matches.linear_group_by_key(|bm| bm.query_index) {
+                sum_attribute += group[0].attribute as usize;
+            }
+            sum_attribute
+        }
+
+        let lhs = sum_attribute(&lhs.processed_matches.as_ref().unwrap());
+        let rhs = sum_attribute(&rhs.processed_matches.as_ref().unwrap());
+
+        lhs.cmp(&rhs)
+    }
+}
+
+pub struct WordsPosition;
+
+impl Criterion for WordsPosition {
+    fn name(&self) -> &str { "words position" }
+
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        process_raw_matches(documents, postings_lists, query_enhancer);
+    }
+
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> Ordering
+    {
+        #[inline]
+        fn sum_words_position(matches: &[SimpleMatch]) -> usize {
+            let mut sum_words_position = 0;
+            for group in matches.linear_group_by_key(|bm| bm.query_index) {
+                sum_words_position += group[0].word_index as usize;
+            }
+            sum_words_position
+        }
+
+        let lhs = sum_words_position(&lhs.processed_matches.as_ref().unwrap());
+        let rhs = sum_words_position(&rhs.processed_matches.as_ref().unwrap());
+
+        lhs.cmp(&rhs)
+    }
+}
+
+pub struct Exact;
+
+impl Criterion for Exact {
+    fn name(&self) -> &str { "exact" }
+
+    fn prepare(
+        &self,
+        documents: &mut [RawDocument],
+        postings_lists: &mut PostingsListsArena,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        for document in documents {
+            document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
+        }
+    }
+
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &PostingsListsArena,
+    ) -> Ordering
+    {
+        #[inline]
+        fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
+            let mut sum_exact_query_words = 0;
+
+            for group in matches.linear_group_by_key(|bm| bm.query_index) {
+                sum_exact_query_words += group[0].is_exact as usize;
+            }
+
+            sum_exact_query_words
+        }
+
+        let lhs = sum_exact_query_words(&lhs.raw_matches);
+        let rhs = sum_exact_query_words(&rhs.raw_matches);
+
+        lhs.cmp(&rhs).reverse()
+    }
+}
+
+pub struct StableDocId;
+
+impl Criterion for StableDocId {
+    fn name(&self) -> &str { "stable document id" }
+
+    fn prepare(
+        &self,
+        documents: &mut [RawDocument],
+        postings_lists: &mut PostingsListsArena,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        // ...
+    }
+
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &PostingsListsArena,
+    ) -> Ordering
+    {
+        let lhs = &lhs.raw_matches[0].document_id;
+        let rhs = &rhs.raw_matches[0].document_id;
+
+        lhs.cmp(rhs)
+    }
+}
+
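+// When one match stands for a multi-word alternative (e.g. the single word
+// "nyc" standing for "new york"), the matches following it in the same
+// attribute are shifted by a padding and virtual matches are emitted for
+// the query words it replaces, so that word distances stay comparable
+// with documents containing the original words themselves.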
+pub fn multiword_rewrite_matches(
+    matches: &mut [SimpleMatch],
+    query_enhancer: &QueryEnhancer,
+) -> SetBuf<SimpleMatch>
+{
+    let mut padded_matches = Vec::with_capacity(matches.len());
+
+    // let before_sort = Instant::now();
+    // we sort the matches by word index to make them rewritable
+    matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
+    // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
+
+    // let before_padding = Instant::now();
+    // for each attribute of each document
+    for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
+        // padding will only be applied
+        // to word indices in the same attribute
+        let mut padding = 0;
+        let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
+
+        // for each match at the same position
+        // in this document attribute
+        while let Some(same_word_index) = iter.next() {
+            // find the biggest padding
+            let mut biggest = 0;
+            for match_ in same_word_index {
+                let mut replacement = query_enhancer.replacement(match_.query_index as u32);
+                let replacement_len = replacement.len();
+                let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
+
+                if let Some(query_index) = replacement.next() {
+                    let word_index = match_.word_index + padding as u16;
+                    let query_index = query_index as u16;
+                    let match_ = SimpleMatch { query_index, word_index, ..*match_ };
+                    padded_matches.push(match_);
+                }
+
+                let mut found = false;
+
+                // look ahead and if there already is a match
+                // corresponding to this padding word, abort the padding
+                'padding: for (x, next_group) in nexts.enumerate() {
+                    for (i, query_index) in replacement.clone().enumerate().skip(x) {
+                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                        let query_index = query_index as u16;
+                        let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
+
+                        for nmatch_ in next_group {
+                            let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
+                            let query_index = rep.next().unwrap() as u16;
+                            if query_index == padmatch.query_index {
+                                if !found {
+                                    // if we find a corresponding padding for the
+                                    // first time we must push preceding paddings
+                                    for (i, query_index) in replacement.clone().enumerate().take(i)
+                                    {
+                                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                                        let query_index = query_index as u16;
+                                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
+                                        padded_matches.push(match_);
+                                        biggest = biggest.max(i + 1);
+                                    }
+                                }
+
+                                padded_matches.push(padmatch);
+                                found = true;
+                                continue 'padding;
+                            }
+                        }
+                    }
+
+                    // if we do not find a corresponding padding in the
+                    // next groups, stop here and pad what was found
+                    break;
+                }
+
+                if !found {
+                    // if no padding was found in the following matches
+                    // we must insert the entire padding
+                    for (i, query_index) in replacement.enumerate() {
+                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                        let query_index = query_index as u16;
+                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
+                        padded_matches.push(match_);
+                    }
+
+                    biggest = biggest.max(replacement_len - 1);
+                }
+            }
+
+            padding += biggest;
+        }
+    }
+
+    // debug!("padding matches took {:.02?}", before_padding.elapsed());
+
+    // With this check we can see that the loop above takes something
+    // like 43% of the search time even when no rewrite is needed.
+    // assert_eq!(before_matches, padded_matches);
+
+    SetBuf::from_dirty(padded_matches)
+}
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 0bc07e27e..3a54168b4 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -18,6 +18,10 @@ pub mod serde;
 pub mod store;
 mod update;
 
+// TODO replace
+mod bucket_sort;
+mod criterion2;
+
 pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT};
 pub use self::error::{Error, MResult};
 pub use self::number::{Number, ParseNumberError};
diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index 44f1a1028..7edda5294 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -10,7 +10,7 @@ use log::debug;
 use sdset::SetBuf;
 use slice_group_by::{GroupBy, GroupByMut};
 
-use crate::database::MainT;
+use crate::{bucket_sort::bucket_sort, database::MainT};
 use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
 use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::levenshtein::prefix_damerau_levenshtein;
@@ -34,19 +34,14 @@ fn multiword_rewrite_matches(
     mut matches: Vec<(DocumentId, TmpMatch)>,
     query_enhancer: &QueryEnhancer,
 ) -> SetBuf<(DocumentId, TmpMatch)> {
-    if true {
-        let before_sort = Instant::now();
-        matches.sort_unstable();
-        let matches = SetBuf::new_unchecked(matches);
-        debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
-        return matches;
-    }
-
     let mut padded_matches = Vec::with_capacity(matches.len());
 
+    let before_sort = Instant::now();
     // we sort the matches by word index to make them rewritable
     matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
+    debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
 
+    let before_padding = Instant::now();
     // for each attribute of each document
     for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
         // padding will only be applied
@@ -145,6 +140,8 @@ fn multiword_rewrite_matches(
         document_matches.sort_unstable();
     }
 
+    debug!("padding matches took {:.02?}", before_padding.elapsed());
+
     // With this check we can see that the loop above takes something
     // like 43% of the search time even when no rewrite is needed.
     // assert_eq!(before_matches, padded_matches);
@@ -163,7 +160,18 @@ fn fetch_raw_documents(
     let mut matches = Vec::new();
     let mut highlights = Vec::new();
 
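+    // load the words FST once per search instead of re-loading it for
+    // every automaton (see its removal from the inner loop below)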
+    let words = match main_store.words_fst(reader)? {
+        Some(words) => words,
+        None => return Ok(Vec::new()),
+    };
+
     let before_automatons_groups_loop = Instant::now();
+    let mut doc_indexes_rewrite = Duration::default();
+    let mut retrieve_postings_lists = Duration::default();
+    let mut stream_reserve = Duration::default();
+    let mut covered_area_time = Duration::default();
+    let mut eval_time = Duration::default();
+
     for group in automatons_groups {
         let AutomatonGroup { is_phrase_query, automatons } = group;
         let phrase_query_len = automatons.len();
@@ -173,29 +181,39 @@ fn fetch_raw_documents(
             let Automaton { index, is_exact, query_len, query, .. } = automaton;
             let dfa = automaton.dfa();
 
-            let words = match main_store.words_fst(reader)? {
-                Some(words) => words,
-                None => return Ok(Vec::new()),
-            };
-
+            let before_stream_loop = Instant::now();
+            let mut stream_count = 0;
+
             let mut stream = words.search(&dfa).into_stream();
             while let Some(input) = stream.next() {
+                let before_eval_time = Instant::now();
                 let distance = dfa.eval(input).to_u8();
+                eval_time += before_eval_time.elapsed();
+
                 let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
 
+                stream_count += 1;
+
+                let before_covered_area = Instant::now();
                 let covered_area = if *query_len > input.len() {
                     input.len()
                 } else {
                     prefix_damerau_levenshtein(query.as_bytes(), input).1
                 };
+                covered_area_time += before_covered_area.elapsed();
 
+                let before_retrieve_postings_lists = Instant::now();
                 let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
                     Some(doc_indexes) => doc_indexes,
                     None => continue,
                 };
+                retrieve_postings_lists += before_retrieve_postings_lists.elapsed();
 
+                let before_stream_reserve = Instant::now();
                 tmp_matches.reserve(doc_indexes.len());
+                stream_reserve += before_stream_reserve.elapsed();
 
+                let before_doc_indexes_rewrite = Instant::now();
                 for di in doc_indexes.as_ref() {
                     let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
                     if let Some(attribute) = attribute {
@@ -219,7 +237,9 @@ fn fetch_raw_documents(
                         tmp_matches.push((di.document_id, id, match_, highlight));
                     }
                 }
+                doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed();
             }
+            debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count);
         }
 
         if *is_phrase_query {
@@ -249,6 +269,10 @@ fn fetch_raw_documents(
             }
         } else {
             let before_rerewrite = Instant::now();
+
+            matches.reserve(tmp_matches.len());
+            highlights.reserve(tmp_matches.len());
+
             for (id, _, match_, highlight) in tmp_matches {
                 matches.push((id, match_));
                 highlights.push((id, highlight));
@@ -257,13 +281,18 @@ fn fetch_raw_documents(
         }
     }
     debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
+    debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite);
+    debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists);
+    debug!("stream reserve took {:.02?}", stream_reserve);
+    debug!("covered area took {:.02?}", covered_area_time);
+    debug!("eval value took {:.02?}", eval_time);
 
-    {
-        let mut cloned = matches.clone();
-        let before_sort_test = Instant::now();
-        cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
-        debug!("sorting test took {:.02?}", before_sort_test.elapsed());
-    }
+    // {
+    //     let mut cloned = matches.clone();
+    //     let before_sort_test = Instant::now();
+    //     cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
+    //     debug!("sorting test took {:.02?}", before_sort_test.elapsed());
+    // }
 
     let before_multiword_rewrite_matches = Instant::now();
     debug!("number of matches before rewrite {}", matches.len());
@@ -279,7 +308,6 @@ fn fetch_raw_documents(
     };
     debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());
 
-
     let before_raw_documents = Instant::now();
     let raw_documents = raw_documents_from(matches, highlights);
     debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
@@ -356,29 +384,12 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         range: Range<usize>,
     ) -> MResult<Vec<Document>> {
         match self.distinct {
-            Some((distinct, distinct_size)) => raw_query_with_distinct(
+            Some((distinct, distinct_size)) => unimplemented!("distinct"),
+            None => bucket_sort(
                 reader,
                 query,
                 range,
-                self.filter,
-                distinct,
-                distinct_size,
-                self.timeout,
-                self.criteria,
-                self.searchable_attrs,
-                self.main_store,
-                self.postings_lists_store,
-                self.documents_fields_counts_store,
-                self.synonyms_store,
-            ),
-            None => raw_query(
-                reader,
-                query,
-                range,
-                self.filter,
-                self.timeout,
-                self.criteria,
-                self.searchable_attrs,
+                // self.criteria,
                 self.main_store,
                 self.postings_lists_store,
                 self.documents_fields_counts_store,
@@ -472,6 +483,8 @@ where
         }
     }
 
+    let before_bucket_sort = Instant::now();
+
     let mut groups = vec![raw_documents.as_mut_slice()];
 
     'criteria: for criterion in criteria.as_ref() {
@@ -520,6 +533,8 @@ where
         }
    }
 
+    debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed());
+
     // once we classified the documents related to the current
     // automatons we save that as the next valid result
     let iter = raw_documents