diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index ef9bf5324..e7cb9733b 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -1,13 +1,8 @@ mod dfa; -mod query_enhancer; use meilisearch_tokenizer::is_cjk; pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; -pub use self::query_enhancer::QueryEnhancer; -pub use self::query_enhancer::QueryEnhancerBuilder; - -pub const NGRAMS: usize = 3; pub fn normalize_str(string: &str) -> String { let mut string = string.to_lowercase(); diff --git a/meilisearch-core/src/automaton/query_enhancer.rs b/meilisearch-core/src/automaton/query_enhancer.rs deleted file mode 100644 index 4b7582dd5..000000000 --- a/meilisearch-core/src/automaton/query_enhancer.rs +++ /dev/null @@ -1,437 +0,0 @@ -use std::cmp::Ordering::{Equal, Greater, Less}; -use std::ops::Range; - -/// Return `true` if the specified range can accept the given replacements words. -/// Returns `false` if the replacements words are already present in the original query -/// or if there is fewer replacement words than the range to replace. -// -// -// ## Ignored because already present in original -// -// new york city subway -// -------- ^^^^ -// / \ -// [new york city] -// -// -// ## Ignored because smaller than the original -// -// new york city subway -// ------------- -// \ / -// [new york] -// -// -// ## Accepted because bigger than the original -// -// NYC subway -// --- -// / \ -// / \ -// / \ -// / \ -// / \ -// [new york city] -// -fn rewrite_range_with(query: &[S], range: Range, words: &[T]) -> bool -where - S: AsRef, - T: AsRef, -{ - if words.len() <= range.len() { - // there is fewer or equal replacement words - // than there is already in the replaced range - return false; - } - - // retrieve the part to rewrite but with the length - // of the replacement part - let original = query.iter().skip(range.start).take(words.len()); - - // check if the original query doesn't already contain - // the replacement words - !original - .map(AsRef::as_ref) - .eq(words.iter().map(AsRef::as_ref)) -} - -type Origin = usize; -type RealLength = usize; - -#[derive(Debug)] -struct FakeIntervalTree { - intervals: Vec<(Range, (Origin, RealLength))>, -} - -impl FakeIntervalTree { - fn new(mut intervals: Vec<(Range, (Origin, RealLength))>) -> FakeIntervalTree { - intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end)); - FakeIntervalTree { intervals } - } - - fn query(&self, point: usize) -> Option<(Range, (Origin, RealLength))> { - let element = self.intervals.binary_search_by(|(r, _)| { - if point >= r.start { - if point < r.end { - Equal - } else { - Less - } - } else { - Greater - } - }); - - let n = match element { - Ok(n) => n, - Err(n) => n, - }; - - match self.intervals.get(n) { - Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)), - _otherwise => None, - } - } -} - -pub struct QueryEnhancerBuilder<'a, S> { - query: &'a [S], - origins: Vec, - real_to_origin: Vec<(Range, (Origin, RealLength))>, -} - -impl> QueryEnhancerBuilder<'_, S> { - pub fn new(query: &[S]) -> QueryEnhancerBuilder { - // we initialize origins query indices based on their positions - let origins: Vec<_> = (0..=query.len()).collect(); - let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect(); - - QueryEnhancerBuilder { - query, - origins, - real_to_origin, - } - } - - /// Update the final real to origin query indices mapping. - /// - /// `range` is the original words range that this `replacement` words replace - /// and `real` is the first real query index of these replacement words. - pub fn declare(&mut self, range: Range, real: usize, replacement: &[T]) - where - T: AsRef, - { - // check if the range of original words - // can be rewritten with the replacement words - if rewrite_range_with(self.query, range.clone(), replacement) { - // this range can be replaced so we need to - // modify the origins accordingly - let offset = replacement.len() - range.len(); - - let previous_padding = self.origins[range.end - 1]; - let current_offset = (self.origins[range.end] - 1) - previous_padding; - let diff = offset.saturating_sub(current_offset); - self.origins[range.end] += diff; - - for r in &mut self.origins[range.end + 1..] { - *r += diff; - } - } - - // we need to store the real number and origins relations - // this way it will be possible to know by how many - // we need to pad real query indices - let real_range = real..real + replacement.len().max(range.len()); - let real_length = replacement.len(); - self.real_to_origin.push((real_range, (range.start, real_length))); - } - - pub fn build(self) -> QueryEnhancer { - let interval_tree = FakeIntervalTree::new(self.real_to_origin); - let mut table = Vec::new(); - - for real in 0.. { - match replacement(&self.origins, &interval_tree, real) { - Some(range) => table.push(range), - None => break, - } - } - - QueryEnhancer { table } - } -} - -/// Returns the query indices that represent this real query index. -fn replacement( - origins: &[usize], - real_to_origin: &FakeIntervalTree, - real: u32, -) -> Option> -{ - let real = real as usize; - - // query the fake interval tree with the real query index - let (range, (origin, real_length)) = real_to_origin.query(real)?; - - // if `real` is the end bound of the range - if (range.start + real_length - 1) == real { - let mut count = range.len(); - let mut new_origin = origin; - for (i, slice) in origins[new_origin..].windows(2).enumerate() { - let len = slice[1] - slice[0]; - count = count.saturating_sub(len); - if count == 0 { - new_origin = origin + i; - break; - } - } - - let n = real - range.start; - let start = origins[origin]; - let end = origins.get(new_origin + 1)?; - let remaining = (end - start) - n; - - Some(Range { - start: (start + n) as u32, - end: (start + n + remaining) as u32, - }) - } else { - // just return the origin along with - // the real position of the word - let n = real as usize - range.start; - let origin = origins[origin]; - - Some(Range { - start: (origin + n) as u32, - end: (origin + n + 1) as u32, - }) - } -} - -#[derive(Debug)] -pub struct QueryEnhancer { - table: Vec>, -} - -impl QueryEnhancer { - /// Returns the query indices that represent this real query index. - pub fn replacement(&self, real: u32) -> Range { - self.table[real as usize].clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn original_unmodified() { - let query = ["new", "york", "city", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // new york = new york city - builder.declare(0..2, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // new - assert_eq!(enhancer.replacement(1), 1..2); // york - assert_eq!(enhancer.replacement(2), 2..3); // city - assert_eq!(enhancer.replacement(3), 3..4); // subway - assert_eq!(enhancer.replacement(4), 0..1); // new - assert_eq!(enhancer.replacement(5), 1..2); // york - assert_eq!(enhancer.replacement(6), 2..3); // city - } - - #[test] - fn simple_growing() { - let query = ["new", "york", "subway"]; - // 0 1 2 - let mut builder = QueryEnhancerBuilder::new(&query); - - // new york = new york city - builder.declare(0..2, 3, &["new", "york", "city"]); - // ^ 3 4 5 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // new - assert_eq!(enhancer.replacement(1), 1..3); // york - assert_eq!(enhancer.replacement(2), 3..4); // subway - assert_eq!(enhancer.replacement(3), 0..1); // new - assert_eq!(enhancer.replacement(4), 1..2); // york - assert_eq!(enhancer.replacement(5), 2..3); // city - } - - #[test] - fn same_place_growings() { - let query = ["NY", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NY = new york - builder.declare(0..1, 2, &["new", "york"]); - // ^ 2 3 - - // NY = new york city - builder.declare(0..1, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // NY = NYC - builder.declare(0..1, 7, &["NYC"]); - // ^ 7 - - // NY = new york city - builder.declare(0..1, 8, &["new", "york", "city"]); - // ^ 8 9 10 - - // subway = underground train - builder.declare(1..2, 11, &["underground", "train"]); - // ^ 11 12 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..3); // NY - assert_eq!(enhancer.replacement(1), 3..5); // subway - assert_eq!(enhancer.replacement(2), 0..1); // new - assert_eq!(enhancer.replacement(3), 1..3); // york - assert_eq!(enhancer.replacement(4), 0..1); // new - assert_eq!(enhancer.replacement(5), 1..2); // york - assert_eq!(enhancer.replacement(6), 2..3); // city - assert_eq!(enhancer.replacement(7), 0..3); // NYC - assert_eq!(enhancer.replacement(8), 0..1); // new - assert_eq!(enhancer.replacement(9), 1..2); // york - assert_eq!(enhancer.replacement(10), 2..3); // city - assert_eq!(enhancer.replacement(11), 3..4); // underground - assert_eq!(enhancer.replacement(12), 4..5); // train - } - - #[test] - fn bigger_growing() { - let query = ["NYC", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(0..1, 2, &["new", "york", "city"]); - // ^ 2 3 4 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..3); // NYC - assert_eq!(enhancer.replacement(1), 3..4); // subway - assert_eq!(enhancer.replacement(2), 0..1); // new - assert_eq!(enhancer.replacement(3), 1..2); // york - assert_eq!(enhancer.replacement(4), 2..3); // city - } - - #[test] - fn middle_query_growing() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..6); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - } - - #[test] - fn end_query_growing() { - let query = ["NYC", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(1..2, 2, &["underground", "train"]); - // ^ 2 3 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // NYC - assert_eq!(enhancer.replacement(1), 1..3); // subway - assert_eq!(enhancer.replacement(2), 1..2); // underground - assert_eq!(enhancer.replacement(3), 2..3); // train - } - - #[test] - fn multiple_growings() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // subway = underground train - builder.declare(3..4, 7, &["underground", "train"]); - // ^ 7 8 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..7); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - assert_eq!(enhancer.replacement(7), 5..6); // underground - assert_eq!(enhancer.replacement(8), 6..7); // train - } - - #[test] - fn multiple_probable_growings() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // subway = underground train - builder.declare(3..4, 7, &["underground", "train"]); - // ^ 7 8 - - // great awesome = good - builder.declare(0..2, 9, &["good"]); - // ^ 9 - - // awesome NYC = NY - builder.declare(1..3, 10, &["NY"]); - // ^^ 10 - - // NYC subway = metro - builder.declare(2..4, 11, &["metro"]); - // ^^ 11 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..7); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - assert_eq!(enhancer.replacement(7), 5..6); // underground - assert_eq!(enhancer.replacement(8), 6..7); // train - assert_eq!(enhancer.replacement(9), 0..2); // good - assert_eq!(enhancer.replacement(10), 1..5); // NY - assert_eq!(enhancer.replacement(11), 2..5); // metro - } -} diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 1b186b8b8..ef22cafd3 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -1,29 +1,19 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::collections::HashSet; -use std::convert::TryFrom; use std::mem; use std::ops::Deref; use std::ops::Range; use std::rc::Rc; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::time::{Duration, Instant}; -use std::{cmp, fmt}; +use std::time::Instant; +use std::fmt; use compact_arena::{SmallArena, Idx32, mk_arena}; -use fst::{IntoStreamer, Streamer}; -use levenshtein_automata::DFA; use log::debug; -use meilisearch_tokenizer::{is_cjk, split_query_string}; use meilisearch_types::DocIndex; use sdset::{Set, SetBuf, exponential_search}; use slice_group_by::{GroupBy, GroupByMut}; -use crate::automaton::NGRAMS; -use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; -use crate::automaton::normalize_str; -use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; - use crate::criterion::{Criteria, Context, ContextMut}; use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::raw_document::RawDocument; @@ -32,7 +22,6 @@ use crate::{store, Document, DocumentId, MResult}; use crate::query_tree::{create_query_tree, traverse_query_tree}; use crate::query_tree::{Operation, QueryResult, QueryKind, QueryId, PostingsKey}; use crate::query_tree::Context as QTContext; -use crate::store::Postings; pub fn bucket_sort<'c, FI>( reader: &heed::RoTxn, @@ -87,8 +76,8 @@ where }; let (operation, mapping) = create_query_tree(reader, &context, query).unwrap(); - println!("{:?}", operation); - println!("{:?}", mapping); + debug!("operation:\n{:?}", operation); + debug!("mapping:\n{:?}", mapping); fn recurs_operation<'o>(map: &mut HashMap, operation: &'o Operation) { match operation { @@ -106,12 +95,278 @@ where println!("number of postings {:?}", queries.len()); let before = Instant::now(); + mk_arena!(arena); + let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries); + println!("matches cleaned in {:.02?}", before.elapsed()); + let before_bucket_sort = Instant::now(); + + let before_raw_documents_building = Instant::now(); + let mut raw_documents = Vec::new(); + for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { + let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref()); + raw_documents.push(raw_document); + } + debug!("creating {} candidates documents took {:.02?}", + raw_documents.len(), + before_raw_documents_building.elapsed(), + ); + + let before_criterion_loop = Instant::now(); + let proximity_count = AtomicUsize::new(0); + + let mut groups = vec![raw_documents.as_mut_slice()]; + + 'criteria: for criterion in criteria.as_ref() { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut documents_seen = 0; + + for mut group in tmp_groups { + let before_criterion_preparation = Instant::now(); + + let ctx = ContextMut { + reader, + postings_lists: &mut arena, + query_mapping: &mapping, + documents_fields_counts_store, + }; + + criterion.prepare(ctx, &mut group)?; + debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); + + let ctx = Context { + postings_lists: &arena, + query_mapping: &mapping, + }; + + let before_criterion_sort = Instant::now(); + group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b)); + debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); + + for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) { + debug!("{:?} produced a group of size {}", criterion.name(), group.len()); + + documents_seen += group.len(); + groups.push(group); + + // we have sort enough documents if the last document sorted is after + // the end of the requested range, we can continue to the next criterion + if documents_seen >= range.end { + continue 'criteria; + } + } + } + } + + debug!("criterion loop took {:.02?}", before_criterion_loop.elapsed()); + debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed)); + + let iter = raw_documents.into_iter().skip(range.start).take(range.len()); + let iter = iter.map(|rd| Document::from_raw(rd, &queries_kinds, &arena, searchable_attrs.as_ref())); + let documents = iter.collect(); + + debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed()); + + Ok(documents) +} + +pub fn bucket_sort_with_distinct<'c, FI, FD>( + reader: &heed::RoTxn, + query: &str, + range: Range, + filter: Option, + distinct: FD, + distinct_size: usize, + criteria: Criteria<'c>, + searchable_attrs: Option, + main_store: store::Main, + postings_lists_store: store::PostingsLists, + documents_fields_counts_store: store::DocumentsFieldsCounts, + synonyms_store: store::Synonyms, + _prefix_documents_cache_store: store::PrefixDocumentsCache, + prefix_postings_lists_cache_store: store::PrefixPostingsListsCache, +) -> MResult> +where + FI: Fn(DocumentId) -> bool, + FD: Fn(DocumentId) -> Option, +{ + let words_set = match unsafe { main_store.static_words_fst(reader)? } { + Some(words) => words, + None => return Ok(Vec::new()), + }; + + let context = QTContext { + words_set, + synonyms: synonyms_store, + postings_lists: postings_lists_store, + prefix_postings_lists: prefix_postings_lists_cache_store, + }; + + let (operation, mapping) = create_query_tree(reader, &context, query).unwrap(); + debug!("operation:\n{:?}", operation); + debug!("mapping:\n{:?}", mapping); + + fn recurs_operation<'o>(map: &mut HashMap, operation: &'o Operation) { + match operation { + Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)), + Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)), + Operation::Query(query) => { map.insert(query.id, &query.kind); }, + } + } + + let mut queries_kinds = HashMap::new(); + recurs_operation(&mut queries_kinds, &operation); + + let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); + println!("found {} documents", docids.len()); + println!("number of postings {:?}", queries.len()); + + let before = Instant::now(); + mk_arena!(arena); + let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries); + println!("matches cleaned in {:.02?}", before.elapsed()); + + let before_raw_documents_building = Instant::now(); + let mut raw_documents = Vec::new(); + for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { + let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref()); + raw_documents.push(raw_document); + } + debug!("creating {} candidates documents took {:.02?}", + raw_documents.len(), + before_raw_documents_building.elapsed(), + ); + + let mut groups = vec![raw_documents.as_mut_slice()]; + let mut key_cache = HashMap::new(); + + let mut filter_map = HashMap::new(); + // these two variables informs on the current distinct map and + // on the raw offset of the start of the group where the + // range.start bound is located according to the distinct function + let mut distinct_map = DistinctMap::new(distinct_size); + let mut distinct_raw_offset = 0; + + 'criteria: for criterion in criteria.as_ref() { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); + let mut documents_seen = 0; + + for mut group in tmp_groups { + // if this group does not overlap with the requested range, + // push it without sorting and splitting it + if documents_seen + group.len() < distinct_raw_offset { + documents_seen += group.len(); + groups.push(group); + continue; + } + + let ctx = ContextMut { + reader, + postings_lists: &mut arena, + query_mapping: &mapping, + documents_fields_counts_store, + }; + + let before_criterion_preparation = Instant::now(); + criterion.prepare(ctx, &mut group)?; + debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); + + let ctx = Context { + postings_lists: &arena, + query_mapping: &mapping, + }; + + let before_criterion_sort = Instant::now(); + group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b)); + debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); + + for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) { + // we must compute the real distinguished len of this sub-group + for document in group.iter() { + let filter_accepted = match &filter { + Some(filter) => { + let entry = filter_map.entry(document.id); + *entry.or_insert_with(|| (filter)(document.id)) + } + None => true, + }; + + if filter_accepted { + let entry = key_cache.entry(document.id); + let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new)); + + match key.clone() { + Some(key) => buf_distinct.register(key), + None => buf_distinct.register_without_key(), + }; + } + + // the requested range end is reached: stop computing distinct + if buf_distinct.len() >= range.end { + break; + } + } + + documents_seen += group.len(); + groups.push(group); + + // if this sub-group does not overlap with the requested range + // we must update the distinct map and its start index + if buf_distinct.len() < range.start { + buf_distinct.transfert_to_internal(); + distinct_raw_offset = documents_seen; + } + + // we have sort enough documents if the last document sorted is after + // the end of the requested range, we can continue to the next criterion + if buf_distinct.len() >= range.end { + continue 'criteria; + } + } + } + } + + // once we classified the documents related to the current + // automatons we save that as the next valid result + let mut seen = BufferedDistinctMap::new(&mut distinct_map); + + let mut documents = Vec::with_capacity(range.len()); + for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) { + let filter_accepted = match &filter { + Some(_) => filter_map.remove(&raw_document.id).unwrap(), + None => true, + }; + + if filter_accepted { + let key = key_cache.remove(&raw_document.id).unwrap(); + let distinct_accepted = match key { + Some(key) => seen.register(key), + None => seen.register_without_key(), + }; + + if distinct_accepted && seen.len() > range.start { + documents.push(Document::from_raw(raw_document, &queries_kinds, &arena, searchable_attrs.as_ref())); + if documents.len() == range.len() { + break; + } + } + } + } + + Ok(documents) +} + +fn cleanup_bare_matches<'tag, 'txn>( + arena: &mut SmallArena<'tag, PostingsListView<'txn>>, + docids: &Set, + queries: HashMap>>, +) -> Vec> +{ let docidslen = docids.len() as f32; let mut bare_matches = Vec::new(); - mk_arena!(arena); - for (PostingsKey{ query, input, distance, is_exact }, matches) in queries { + for (PostingsKey { query, input, distance, is_exact }, matches) in queries { let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches)); let pllen = postings_list_view.len() as f32; @@ -168,112 +423,11 @@ where } } - println!("matches cleaned in {:.02?}", before.elapsed()); - - let before_bucket_sort = Instant::now(); - let before_raw_documents_presort = Instant::now(); bare_matches.sort_unstable_by_key(|sm| sm.document_id); debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed()); - let before_raw_documents_building = Instant::now(); - let mut raw_documents = Vec::new(); - for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { - let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref()); - raw_documents.push(raw_document); - } - debug!("creating {} candidates documents took {:.02?}", - raw_documents.len(), - before_raw_documents_building.elapsed(), - ); - - let before_criterion_loop = Instant::now(); - let proximity_count = AtomicUsize::new(0); - - let mut groups = vec![raw_documents.as_mut_slice()]; - - 'criteria: for criterion in criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut documents_seen = 0; - - for mut group in tmp_groups { - let before_criterion_preparation = Instant::now(); - - let ctx = ContextMut { - reader, - postings_lists: &mut arena, - query_mapping: &mapping, - documents_fields_counts_store, - }; - - criterion.prepare(ctx, &mut group)?; - debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); - - let ctx = Context { - postings_lists: &arena, - query_mapping: &mapping, - }; - - let must_count = criterion.name() == "proximity"; - - let before_criterion_sort = Instant::now(); - group.sort_unstable_by(|a, b| { - if must_count { - proximity_count.fetch_add(1, Ordering::SeqCst); - } - - criterion.evaluate(&ctx, a, b) - }); - debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); - - for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) { - debug!("{:?} produced a group of size {}", criterion.name(), group.len()); - - documents_seen += group.len(); - groups.push(group); - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if documents_seen >= range.end { - continue 'criteria; - } - } - } - } - - debug!("criterion loop took {:.02?}", before_criterion_loop.elapsed()); - debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed)); - - let iter = raw_documents.into_iter().skip(range.start).take(range.len()); - let iter = iter.map(|rd| Document::from_raw(rd, &queries_kinds, &arena, searchable_attrs.as_ref())); - let documents = iter.collect(); - - debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed()); - - Ok(documents) -} - -pub fn bucket_sort_with_distinct<'c, FI, FD>( - reader: &heed::RoTxn, - query: &str, - range: Range, - filter: Option, - distinct: FD, - distinct_size: usize, - criteria: Criteria<'c>, - searchable_attrs: Option, - main_store: store::Main, - postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, - synonyms_store: store::Synonyms, - prefix_documents_cache_store: store::PrefixDocumentsCache, - prefix_postings_lists_cache_store: store::PrefixPostingsListsCache, -) -> MResult> -where - FI: Fn(DocumentId) -> bool, - FD: Fn(DocumentId) -> Option, -{ - unimplemented!() + bare_matches } pub struct BareMatch<'tag> { diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs index 044a3943f..971875e76 100644 --- a/meilisearch-core/src/criterion/mod.rs +++ b/meilisearch-core/src/criterion/mod.rs @@ -6,7 +6,6 @@ use compact_arena::SmallArena; use sdset::SetBuf; use slice_group_by::GroupBy; -use crate::automaton::QueryEnhancer; use crate::bucket_sort::{SimpleMatch, PostingsListView}; use crate::database::MainT; use crate::query_tree::QueryId; diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index 5467ad4df..c7d32fd12 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -9,7 +9,6 @@ use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; use meilisearch_tokenizer::split_query_string; use sdset::{Set, SetBuf, SetOperation}; -use slice_group_by::StrGroupBy; use crate::database::MainT; use crate::{store, DocumentId, DocIndex, MResult}; @@ -387,7 +386,7 @@ pub fn traverse_query_tree<'o, 'txn>( { let before = Instant::now(); - let Query { id, prefix, kind } = query; + let Query { prefix, kind, .. } = query; let docids: Cow> = match kind { QueryKind::Tolerant(word) => { if *prefix && word.len() <= 2 { diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index 1a27ce33f..2a401f84e 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -429,7 +429,7 @@ pub fn write_documents_addition_index( main_store: store::Main, postings_lists_store: store::PostingsLists, docs_words_store: store::DocsWords, - prefix_documents_cache_store: store::PrefixDocumentsCache, + _prefix_documents_cache_store: store::PrefixDocumentsCache, ranked_map: &RankedMap, number_of_inserted_documents: usize, indexer: RawIndexer,