From 8acbdcbbadd3f8ce4391baa2f3d19e8b6009bc03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 13 Jan 2020 14:36:06 +0100 Subject: [PATCH] wip: Make the new query tree work with the criteria --- meilisearch-core/src/bucket_sort.rs | 282 ++++-------------- meilisearch-core/src/criterion/attribute.rs | 6 +- meilisearch-core/src/criterion/exact.rs | 4 +- meilisearch-core/src/criterion/mod.rs | 41 +-- meilisearch-core/src/criterion/proximity.rs | 6 +- meilisearch-core/src/criterion/typo.rs | 6 +- meilisearch-core/src/criterion/words.rs | 6 +- .../src/criterion/words_position.rs | 6 +- meilisearch-core/src/lib.rs | 32 +- meilisearch-core/src/raw_document.rs | 66 +--- 10 files changed, 110 insertions(+), 345 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index b8049987c..37eba6b57 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::collections::HashSet; +use std::convert::TryFrom; use std::mem; use std::ops::Deref; use std::ops::Range; @@ -10,7 +11,6 @@ use std::{cmp, fmt}; use compact_arena::{SmallArena, Idx32, mk_arena}; use fst::{IntoStreamer, Streamer}; -use hashbrown::HashMap; use levenshtein_automata::DFA; use log::debug; use meilisearch_tokenizer::{is_cjk, split_query_string}; @@ -49,36 +49,6 @@ pub fn bucket_sort<'c, FI>( where FI: Fn(DocumentId) -> bool, { - let words_set = match unsafe { main_store.static_words_fst(reader)? } { - Some(words) => words, - None => return Ok(Vec::new()), - }; - - let context = QTContext { - words_set, - synonyms: synonyms_store, - postings_lists: postings_lists_store, - prefix_postings_lists: prefix_postings_lists_cache_store, - }; - - let (operation, mapping) = create_query_tree(reader, &context, query).unwrap(); - println!("{:?}", operation); - println!("{:?}", mapping); - - let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); - println!("found {} documents", docids.len()); - println!("number of postings {:?}", queries.len()); - - let before = Instant::now(); - for ((query, input), matches) in queries { - // TODO optimize the filter by skipping docids that have already been seen - for matches in matches.linear_group_by_key(|m| m.document_id).filter(|ms| docids.contains(&ms[0].document_id)) { - // ... - } - } - - println!("matches cleaned in {:.02?}", before.elapsed()); - // We delegate the filter work to the distinct query builder, // specifying a distinct rule that has no effect. if filter.is_some() { @@ -102,47 +72,58 @@ where ); } - let before_bucket_sort = Instant::now(); + let words_set = match unsafe { main_store.static_words_fst(reader)? } { + Some(words) => words, + None => return Ok(Vec::new()), + }; - let (mut automatons, mut query_enhancer) = - construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; + let context = QTContext { + words_set, + synonyms: synonyms_store, + postings_lists: postings_lists_store, + prefix_postings_lists: prefix_postings_lists_cache_store, + }; - if let [automaton] = &automatons[..] 
{ - if automaton.is_prefix && automaton.query.len() <= 4 { - let mut prefix = [0; 4]; - let len = cmp::min(4, automaton.query.len()); - prefix[..len].copy_from_slice(&automaton.query.as_bytes()[..len]); + let (operation, mapping) = create_query_tree(reader, &context, query).unwrap(); + println!("{:?}", operation); + println!("{:?}", mapping); - let mut documents = Vec::new(); - let iter = prefix_documents_cache_store.prefix_documents(reader, prefix)?; - for result in iter.skip(range.start).take(range.len()) { - let (docid, highlights) = result?; - documents.push(Document::from_highlights(docid, &highlights)); + let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); + println!("found {} documents", docids.len()); + println!("number of postings {:?}", queries.len()); + + let before = Instant::now(); + + let mut bare_matches = Vec::new(); + mk_arena!(arena); + for ((query, input), matches) in queries { + + let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches)); + // TODO optimize the filter by skipping docids that have already been seen + let mut offset = 0; + for matches in postings_list_view.linear_group_by_key(|m| m.document_id) { + let document_id = matches[0].document_id; + if docids.contains(&document_id) { + let range = postings_list_view.range(offset, matches.len()); + let posting_list_index = arena.add(range); + let bare_match = BareMatch { + document_id, + query_index: u16::try_from(query.id).unwrap(), + distance: 0, + is_exact: true, // TODO where can I find this info? + postings_list: posting_list_index, + }; + + bare_matches.push(bare_match); } - if !documents.is_empty() { - return Ok(documents); - } + offset += matches.len(); } } - debug!("{:?}", query_enhancer); + println!("matches cleaned in {:.02?}", before.elapsed()); - let before_postings_lists_fetching = Instant::now(); - mk_arena!(arena); - let mut bare_matches = - fetch_matches( - reader, - &automatons, - &mut arena, - main_store, - postings_lists_store, - prefix_postings_lists_cache_store, - )?; - debug!("bare matches ({}) retrieved in {:.02?}", - bare_matches.len(), - before_postings_lists_fetching.elapsed(), - ); + let before_bucket_sort = Instant::now(); let before_raw_documents_presort = Instant::now(); bare_matches.sort_unstable_by_key(|sm| sm.document_id); @@ -152,14 +133,11 @@ where let mut prefiltered_documents = 0; let mut raw_documents = Vec::new(); for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { - prefiltered_documents += 1; - if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) { - raw_documents.push(raw_document); - } + let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref()); + raw_documents.push(raw_document); } - debug!("creating {} (original {}) candidates documents took {:.02?}", + debug!("creating {} candidates documents took {:.02?}", raw_documents.len(), - prefiltered_documents, before_raw_documents_building.elapsed(), ); @@ -178,8 +156,7 @@ where let ctx = ContextMut { reader, postings_lists: &mut arena, - query_enhancer: &mut query_enhancer, - automatons: &mut automatons, + query_mapping: &mapping, documents_fields_counts_store, }; @@ -188,8 +165,7 @@ where let ctx = Context { postings_lists: &arena, - query_enhancer: &query_enhancer, - automatons: &automatons, + query_mapping: &mapping, }; let must_count = criterion.name() == "proximity"; @@ -223,7 +199,7 @@ where debug!("proximity evaluation called {} 
times", proximity_count.load(Ordering::Relaxed)); let iter = raw_documents.into_iter().skip(range.start).take(range.len()); - let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena, searchable_attrs.as_ref())); + let iter = iter.map(|rd| Document::from_raw(rd, &arena, searchable_attrs.as_ref())); let documents = iter.collect(); debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed()); @@ -251,163 +227,7 @@ where FI: Fn(DocumentId) -> bool, FD: Fn(DocumentId) -> Option, { - let (mut automatons, mut query_enhancer) = - construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; - - let before_postings_lists_fetching = Instant::now(); - mk_arena!(arena); - let mut bare_matches = fetch_matches( - reader, - &automatons, - &mut arena, - main_store, - postings_lists_store, - prefix_postings_lists_cache_store, - )?; - debug!("bare matches ({}) retrieved in {:.02?}", - bare_matches.len(), - before_postings_lists_fetching.elapsed(), - ); - - let before_raw_documents_presort = Instant::now(); - bare_matches.sort_unstable_by_key(|sm| sm.document_id); - debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed()); - - let before_raw_documents_building = Instant::now(); - let mut prefiltered_documents = 0; - let mut raw_documents = Vec::new(); - for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { - prefiltered_documents += 1; - if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) { - raw_documents.push(raw_document); - } - } - debug!("creating {} (original {}) candidates documents took {:.02?}", - raw_documents.len(), - prefiltered_documents, - before_raw_documents_building.elapsed(), - ); - - let mut groups = vec![raw_documents.as_mut_slice()]; - let mut key_cache = HashMap::new(); - - let mut filter_map = HashMap::new(); - // these two variables informs on the current distinct map and - // on the raw offset of the start of the group where the - // range.start bound is located according to the distinct function - let mut distinct_map = DistinctMap::new(distinct_size); - let mut distinct_raw_offset = 0; - - 'criteria: for criterion in criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); - let mut documents_seen = 0; - - for mut group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < distinct_raw_offset { - documents_seen += group.len(); - groups.push(group); - continue; - } - - let ctx = ContextMut { - reader, - postings_lists: &mut arena, - query_enhancer: &mut query_enhancer, - automatons: &mut automatons, - documents_fields_counts_store, - }; - - let before_criterion_preparation = Instant::now(); - criterion.prepare(ctx, &mut group)?; - debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); - - let ctx = Context { - postings_lists: &arena, - query_enhancer: &query_enhancer, - automatons: &automatons, - }; - - let before_criterion_sort = Instant::now(); - group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b)); - debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); - - for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) { - // we must compute the real distinguished len of this sub-group - for document in group.iter() { - let 
filter_accepted = match &filter { - Some(filter) => { - let entry = filter_map.entry(document.id); - *entry.or_insert_with(|| (filter)(document.id)) - } - None => true, - }; - - if filter_accepted { - let entry = key_cache.entry(document.id); - let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new)); - - match key.clone() { - Some(key) => buf_distinct.register(key), - None => buf_distinct.register_without_key(), - }; - } - - // the requested range end is reached: stop computing distinct - if buf_distinct.len() >= range.end { - break; - } - } - - documents_seen += group.len(); - groups.push(group); - - // if this sub-group does not overlap with the requested range - // we must update the distinct map and its start index - if buf_distinct.len() < range.start { - buf_distinct.transfert_to_internal(); - distinct_raw_offset = documents_seen; - } - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if buf_distinct.len() >= range.end { - continue 'criteria; - } - } - } - } - - // once we classified the documents related to the current - // automatons we save that as the next valid result - let mut seen = BufferedDistinctMap::new(&mut distinct_map); - - let mut documents = Vec::with_capacity(range.len()); - for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) { - let filter_accepted = match &filter { - Some(_) => filter_map.remove(&raw_document.id).unwrap(), - None => true, - }; - - if filter_accepted { - let key = key_cache.remove(&raw_document.id).unwrap(); - let distinct_accepted = match key { - Some(key) => seen.register(key), - None => seen.register_without_key(), - }; - - if distinct_accepted && seen.len() > range.start { - documents.push(Document::from_raw(raw_document, &automatons, &arena, searchable_attrs.as_ref())); - if documents.len() == range.len() { - break; - } - } - } - } - - Ok(documents) + unimplemented!() } pub struct BareMatch<'tag> { diff --git a/meilisearch-core/src/criterion/attribute.rs b/meilisearch-core/src/criterion/attribute.rs index cf9efb41b..bf28330d2 100644 --- a/meilisearch-core/src/criterion/attribute.rs +++ b/meilisearch-core/src/criterion/attribute.rs @@ -9,13 +9,13 @@ pub struct Attribute; impl Criterion for Attribute { fn name(&self) -> &str { "attribute" } - fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( &self, - ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { - prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping); Ok(()) } diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index 5425d2cc9..93729ee58 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -11,9 +11,9 @@ pub struct Exact; impl Criterion for Exact { fn name(&self) -> &str { "exact" } - fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( &self, - ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs index 8d6c8b1f6..13ca1c58c 100644 --- a/meilisearch-core/src/criterion/mod.rs +++ b/meilisearch-core/src/criterion/mod.rs @@ -1,13 +1,16 
@@ use std::cmp::{self, Ordering}; +use std::collections::HashMap; +use std::ops::Range; use compact_arena::SmallArena; use sdset::SetBuf; use slice_group_by::GroupBy; -use crate::{store, RawDocument, MResult}; use crate::automaton::QueryEnhancer; use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; use crate::database::MainT; +use crate::query_tree::QueryId; +use crate::{store, RawDocument, MResult}; mod typo; mod words; @@ -30,26 +33,26 @@ pub use self::sort_by_attr::SortByAttr; pub trait Criterion { fn name(&self) -> &str; - fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( &self, - _ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + _ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, _documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { Ok(()) } - fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>( + fn evaluate<'p, 'tag, 'txn, 'q, 'r>( &self, - ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, + ctx: &Context<'p, 'tag, 'txn, 'q>, lhs: &RawDocument<'r, 'tag>, rhs: &RawDocument<'r, 'tag>, ) -> Ordering; #[inline] - fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>( + fn eq<'p, 'tag, 'txn, 'q, 'r>( &self, - ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, + ctx: &Context<'p, 'tag, 'txn, 'q>, lhs: &RawDocument<'r, 'tag>, rhs: &RawDocument<'r, 'tag>, ) -> bool @@ -58,18 +61,16 @@ pub trait Criterion { } } -pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a> { +pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q> { pub reader: &'h heed::RoTxn, pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>, - pub query_enhancer: &'q mut QueryEnhancer, - pub automatons: &'a mut [QueryWordAutomaton], + pub query_mapping: &'q HashMap>, pub documents_fields_counts_store: store::DocumentsFieldsCounts, } -pub struct Context<'p, 'tag, 'txn, 'q, 'a> { +pub struct Context<'p, 'tag, 'txn, 'q> { pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>, - pub query_enhancer: &'q QueryEnhancer, - pub automatons: &'a [QueryWordAutomaton], + pub query_mapping: &'q HashMap>, } #[derive(Default)] @@ -138,7 +139,7 @@ impl<'a> AsRef<[Box]> for Criteria<'a> { fn prepare_query_distances<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], - query_enhancer: &QueryEnhancer, + query_mapping: &HashMap>, postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, ) { for document in documents { @@ -148,7 +149,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>( for m in document.bare_matches.iter() { if postings_lists[m.postings_list].is_empty() { continue } - let range = query_enhancer.replacement(m.query_index as u32); + let range = query_mapping[&(m.query_index as usize)].clone(); let new_len = cmp::max(range.end as usize, processed.len()); processed.resize(new_len, None); @@ -169,7 +170,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>( fn prepare_bare_matches<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - query_enhancer: &QueryEnhancer, + query_mapping: &HashMap>, ) { for document in documents { if !document.processed_matches.is_empty() { continue } @@ -190,14 +191,14 @@ fn prepare_bare_matches<'a, 'tag, 'txn>( } } - let processed = multiword_rewrite_matches(&mut processed, query_enhancer); + let processed = multiword_rewrite_matches(&mut processed, query_mapping); document.processed_matches = processed.into_vec(); } } fn multiword_rewrite_matches( matches: &mut [SimpleMatch], - query_enhancer: &QueryEnhancer, + query_mapping: &HashMap>, ) -> SetBuf { matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); @@ -218,7 +219,7 
@@ fn multiword_rewrite_matches( // find the biggest padding let mut biggest = 0; for match_ in same_word_index { - let mut replacement = query_enhancer.replacement(match_.query_index as u32); + let mut replacement = query_mapping[&(match_.query_index as usize)].clone(); let replacement_len = replacement.len(); let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); @@ -240,7 +241,7 @@ fn multiword_rewrite_matches( let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; for nmatch_ in next_group { - let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); + let mut rep = query_mapping[&(nmatch_.query_index as usize)].clone(); let query_index = rep.next().unwrap() as u16; if query_index == padmatch.query_index { if !found { diff --git a/meilisearch-core/src/criterion/proximity.rs b/meilisearch-core/src/criterion/proximity.rs index 2f3698bae..c6a606d56 100644 --- a/meilisearch-core/src/criterion/proximity.rs +++ b/meilisearch-core/src/criterion/proximity.rs @@ -11,13 +11,13 @@ pub struct Proximity; impl Criterion for Proximity { fn name(&self) -> &str { "proximity" } - fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( &self, - ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { - prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping); Ok(()) } diff --git a/meilisearch-core/src/criterion/typo.rs b/meilisearch-core/src/criterion/typo.rs index 2b43c50a9..ca3f6212e 100644 --- a/meilisearch-core/src/criterion/typo.rs +++ b/meilisearch-core/src/criterion/typo.rs @@ -7,13 +7,13 @@ pub struct Typo; impl Criterion for Typo { fn name(&self) -> &str { "typo" } - fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( &self, - ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { - prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists); + prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists); Ok(()) } diff --git a/meilisearch-core/src/criterion/words.rs b/meilisearch-core/src/criterion/words.rs index cfe7c9664..1a171ee1e 100644 --- a/meilisearch-core/src/criterion/words.rs +++ b/meilisearch-core/src/criterion/words.rs @@ -7,13 +7,13 @@ pub struct Words; impl Criterion for Words { fn name(&self) -> &str { "words" } - fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( &self, - ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { - prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists); + prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists); Ok(()) } diff --git a/meilisearch-core/src/criterion/words_position.rs b/meilisearch-core/src/criterion/words_position.rs index 387f0d635..037e14de6 100644 --- a/meilisearch-core/src/criterion/words_position.rs +++ b/meilisearch-core/src/criterion/words_position.rs @@ -9,13 +9,13 @@ pub struct WordsPosition; impl Criterion for WordsPosition { fn name(&self) -> &str { "words position" } - fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( &self, - ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, documents: &mut 
[RawDocument<'r, 'tag>], ) -> MResult<()> { - prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping); Ok(()) } diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index fa16ed77a..6c0ac5be8 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -97,17 +97,19 @@ impl Document { #[cfg(not(test))] pub fn from_raw<'a, 'tag, 'txn>( raw_document: RawDocument<'a, 'tag>, - automatons: &[QueryWordAutomaton], + // automatons: &[QueryWordAutomaton], arena: &SmallArena<'tag, PostingsListView<'txn>>, searchable_attrs: Option<&ReorderedAttrs>, ) -> Document { - let highlights = highlights_from_raw_document( - &raw_document, - automatons, - arena, - searchable_attrs, - ); + // let highlights = highlights_from_raw_document( + // &raw_document, + // automatons, + // arena, + // searchable_attrs, + // ); + + let highlights = Vec::new(); Document { id: raw_document.id, highlights } } @@ -115,19 +117,21 @@ impl Document { #[cfg(test)] pub fn from_raw<'a, 'tag, 'txn>( raw_document: RawDocument<'a, 'tag>, - automatons: &[QueryWordAutomaton], + // automatons: &[QueryWordAutomaton], arena: &SmallArena<'tag, PostingsListView<'txn>>, searchable_attrs: Option<&ReorderedAttrs>, ) -> Document { use crate::bucket_sort::SimpleMatch; - let highlights = highlights_from_raw_document( - &raw_document, - automatons, - arena, - searchable_attrs, - ); + // let highlights = highlights_from_raw_document( + // &raw_document, + // automatons, + // arena, + // searchable_attrs, + // ); + + let highlights = Vec::new(); let mut matches = Vec::new(); for sm in raw_document.processed_matches { diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index f047de8e8..56fde3e7b 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -1,5 +1,4 @@ use compact_arena::SmallArena; -use itertools::EitherOrBoth; use sdset::SetBuf; use crate::DocIndex; use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView}; @@ -19,10 +18,9 @@ pub struct RawDocument<'a, 'tag> { impl<'a, 'tag> RawDocument<'a, 'tag> { pub fn new<'txn>( bare_matches: &'a mut [BareMatch<'tag>], - automatons: &[QueryWordAutomaton], postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, searchable_attrs: Option<&ReorderedAttrs>, - ) -> Option> + ) -> RawDocument<'a, 'tag> { if let Some(reordered_attrs) = searchable_attrs { for bm in bare_matches.iter() { @@ -42,70 +40,12 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { bare_matches.sort_unstable_by_key(|m| m.query_index); - let mut previous_word = None; - for i in 0..bare_matches.len() { - let a = &bare_matches[i]; - let auta = &automatons[a.query_index as usize]; - - match auta.phrase_query { - Some((0, _)) => { - let b = match bare_matches.get(i + 1) { - Some(b) => b, - None => { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - continue; - } - }; - - if a.query_index + 1 != b.query_index { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - continue - } - - let pla = &postings_lists[a.postings_list]; - let plb = &postings_lists[b.postings_list]; - - let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { - a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) - }); - - let mut newa = Vec::new(); - let mut newb = Vec::new(); - - for eb in iter { - if let EitherOrBoth::Both(a, b) = eb { - newa.push(*a); - newb.push(*b); - } - } - - 
if !newa.is_empty() { - previous_word = Some(a.query_index); - } - - postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); - postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); - }, - Some((1, _)) => { - if previous_word.take() != Some(a.query_index - 1) { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - } - }, - Some((_, _)) => unreachable!(), - None => (), - } - } - - if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { - return None - } - - Some(RawDocument { + RawDocument { id: bare_matches[0].document_id, bare_matches, processed_matches: Vec::new(), processed_distances: Vec::new(), contains_one_word_field: false, - }) + } } }
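
Note: the core of the new bucket_sort above is the loop that turns the result of traverse_query_tree into BareMatch entries. Below is a minimal, self-contained sketch of that idea: postings coming back for one query are grouped by document id, and only documents present in the final docids set keep their matches. All types here (DocumentId, Posting, BareMatch) and the offset/len pair standing in for the arena range are simplified stand-ins for illustration, not the crate's real definitions.

    use std::collections::HashSet;

    type DocumentId = u64;

    #[derive(Debug, Clone, Copy)]
    struct Posting {
        document_id: DocumentId,
        word_index: u16,
    }

    #[derive(Debug)]
    struct BareMatch {
        document_id: DocumentId,
        query_index: u16,
        // offset/len into the postings list, standing in for the arena range
        postings_offset: usize,
        postings_len: usize,
    }

    fn bare_matches_for_query(
        query_id: usize,
        postings: &[Posting],          // assumed sorted by document_id
        docids: &HashSet<DocumentId>,  // documents selected by the query tree
    ) -> Vec<BareMatch> {
        let mut bare_matches = Vec::new();
        let mut offset = 0;
        while offset < postings.len() {
            let document_id = postings[offset].document_id;
            // find the end of the group sharing this document id
            let mut end = offset + 1;
            while end < postings.len() && postings[end].document_id == document_id {
                end += 1;
            }
            if docids.contains(&document_id) {
                bare_matches.push(BareMatch {
                    document_id,
                    query_index: query_id as u16,
                    postings_offset: offset,
                    postings_len: end - offset,
                });
            }
            offset = end;
        }
        bare_matches
    }

    fn main() {
        let postings = vec![
            Posting { document_id: 1, word_index: 0 },
            Posting { document_id: 1, word_index: 7 },
            Posting { document_id: 2, word_index: 3 },
        ];
        let docids: HashSet<_> = [1].into_iter().collect();
        println!("{:?}", bare_matches_for_query(0, &postings, &docids));
    }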
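
The other half of the change is that the criteria now resolve a match's query index through the mapping returned by create_query_tree instead of QueryEnhancer::replacement. The sketch below shows that pattern in isolation, along the lines of prepare_query_distances: the mapping goes from a query id to the range of positions it covers, and the per-document distance vector is sized from those ranges, keeping the smallest distance per position. The map and tuple types are simplified assumptions for illustration only.

    use std::collections::HashMap;
    use std::ops::Range;

    fn query_distances(
        matches: &[(usize, u8)], // (query id, typo distance), simplified
        query_mapping: &HashMap<usize, Range<usize>>,
    ) -> Vec<Option<u8>> {
        let mut processed: Vec<Option<u8>> = Vec::new();
        for (query_id, distance) in matches {
            let range = query_mapping[query_id].clone();
            if range.end > processed.len() {
                processed.resize(range.end, None);
            }
            for position in range {
                let entry = &mut processed[position];
                // keep the smallest distance seen for this position
                *entry = match *entry {
                    Some(d) => Some(d.min(*distance)),
                    None => Some(*distance),
                };
            }
        }
        processed
    }

    fn main() {
        let mapping: HashMap<usize, Range<usize>> =
            [(0, 0..1), (1, 1..3)].into_iter().collect();
        let matches: [(usize, u8); 2] = [(0, 0), (1, 1)];
        println!("{:?}", query_distances(&matches, &mapping));
    }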