From 9c03bb3428d574a15e3770ddc18d81da477dfb91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 9 Dec 2019 15:30:14 +0100 Subject: [PATCH] First probably working phrase query doc filtering --- Cargo.lock | 1 + meilisearch-core/Cargo.toml | 1 + meilisearch-core/src/bucket_sort.rs | 71 ++++++++++++++++++++++++++--- meilisearch-core/src/criterion2.rs | 50 +++++++++++++++----- 4 files changed, 106 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8034a4add..ad3f3494b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -952,6 +952,7 @@ dependencies = [ "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 62da7cfb8..3455f755d 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -17,6 +17,7 @@ env_logger = "0.7.0" fst = { version = "0.3.5", default-features = false } hashbrown = { version = "0.6.0", features = ["serde"] } heed = "0.6.1" +itertools = "0.8.2" # kill me please levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } log = "0.4.8" meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" } diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 303e94e50..ae0fdf63f 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -59,11 +59,9 @@ pub fn bucket_sort<'c>( let before_raw_documents_building = Instant::now(); let mut raw_documents = Vec::new(); for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { - raw_documents.push(RawDocument { - raw_matches, - processed_matches: Vec::new(), - processed_distances: Vec::new(), - }); + if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) { + raw_documents.push(raw_document); + } } debug!("creating {} candidates documents took {:.02?}", raw_documents.len(), @@ -149,6 +147,57 @@ pub struct RawDocument<'a, 'tag> { pub processed_distances: Vec>, } +impl<'a, 'tag> RawDocument<'a, 'tag> { + fn new<'txn>( + raw_matches: &'a mut [BareMatch<'tag>], + automatons: &[QueryWordAutomaton], + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Option> + { + raw_matches.sort_unstable_by_key(|m| m.query_index); + + // debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches); + + let mut previous_word = None; + for i in 0..raw_matches.len() { + let a = &raw_matches[i]; + let auta = &automatons[a.query_index as usize]; + + match auta.phrase_query { + Some((0, _)) => { + previous_word = Some(a.query_index); + let b = raw_matches.get(i + 1)?; + if a.query_index + 1 != b.query_index { + return None; + } + + let pla = &postings_lists[a.postings_list]; + let plb = &postings_lists[b.postings_list]; + + let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { + a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) + }); + + if !iter.any(|eb| eb.is_both()) { return None } + }, + Some((1, _)) => { + if previous_word.take() != Some(a.query_index - 1) { + return None; + } + }, + Some((_, _)) => unreachable!(), + None => (), + } + } + + Some(RawDocument { + raw_matches, + processed_matches: Vec::new(), + processed_distances: Vec::new(), + }) + } +} + pub struct BareMatch<'tag> { pub document_id: DocumentId, pub query_index: u16, @@ -186,6 +235,15 @@ pub struct PostingsListView<'txn> { len: usize, } +impl fmt::Debug for PostingsListView<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PostingsListView") + .field("input", &std::str::from_utf8(&self.input).unwrap()) + .field("postings_list", &self.as_ref()) + .finish() + } +} + impl<'txn> PostingsListView<'txn> { pub fn new(input: Rc<[u8]>, postings_list: Rc>>) -> PostingsListView<'txn> { let len = postings_list.len(); @@ -275,6 +333,7 @@ fn fetch_matches<'txn, 'tag>( let input = Rc::from(input); let postings_list = Rc::new(postings_list); let postings_list_view = PostingsListView::new(input, postings_list); + let mut offset = 0; for group in postings_list_view.linear_group_by_key(|di| di.document_id) { @@ -442,7 +501,7 @@ fn construct_automatons2( } } - if false && n == 1 { + if true && n == 1 { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { let mut left_automaton = QueryWordAutomaton::exact(left); left_automaton.phrase_query = Some((0, 2)); diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index 4c40b9969..3bfbe76ea 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -43,16 +43,42 @@ pub trait Criterion { } } -fn prepare_query_distances( - documents: &mut [RawDocument], +fn prepare_query_distances<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], + postings_lists: &PostingsListsArena<'tag, 'txn>, ) { for document in documents { if !document.processed_distances.is_empty() { continue } + // debug!("{:?}", document.raw_matches[0].document_id); + let mut processed = Vec::new(); - for m in document.raw_matches.iter() { + let mut raw_matches = document.raw_matches.iter().peekable(); + while let Some(m) = raw_matches.next() { + + // let automaton = &automatons[m.query_index as usize]; + + // debug!("{:?} {:?}", m, automaton); + // debug!("{:?}", &postings_lists[m.postings_list]); + + // match automaton.phrase_query { + // Some((0, len)) => { + // match raw_matches.peek() { + // Some(BareMatch { query_index, .. }) => { + // if *query_index != m.query_index + 1 { + // raw_matches.next(); + // continue + // } + // }, + // None => continue, + // } + // }, + // Some((_, _)) => continue, + // None => (), + // } + // FIXME we really need to take splitted words into account // those must be seen at the same level as the non-splitteds // if automatons[m.query_index as usize].phrase_query.is_some() { @@ -73,6 +99,8 @@ fn prepare_query_distances( } } + // debug!("{:?}", processed); + document.processed_distances = processed; } } @@ -82,14 +110,14 @@ pub struct Typo; impl Criterion for Typo { fn name(&self) -> &str { "typo" } - fn prepare( + fn prepare<'a, 'tag, 'txn>( &self, - documents: &mut [RawDocument], - postings_lists: &mut PostingsListsArena, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) { - prepare_query_distances(documents, query_enhancer, automatons); + prepare_query_distances(documents, query_enhancer, automatons, postings_lists); } fn evaluate( @@ -140,14 +168,14 @@ pub struct Words; impl Criterion for Words { fn name(&self) -> &str { "words" } - fn prepare( + fn prepare<'a, 'tag, 'txn>( &self, - documents: &mut [RawDocument], - postings_lists: &mut PostingsListsArena, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) { - prepare_query_distances(documents, query_enhancer, automatons); + prepare_query_distances(documents, query_enhancer, automatons, postings_lists); } fn evaluate(