From 7d9cf8d71330d129de997a5e776a221047e9281c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 23 Oct 2019 12:06:21 +0200
Subject: [PATCH] Clean up the fetch algorithm

---
 meilidb-core/src/automaton/mod.rs |  51 +++++---
 meilidb-core/src/query_builder.rs | 207 +++++++++++++-----------------
 2 files changed, 124 insertions(+), 134 deletions(-)

diff --git a/meilidb-core/src/automaton/mod.rs b/meilidb-core/src/automaton/mod.rs
index 4b5fa0604..d9c7fcee1 100644
--- a/meilidb-core/src/automaton/mod.rs
+++ b/meilidb-core/src/automaton/mod.rs
@@ -29,8 +29,13 @@ impl AutomatonProducer {
         postings_list_store: store::PostingsLists,
         synonyms_store: store::Synonyms,
     ) -> MResult<(AutomatonProducer, QueryEnhancer)> {
-        let (automatons, query_enhancer) =
-            generate_automatons(reader, query, main_store, postings_list_store, synonyms_store)?;
+        let (automatons, query_enhancer) = generate_automatons(
+            reader,
+            query,
+            main_store,
+            postings_list_store,
+            synonyms_store,
+        )?;
 
         Ok((AutomatonProducer { automatons }, query_enhancer))
     }
@@ -41,9 +46,25 @@ impl AutomatonProducer {
 }
 
 #[derive(Debug)]
-pub enum AutomatonGroup {
-    Normal(Vec<Automaton>),
-    PhraseQuery(Vec<Automaton>),
+pub struct AutomatonGroup {
+    pub is_phrase_query: bool,
+    pub automatons: Vec<Automaton>,
+}
+
+impl AutomatonGroup {
+    fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
+        AutomatonGroup {
+            is_phrase_query: false,
+            automatons,
+        }
+    }
+
+    fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
+        AutomatonGroup {
+            is_phrase_query: true,
+            automatons,
+        }
+    }
 }
 
 #[derive(Debug)]
@@ -143,8 +164,7 @@ fn generate_automatons(
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
     synonym_store: store::Synonyms,
-) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)>
-{
+) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
     let synonyms = match main_store.synonyms_fst(reader)? {
@@ -173,7 +193,7 @@ fn generate_automatons(
         original_automatons.push(automaton);
     }
 
-    automatons.push(AutomatonGroup::Normal(original_automatons));
+    automatons.push(AutomatonGroup::normal(original_automatons));
 
     for n in 1..=NGRAMS {
         let mut ngrams = query_words.windows(n).enumerate().peekable();
@@ -225,14 +245,16 @@ fn generate_automatons(
                                 Automaton::non_exact(automaton_index, n, synonym)
                             };
                             automaton_index += 1;
-                            automatons.push(AutomatonGroup::Normal(vec![automaton]));
+                            automatons.push(AutomatonGroup::normal(vec![automaton]));
                         }
                     }
                 }
             }
 
             if n == 1 {
-                if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
+                if let Some((left, right)) =
+                    split_best_frequency(reader, &normalized, postings_lists_store)?
+                {
                     let a = Automaton::exact(automaton_index, 1, left);
                     enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
                     automaton_index += 1;
@@ -241,7 +263,7 @@ fn generate_automatons(
                     enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
                     automaton_index += 1;
 
-                    automatons.push(AutomatonGroup::PhraseQuery(vec![a, b]));
+                    automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
                 }
             } else {
                 // automaton of concatenation of query words
@@ -253,7 +275,7 @@ fn generate_automatons(
                 let automaton = Automaton::exact(automaton_index, n, &normalized);
                 automaton_index += 1;
 
-                automatons.push(AutomatonGroup::Normal(vec![automaton]));
+                automatons.push(AutomatonGroup::normal(vec![automaton]));
             }
         }
     }
@@ -261,10 +283,7 @@ fn generate_automatons(
     // order automatons, the most important first,
     // we keep the original automatons at the front.
     automatons[1..].sort_by_key(|group| {
-        let a = match group {
-            AutomatonGroup::Normal(group) => group.first().unwrap(),
-            AutomatonGroup::PhraseQuery(group) => group.first().unwrap(),
-        };
+        let a = group.automatons.first().unwrap();
         (Reverse(a.is_exact), a.ngram)
     });
 
diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 21b28e663..ea97f75b8 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -149,128 +149,92 @@ fn fetch_raw_documents(
     let mut highlights = Vec::new();
 
     for group in automatons_groups {
-        match group {
-            AutomatonGroup::Normal(automatons) => {
-                for automaton in automatons {
-                    let Automaton { index, is_exact, query_len, .. } = automaton;
-                    let dfa = automaton.dfa();
+        let AutomatonGroup {
+            is_phrase_query,
+            automatons,
+        } = group;
+        let phrase_query_len = automatons.len();
 
-                    let words = match main_store.words_fst(reader)? {
-                        Some(words) => words,
-                        None => return Ok(Vec::new()),
-                    };
+        let mut tmp_matches = Vec::new();
+        for (id, automaton) in automatons.into_iter().enumerate() {
+            let Automaton {
+                index,
+                is_exact,
+                query_len,
+                ..
+            } = automaton;
+            let dfa = automaton.dfa();
 
-                    let mut stream = words.search(&dfa).into_stream();
-                    while let Some(input) = stream.next() {
-                        let distance = dfa.eval(input).to_u8();
-                        let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
+            let words = match main_store.words_fst(reader)? {
+                Some(words) => words,
+                None => return Ok(Vec::new()),
+            };
 
-                        let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
-                            Some(doc_indexes) => doc_indexes,
-                            None => continue,
+            let mut stream = words.search(&dfa).into_stream();
+            while let Some(input) = stream.next() {
+                let distance = dfa.eval(input).to_u8();
+                let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
+
+                let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
+                    Some(doc_indexes) => doc_indexes,
+                    None => continue,
+                };
+
+                tmp_matches.reserve(doc_indexes.len());
+
+                for di in doc_indexes.as_ref() {
+                    let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
+                    if let Some(attribute) = attribute {
+                        let match_ = TmpMatch {
+                            query_index: *index as u32,
+                            distance,
+                            attribute,
+                            word_index: di.word_index,
+                            is_exact,
                         };
 
-                        matches.reserve(doc_indexes.len());
-                        highlights.reserve(doc_indexes.len());
-
-                        for di in doc_indexes.as_ref() {
-                            let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
-                            if let Some(attribute) = attribute {
-                                let match_ = TmpMatch {
-                                    query_index: *index as u32,
-                                    distance,
-                                    attribute,
-                                    word_index: di.word_index,
-                                    is_exact,
-                                };
-
-                                let highlight = Highlight {
-                                    attribute: di.attribute,
-                                    char_index: di.char_index,
-                                    char_length: di.char_length,
-                                };
-
-                                matches.push((di.document_id, match_));
-                                highlights.push((di.document_id, highlight));
-                            }
-                        }
-                    }
-                }
-            },
-            AutomatonGroup::PhraseQuery(automatons) => {
-                let mut tmp_matches = Vec::new();
-                let phrase_query_len = automatons.len();
-
-                for (id, automaton) in automatons.into_iter().enumerate() {
-                    let Automaton { index, is_exact, query_len, .. } = automaton;
-                    let dfa = automaton.dfa();
-
-                    let words = match main_store.words_fst(reader)? {
-                        Some(words) => words,
-                        None => return Ok(Vec::new()),
-                    };
-
-                    let mut stream = words.search(&dfa).into_stream();
-                    while let Some(input) = stream.next() {
-                        let distance = dfa.eval(input).to_u8();
-                        let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
-
-                        let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
-                            Some(doc_indexes) => doc_indexes,
-                            None => continue,
+                        let highlight = Highlight {
+                            attribute: di.attribute,
+                            char_index: di.char_index,
+                            char_length: di.char_length,
                         };
 
-                        tmp_matches.reserve(doc_indexes.len());
+                        tmp_matches.push((di.document_id, id, match_, highlight));
+                    }
+                }
+            }
+        }
 
-                        for di in doc_indexes.as_ref() {
-                            let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
-                            if let Some(attribute) = attribute {
-                                let match_ = TmpMatch {
-                                    query_index: *index as u32,
-                                    distance,
-                                    attribute,
-                                    word_index: di.word_index,
-                                    is_exact,
-                                };
+        if *is_phrase_query {
+            tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
+            for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
+                for window in group.windows(2) {
+                    let (ida, ia, ma, ha) = window[0];
+                    let (idb, ib, mb, hb) = window[1];
 
-                                let highlight = Highlight {
-                                    attribute: di.attribute,
-                                    char_index: di.char_index,
-                                    char_length: di.char_length,
-                                };
+                    debug_assert_eq!(ida, idb);
 
-                                tmp_matches.push((di.document_id, id, match_, highlight));
-                            }
-                        }
-                    }
-                }
-
-                tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
-                for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
-                    for window in group.windows(2) {
-                        let (ida, ia, ma, ha) = window[0];
-                        let (idb, ib, mb, hb) = window[1];
-
-                        debug_assert_eq!(ida, idb);
-
-                        // if matches must follow and actually follows themselves
-                        if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
-
-                            // TODO we must make it work for phrase query longer than 2
-                            // if the second match is the last phrase query word
-                            if ib + 1 == phrase_query_len {
-                                // insert first match
-                                matches.push((ida, ma));
-                                highlights.push((ida, ha));
-
-                                // insert second match
-                                matches.push((idb, mb));
-                                highlights.push((idb, hb));
-                            }
+                    // if matches must follow and actually follows themselves
+                    if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
+                        // TODO we must make it work for phrase query longer than 2
+                        // if the second match is the last phrase query word
+                        if ib + 1 == phrase_query_len {
+                            // insert first match
+                            matches.push((ida, ma));
+                            highlights.push((ida, ha));
+
+                            // insert second match
+                            matches.push((idb, mb));
+                            highlights.push((idb, hb));
                         }
                     }
                 }
             }
+        } else {
+            for (id, _, match_, highlight) in tmp_matches {
+                matches.push((id, match_));
+                highlights.push((id, highlight));
+            }
         }
     }
 
@@ -442,8 +406,13 @@ where
     let start_processing = Instant::now();
     let mut raw_documents_processed = Vec::with_capacity(range.len());
 
-    let (automaton_producer, query_enhancer) =
-        AutomatonProducer::new(reader, query, main_store, postings_lists_store, synonyms_store)?;
+    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
+        reader,
+        query,
+        main_store,
+        postings_lists_store,
+        synonyms_store,
+    )?;
 
     let automaton_producer = automaton_producer.into_iter();
     let mut automatons = Vec::new();
@@ -555,8 +524,13 @@ where
     let start_processing = Instant::now();
    let mut raw_documents_processed = Vec::new();
 
-    let (automaton_producer, query_enhancer) =
-        AutomatonProducer::new(reader, query, main_store, postings_lists_store, synonyms_store)?;
+    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
+        reader,
+        query,
+        main_store,
+        postings_lists_store,
+        synonyms_store,
+    )?;
 
     let automaton_producer = automaton_producer.into_iter();
     let mut automatons = Vec::new();
@@ -1778,9 +1752,8 @@ mod tests {
         let store = TempDatabase::from_iter(vec![
             ("search", &[doc_index(0, 0)][..]),
             ("engine", &[doc_index(0, 1)][..]),
-
-            ("search", &[doc_index(1, 0)][..]),
-            ("slow", &[doc_index(1, 1)][..]),
+
+            ("slow", &[doc_index(1, 1)][..]),
             ("engine", &[doc_index(1, 2)][..]),
         ]);
 
@@ -1806,15 +1779,13 @@ mod tests {
             ("search", &[doc_index(0, 0)][..]),
             ("search", &[doc_index(0, 1)][..]),
             ("engine", &[doc_index(0, 2)][..]),
-
-            ("search", &[doc_index(1, 0)][..]),
-            ("slow", &[doc_index(1, 1)][..]),
+
+            ("slow", &[doc_index(1, 1)][..]),
             ("search", &[doc_index(1, 2)][..]),
             ("engine", &[doc_index(1, 3)][..]),
 
-            ("search", &[doc_index(1, 0)][..]),
             ("search", &[doc_index(1, 1)][..]),
-            ("slow", &[doc_index(1, 2)][..]),
+            ("slow", &[doc_index(1, 2)][..]),
             ("engine", &[doc_index(1, 3)][..]),
         ]);