Introduce a basic working version of phrase query for splitting words

This commit is contained in:
Clément Renault 2019-10-22 18:15:43 +02:00
parent 0fbd4cd632
commit 03eb7898e7
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 252 additions and 58 deletions

View File

@ -2,7 +2,7 @@ mod dfa;
mod query_enhancer; mod query_enhancer;
use std::cmp::Reverse; use std::cmp::Reverse;
use std::vec; use std::{cmp, vec};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA; use levenshtein_automata::DFA;
@ -18,7 +18,7 @@ use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3; const NGRAMS: usize = 3;
pub struct AutomatonProducer { pub struct AutomatonProducer {
automatons: Vec<Vec<Automaton>>, automatons: Vec<AutomatonGroup>,
} }
impl AutomatonProducer { impl AutomatonProducer {
@ -26,19 +26,26 @@ impl AutomatonProducer {
reader: &heed::RoTxn, reader: &heed::RoTxn,
query: &str, query: &str,
main_store: store::Main, main_store: store::Main,
postings_list_store: store::PostingsLists,
synonyms_store: store::Synonyms, synonyms_store: store::Synonyms,
) -> MResult<(AutomatonProducer, QueryEnhancer)> { ) -> MResult<(AutomatonProducer, QueryEnhancer)> {
let (automatons, query_enhancer) = let (automatons, query_enhancer) =
generate_automatons(reader, query, main_store, synonyms_store)?; generate_automatons(reader, query, main_store, postings_list_store, synonyms_store)?;
Ok((AutomatonProducer { automatons }, query_enhancer)) Ok((AutomatonProducer { automatons }, query_enhancer))
} }
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> { pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
self.automatons.into_iter() self.automatons.into_iter()
} }
} }
#[derive(Debug)]
pub enum AutomatonGroup {
Normal(Vec<Automaton>),
PhraseQuery(Vec<Automaton>),
}
#[derive(Debug)] #[derive(Debug)]
pub struct Automaton { pub struct Automaton {
pub index: usize, pub index: usize,
@ -102,12 +109,42 @@ pub fn normalize_str(string: &str) -> String {
string string
} }
fn split_best_frequency<'a>(
reader: &heed::RoTxn,
word: &'a str,
postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len());
let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len());
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn generate_automatons( fn generate_automatons(
reader: &heed::RoTxn, reader: &heed::RoTxn,
query: &str, query: &str,
main_store: store::Main, main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms, synonym_store: store::Synonyms,
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> { ) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)>
{
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? { let synonyms = match main_store.synonyms_fst(reader)? {
@ -136,7 +173,7 @@ fn generate_automatons(
original_automatons.push(automaton); original_automatons.push(automaton);
} }
automatons.push(original_automatons); automatons.push(AutomatonGroup::Normal(original_automatons));
for n in 1..=NGRAMS { for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable(); let mut ngrams = query_words.windows(n).enumerate().peekable();
@ -188,13 +225,25 @@ fn generate_automatons(
Automaton::non_exact(automaton_index, n, synonym) Automaton::non_exact(automaton_index, n, synonym)
}; };
automaton_index += 1; automaton_index += 1;
automatons.push(vec![automaton]); automatons.push(AutomatonGroup::Normal(vec![automaton]));
} }
} }
} }
} }
if n != 1 { if n == 1 {
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
let a = Automaton::exact(automaton_index, 1, left);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
let b = Automaton::exact(automaton_index, 1, right);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(AutomatonGroup::PhraseQuery(vec![a, b]));
}
} else {
// automaton of concatenation of query words // automaton of concatenation of query words
let concat = ngram_slice.concat(); let concat = ngram_slice.concat();
let normalized = normalize_str(&concat); let normalized = normalize_str(&concat);
@ -204,15 +253,18 @@ fn generate_automatons(
let automaton = Automaton::exact(automaton_index, n, &normalized); let automaton = Automaton::exact(automaton_index, n, &normalized);
automaton_index += 1; automaton_index += 1;
automatons.push(vec![automaton]); automatons.push(AutomatonGroup::Normal(vec![automaton]));
} }
} }
} }
// order automatons, the most important first, // order automatons, the most important first,
// we keep the original automatons at the front. // we keep the original automatons at the front.
automatons[1..].sort_by_key(|a| { automatons[1..].sort_by_key(|group| {
let a = a.first().unwrap(); let a = match group {
AutomatonGroup::Normal(group) => group.first().unwrap(),
AutomatonGroup::PhraseQuery(group) => group.first().unwrap(),
};
(Reverse(a.is_exact), a.ngram) (Reverse(a.is_exact), a.ngram)
}); });

View File

@ -8,7 +8,7 @@ use fst::{IntoStreamer, Streamer};
use sdset::SetBuf; use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut}; use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer}; use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::{raw_documents_from, RawDocument}; use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch}; use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
@ -138,7 +138,7 @@ fn multiword_rewrite_matches(
fn fetch_raw_documents( fn fetch_raw_documents(
reader: &heed::RoTxn, reader: &heed::RoTxn,
automatons: &[Automaton], automatons_groups: &[AutomatonGroup],
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
searchables: Option<&ReorderedAttrs>, searchables: Option<&ReorderedAttrs>,
main_store: store::Main, main_store: store::Main,
@ -148,52 +148,127 @@ fn fetch_raw_documents(
let mut matches = Vec::new(); let mut matches = Vec::new();
let mut highlights = Vec::new(); let mut highlights = Vec::new();
for automaton in automatons { for group in automatons_groups {
let Automaton { match group {
index, AutomatonGroup::Normal(automatons) => {
is_exact, for automaton in automatons {
query_len, let Automaton { index, is_exact, query_len, .. } = automaton;
.. let dfa = automaton.dfa();
} = automaton;
let dfa = automaton.dfa();
let words = match main_store.words_fst(reader)? { let words = match main_store.words_fst(reader)? {
Some(words) => words, Some(words) => words,
None => return Ok(Vec::new()), None => return Ok(Vec::new()),
};
let mut stream = words.search(&dfa).into_stream();
while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
};
matches.reserve(doc_indexes.len());
highlights.reserve(doc_indexes.len());
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
}; };
let highlight = Highlight { let mut stream = words.search(&dfa).into_stream();
attribute: di.attribute, while let Some(input) = stream.next() {
char_index: di.char_index, let distance = dfa.eval(input).to_u8();
char_length: di.char_length, let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
};
matches.reserve(doc_indexes.len());
highlights.reserve(doc_indexes.len());
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: di.char_length,
};
matches.push((di.document_id, match_));
highlights.push((di.document_id, highlight));
}
}
}
}
},
AutomatonGroup::PhraseQuery(automatons) => {
let mut tmp_matches = Vec::new();
let phrase_query_len = automatons.len();
for (id, automaton) in automatons.into_iter().enumerate() {
let Automaton { index, is_exact, query_len, .. } = automaton;
let dfa = automaton.dfa();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
}; };
matches.push((di.document_id, match_)); let mut stream = words.search(&dfa).into_stream();
highlights.push((di.document_id, highlight)); while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
};
tmp_matches.reserve(doc_indexes.len());
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: di.char_length,
};
tmp_matches.push((di.document_id, id, match_, highlight));
}
}
}
}
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
for window in group.windows(2) {
let (ida, ia, ma, ha) = window[0];
let (idb, ib, mb, hb) = window[1];
debug_assert_eq!(ida, idb);
// if matches must follow and actually follows themselves
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
// TODO we must make it work for phrase query longer than 2
// if the second match is the last phrase query word
if ib + 1 == phrase_query_len {
// insert first match
matches.push((ida, ma));
highlights.push((ida, ha));
// insert second match
matches.push((idb, mb));
highlights.push((idb, hb));
}
}
}
} }
} }
} }
@ -368,14 +443,14 @@ where
let mut raw_documents_processed = Vec::with_capacity(range.len()); let mut raw_documents_processed = Vec::with_capacity(range.len());
let (automaton_producer, query_enhancer) = let (automaton_producer, query_enhancer) =
AutomatonProducer::new(reader, query, main_store, synonyms_store)?; AutomatonProducer::new(reader, query, main_store, postings_lists_store, synonyms_store)?;
let automaton_producer = automaton_producer.into_iter(); let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new(); let mut automatons = Vec::new();
// aggregate automatons groups by groups after time // aggregate automatons groups by groups after time
for auts in automaton_producer { for auts in automaton_producer {
automatons.extend(auts); automatons.push(auts);
// we must retrieve the documents associated // we must retrieve the documents associated
// with the current automatons // with the current automatons
@ -481,14 +556,14 @@ where
let mut raw_documents_processed = Vec::new(); let mut raw_documents_processed = Vec::new();
let (automaton_producer, query_enhancer) = let (automaton_producer, query_enhancer) =
AutomatonProducer::new(reader, query, main_store, synonyms_store)?; AutomatonProducer::new(reader, query, main_store, postings_lists_store, synonyms_store)?;
let automaton_producer = automaton_producer.into_iter(); let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new(); let mut automatons = Vec::new();
// aggregate automatons groups by groups after time // aggregate automatons groups by groups after time
for auts in automaton_producer { for auts in automaton_producer {
automatons.extend(auts); automatons.push(auts);
// we must retrieve the documents associated // we must retrieve the documents associated
// with the current automatons // with the current automatons
@ -1697,4 +1772,71 @@ mod tests {
}); });
assert_matches!(iter.next(), None); assert_matches!(iter.next(), None);
} }
#[test]
fn simple_phrase_query_splitting() {
let store = TempDatabase::from_iter(vec![
("search", &[doc_index(0, 0)][..]),
("engine", &[doc_index(0, 1)][..]),
("search", &[doc_index(1, 0)][..]),
("slow", &[doc_index(1, 1)][..]),
("engine", &[doc_index(1, 2)][..]),
]);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);
}
#[test]
fn harder_phrase_query_splitting() {
let store = TempDatabase::from_iter(vec![
("search", &[doc_index(0, 0)][..]),
("search", &[doc_index(0, 1)][..]),
("engine", &[doc_index(0, 2)][..]),
("search", &[doc_index(1, 0)][..]),
("slow", &[doc_index(1, 1)][..]),
("search", &[doc_index(1, 2)][..]),
("engine", &[doc_index(1, 3)][..]),
("search", &[doc_index(1, 0)][..]),
("search", &[doc_index(1, 1)][..]),
("slow", &[doc_index(1, 2)][..]),
("engine", &[doc_index(1, 3)][..]),
]);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);
}
} }