mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Introduce a query tree context struct
This commit is contained in:
parent
887c212b49
commit
d724a7659e
@ -29,6 +29,7 @@ use crate::raw_document::RawDocument;
|
|||||||
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
||||||
use crate::{store, Document, DocumentId, MResult};
|
use crate::{store, Document, DocumentId, MResult};
|
||||||
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult};
|
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult};
|
||||||
|
use crate::query_tree::Context as QTContext;
|
||||||
|
|
||||||
pub fn bucket_sort<'c, FI>(
|
pub fn bucket_sort<'c, FI>(
|
||||||
reader: &heed::RoTxn<MainT>,
|
reader: &heed::RoTxn<MainT>,
|
||||||
@ -47,22 +48,23 @@ pub fn bucket_sort<'c, FI>(
|
|||||||
where
|
where
|
||||||
FI: Fn(DocumentId) -> bool,
|
FI: Fn(DocumentId) -> bool,
|
||||||
{
|
{
|
||||||
let operation = create_query_tree(reader, postings_lists_store, synonyms_store, query).unwrap();
|
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
|
||||||
println!("{:?}", operation);
|
|
||||||
|
|
||||||
let words = match unsafe { main_store.static_words_fst(reader)? } {
|
|
||||||
Some(words) => words,
|
Some(words) => words,
|
||||||
None => return Ok(Vec::new()),
|
None => return Ok(Vec::new()),
|
||||||
};
|
};
|
||||||
|
|
||||||
let QueryResult { docids, queries } =
|
let context = QTContext {
|
||||||
traverse_query_tree(
|
words_set,
|
||||||
reader,
|
synonyms: synonyms_store,
|
||||||
&words,
|
postings_lists: postings_lists_store,
|
||||||
postings_lists_store,
|
prefix_postings_lists: prefix_postings_lists_cache_store,
|
||||||
prefix_postings_lists_cache_store,
|
};
|
||||||
&operation,
|
|
||||||
).unwrap();
|
let operation = create_query_tree(reader, &context, query).unwrap();
|
||||||
|
println!("{:?}", operation);
|
||||||
|
|
||||||
|
|
||||||
|
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap();
|
||||||
println!("found {} documents", docids.len());
|
println!("found {} documents", docids.len());
|
||||||
println!("number of postings {:?}", queries.len());
|
println!("number of postings {:?}", queries.len());
|
||||||
|
|
||||||
|
@ -93,26 +93,22 @@ pub struct PostingsList {
|
|||||||
matches: SetBuf<DocIndex>,
|
matches: SetBuf<DocIndex>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default)]
|
|
||||||
pub struct Context {
|
pub struct Context {
|
||||||
pub synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
|
pub words_set: fst::Set,
|
||||||
pub postings: HashMap<String, PostingsList>,
|
pub synonyms: store::Synonyms,
|
||||||
|
pub postings_lists: store::PostingsLists,
|
||||||
|
pub prefix_postings_lists: store::PrefixPostingsListsCache,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn split_best_frequency<'a>(
|
fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'a str) -> MResult<Option<(&'a str, &'a str)>> {
|
||||||
reader: &heed::RoTxn<MainT>,
|
|
||||||
postings_lists: store::PostingsLists,
|
|
||||||
word: &'a str,
|
|
||||||
) -> MResult<Option<(&'a str, &'a str)>>
|
|
||||||
{
|
|
||||||
let chars = word.char_indices().skip(1);
|
let chars = word.char_indices().skip(1);
|
||||||
let mut best = None;
|
let mut best = None;
|
||||||
|
|
||||||
for (i, _) in chars {
|
for (i, _) in chars {
|
||||||
let (left, right) = word.split_at(i);
|
let (left, right) = word.split_at(i);
|
||||||
|
|
||||||
let left_freq = postings_lists.postings_list(reader, left.as_bytes())?.map(|pl| pl.len()).unwrap_or(0);
|
let left_freq = ctx.postings_lists.postings_list(reader, left.as_bytes())?.map(|pl| pl.len()).unwrap_or(0);
|
||||||
let right_freq = postings_lists.postings_list(reader, right.as_bytes())?.map(|pl| pl.len()).unwrap_or(0);
|
let right_freq = ctx.postings_lists.postings_list(reader, right.as_bytes())?.map(|pl| pl.len()).unwrap_or(0);
|
||||||
|
|
||||||
let min_freq = cmp::min(left_freq, right_freq);
|
let min_freq = cmp::min(left_freq, right_freq);
|
||||||
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||||
@ -123,12 +119,7 @@ fn split_best_frequency<'a>(
|
|||||||
Ok(best.map(|(_, l, r)| (l, r)))
|
Ok(best.map(|(_, l, r)| (l, r)))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fetch_synonyms(
|
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
|
||||||
reader: &heed::RoTxn<MainT>,
|
|
||||||
synonyms: store::Synonyms,
|
|
||||||
words: &[&str],
|
|
||||||
) -> MResult<Vec<Vec<String>>>
|
|
||||||
{
|
|
||||||
let words = words.join(" "); // TODO ugly
|
let words = words.join(" "); // TODO ugly
|
||||||
// synonyms.synonyms(reader, words.as_bytes()).cloned().unwrap_or_default()
|
// synonyms.synonyms(reader, words.as_bytes()).cloned().unwrap_or_default()
|
||||||
Ok(vec![])
|
Ok(vec![])
|
||||||
@ -154,13 +145,7 @@ where I: IntoIterator<Item=Operation>,
|
|||||||
|
|
||||||
const MAX_NGRAM: usize = 3;
|
const MAX_NGRAM: usize = 3;
|
||||||
|
|
||||||
pub fn create_query_tree(
|
pub fn create_query_tree(reader: &heed::RoTxn<MainT>, ctx: &Context, query: &str) -> MResult<Operation> {
|
||||||
reader: &heed::RoTxn<MainT>,
|
|
||||||
postings_lists: store::PostingsLists,
|
|
||||||
synonyms: store::Synonyms,
|
|
||||||
query: &str,
|
|
||||||
) -> MResult<Operation>
|
|
||||||
{
|
|
||||||
let query = query.to_lowercase();
|
let query = query.to_lowercase();
|
||||||
|
|
||||||
let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned);
|
let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned);
|
||||||
@ -182,11 +167,11 @@ pub fn create_query_tree(
|
|||||||
let mut alts = Vec::new();
|
let mut alts = Vec::new();
|
||||||
match words {
|
match words {
|
||||||
[(id, word)] => {
|
[(id, word)] => {
|
||||||
let phrase = split_best_frequency(reader, postings_lists, word)?
|
let phrase = split_best_frequency(reader, ctx, word)?
|
||||||
.map(|ws| Query::phrase2(*id, is_last, ws))
|
.map(|ws| Query::phrase2(*id, is_last, ws))
|
||||||
.map(Operation::Query);
|
.map(Operation::Query);
|
||||||
|
|
||||||
let synonyms = fetch_synonyms(reader, synonyms, &[word])?.into_iter().map(|alts| {
|
let synonyms = fetch_synonyms(reader, ctx, &[word])?.into_iter().map(|alts| {
|
||||||
let iter = alts.into_iter().map(|w| Query::exact(*id, false, &w)).map(Operation::Query);
|
let iter = alts.into_iter().map(|w| Query::exact(*id, false, &w)).map(Operation::Query);
|
||||||
create_operation(iter, Operation::And)
|
create_operation(iter, Operation::And)
|
||||||
});
|
});
|
||||||
@ -200,7 +185,7 @@ pub fn create_query_tree(
|
|||||||
let id = words[0].0;
|
let id = words[0].0;
|
||||||
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
|
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
|
||||||
|
|
||||||
for synonym in fetch_synonyms(reader, synonyms, &words)? {
|
for synonym in fetch_synonyms(reader, ctx, &words)? {
|
||||||
let synonym = synonym.into_iter().map(|s| Operation::Query(Query::exact(id, false, &s)));
|
let synonym = synonym.into_iter().map(|s| Operation::Query(Query::exact(id, false, &s)));
|
||||||
let synonym = create_operation(synonym, Operation::And);
|
let synonym = create_operation(synonym, Operation::And);
|
||||||
alts.push(synonym);
|
alts.push(synonym);
|
||||||
@ -232,17 +217,13 @@ pub type Cache<'o, 'c> = HashMap<&'o Operation, SetBuf<DocumentId>>;
|
|||||||
|
|
||||||
pub fn traverse_query_tree<'o, 'txn>(
|
pub fn traverse_query_tree<'o, 'txn>(
|
||||||
reader: &'txn heed::RoTxn<MainT>,
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
words_set: &fst::Set,
|
ctx: &Context,
|
||||||
postings_lists: store::PostingsLists,
|
|
||||||
prefix_postings_lists: store::PrefixPostingsListsCache,
|
|
||||||
tree: &'o Operation,
|
tree: &'o Operation,
|
||||||
) -> MResult<QueryResult<'o, 'txn>>
|
) -> MResult<QueryResult<'o, 'txn>>
|
||||||
{
|
{
|
||||||
fn execute_and<'o, 'txn>(
|
fn execute_and<'o, 'txn>(
|
||||||
reader: &'txn heed::RoTxn<MainT>,
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
words_set: &fst::Set,
|
ctx: &Context,
|
||||||
pls: store::PostingsLists,
|
|
||||||
ppls: store::PrefixPostingsListsCache,
|
|
||||||
cache: &mut Cache<'o, 'txn>,
|
cache: &mut Cache<'o, 'txn>,
|
||||||
postings: &mut Postings<'o, 'txn>,
|
postings: &mut Postings<'o, 'txn>,
|
||||||
depth: usize,
|
depth: usize,
|
||||||
@ -257,9 +238,9 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
for op in operations {
|
for op in operations {
|
||||||
if cache.get(op).is_none() {
|
if cache.get(op).is_none() {
|
||||||
let docids = match op {
|
let docids = match op {
|
||||||
Operation::And(ops) => execute_and(reader, words_set, pls, ppls, cache, postings, depth + 1, &ops)?,
|
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||||
Operation::Or(ops) => execute_or(reader, words_set, pls, ppls, cache, postings, depth + 1, &ops)?,
|
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||||
Operation::Query(query) => execute_query(reader, words_set, pls, ppls, postings, depth + 1, &query)?,
|
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
|
||||||
};
|
};
|
||||||
cache.insert(op, docids);
|
cache.insert(op, docids);
|
||||||
}
|
}
|
||||||
@ -281,9 +262,7 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
|
|
||||||
fn execute_or<'o, 'txn>(
|
fn execute_or<'o, 'txn>(
|
||||||
reader: &'txn heed::RoTxn<MainT>,
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
words_set: &fst::Set,
|
ctx: &Context,
|
||||||
pls: store::PostingsLists,
|
|
||||||
ppls: store::PrefixPostingsListsCache,
|
|
||||||
cache: &mut Cache<'o, 'txn>,
|
cache: &mut Cache<'o, 'txn>,
|
||||||
postings: &mut Postings<'o, 'txn>,
|
postings: &mut Postings<'o, 'txn>,
|
||||||
depth: usize,
|
depth: usize,
|
||||||
@ -300,9 +279,9 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
Some(docids) => docids,
|
Some(docids) => docids,
|
||||||
None => {
|
None => {
|
||||||
let docids = match op {
|
let docids = match op {
|
||||||
Operation::And(ops) => execute_and(reader, words_set, pls, ppls, cache, postings, depth + 1, &ops)?,
|
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||||
Operation::Or(ops) => execute_or(reader, words_set, pls, ppls, cache, postings, depth + 1, &ops)?,
|
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||||
Operation::Query(query) => execute_query(reader, words_set, pls, ppls, postings, depth + 1, &query)?,
|
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
|
||||||
};
|
};
|
||||||
cache.entry(op).or_insert(docids)
|
cache.entry(op).or_insert(docids)
|
||||||
}
|
}
|
||||||
@ -320,9 +299,7 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
|
|
||||||
fn execute_query<'o, 'txn>(
|
fn execute_query<'o, 'txn>(
|
||||||
reader: &'txn heed::RoTxn<MainT>,
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
words_set: &fst::Set,
|
ctx: &Context,
|
||||||
pls: store::PostingsLists,
|
|
||||||
ppls: store::PrefixPostingsListsCache,
|
|
||||||
postings: &mut Postings<'o, 'txn>,
|
postings: &mut Postings<'o, 'txn>,
|
||||||
depth: usize,
|
depth: usize,
|
||||||
query: &'o Query,
|
query: &'o Query,
|
||||||
@ -335,7 +312,7 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
QueryKind::Tolerant(word) => {
|
QueryKind::Tolerant(word) => {
|
||||||
if *prefix && word.len() == 1 {
|
if *prefix && word.len() == 1 {
|
||||||
let prefix = [word.as_bytes()[0], 0, 0, 0];
|
let prefix = [word.as_bytes()[0], 0, 0, 0];
|
||||||
let matches = ppls.prefix_postings_list(reader, prefix)?.unwrap_or_default();
|
let matches = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
|
||||||
|
|
||||||
let before = Instant::now();
|
let before = Instant::now();
|
||||||
let mut docids: Vec<_> = matches.into_iter().map(|m| m.document_id).collect();
|
let mut docids: Vec<_> = matches.into_iter().map(|m| m.document_id).collect();
|
||||||
@ -349,14 +326,14 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
|
|
||||||
let byte = word.as_bytes()[0];
|
let byte = word.as_bytes()[0];
|
||||||
let mut stream = if byte == u8::max_value() {
|
let mut stream = if byte == u8::max_value() {
|
||||||
words_set.search(&dfa).ge(&[byte]).into_stream()
|
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
|
||||||
} else {
|
} else {
|
||||||
words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
|
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut docids = Vec::new();
|
let mut docids = Vec::new();
|
||||||
while let Some(input) = stream.next() {
|
while let Some(input) = stream.next() {
|
||||||
if let Some(matches) = pls.postings_list(reader, input)? {
|
if let Some(matches) = ctx.postings_lists.postings_list(reader, input)? {
|
||||||
docids.extend(matches.iter().map(|d| d.document_id))
|
docids.extend(matches.iter().map(|d| d.document_id))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -374,14 +351,14 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
|
|
||||||
let byte = word.as_bytes()[0];
|
let byte = word.as_bytes()[0];
|
||||||
let mut stream = if byte == u8::max_value() {
|
let mut stream = if byte == u8::max_value() {
|
||||||
words_set.search(&dfa).ge(&[byte]).into_stream()
|
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
|
||||||
} else {
|
} else {
|
||||||
words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
|
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut docids = Vec::new();
|
let mut docids = Vec::new();
|
||||||
while let Some(input) = stream.next() {
|
while let Some(input) = stream.next() {
|
||||||
if let Some(matches) = pls.postings_list(reader, input)? {
|
if let Some(matches) = ctx.postings_lists.postings_list(reader, input)? {
|
||||||
docids.extend(matches.iter().map(|d| d.document_id))
|
docids.extend(matches.iter().map(|d| d.document_id))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -395,8 +372,8 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
QueryKind::Phrase(words) => {
|
QueryKind::Phrase(words) => {
|
||||||
// TODO support prefix and non-prefix exact DFA
|
// TODO support prefix and non-prefix exact DFA
|
||||||
if let [first, second] = words.as_slice() {
|
if let [first, second] = words.as_slice() {
|
||||||
let first = pls.postings_list(reader, first.as_bytes())?.unwrap_or_default();
|
let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default();
|
||||||
let second = pls.postings_list(reader, second.as_bytes())?.unwrap_or_default();
|
let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default();
|
||||||
|
|
||||||
let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| {
|
let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| {
|
||||||
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
|
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
|
||||||
@ -435,9 +412,9 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
let mut postings = Postings::new();
|
let mut postings = Postings::new();
|
||||||
|
|
||||||
let docids = match tree {
|
let docids = match tree {
|
||||||
Operation::And(ops) => execute_and(reader, words_set, postings_lists, prefix_postings_lists, &mut cache, &mut postings, 0, &ops)?,
|
Operation::And(ops) => execute_and(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
|
||||||
Operation::Or(ops) => execute_or(reader, words_set, postings_lists, prefix_postings_lists, &mut cache, &mut postings, 0, &ops)?,
|
Operation::Or(ops) => execute_or(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
|
||||||
Operation::Query(query) => execute_query(reader, words_set, postings_lists, prefix_postings_lists, &mut postings, 0, &query)?,
|
Operation::Query(query) => execute_query(reader, ctx, &mut postings, 0, &query)?,
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(QueryResult { docids, queries: postings })
|
Ok(QueryResult { docids, queries: postings })
|
||||||
|
Loading…
Reference in New Issue
Block a user