Improve query parsing and interpretation

This commit is contained in:
Clément Renault 2020-01-16 14:11:17 +01:00
parent 70d4f47f37
commit 3912d1ec4b
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 31 additions and 21 deletions

View File

@ -28,7 +28,7 @@ use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::RawDocument;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult};
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult, PostingsKey};
use crate::query_tree::Context as QTContext;
use crate::store::Postings;
@ -98,7 +98,7 @@ where
let mut bare_matches = Vec::new();
mk_arena!(arena);
for ((query, input, distance), matches) in queries {
for (PostingsKey{ query, input, distance, is_exact }, matches) in queries {
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
let pllen = postings_list_view.len() as f32;
@ -115,7 +115,7 @@ where
document_id,
query_index: query.id,
distance,
is_exact: true, // TODO where can I find this info?
is_exact,
postings_list: posting_list_index,
};
@ -166,7 +166,6 @@ where
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
let before_raw_documents_building = Instant::now();
let mut prefiltered_documents = 0;
let mut raw_documents = Vec::new();
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());

View File

@ -5,10 +5,11 @@ use std::ops::Range;
use std::time::Instant;
use std::{cmp, fmt, iter::once};
use fst::{IntoStreamer, Streamer};
use itertools::{EitherOrBoth, merge_join_by};
use meilisearch_tokenizer::split_query_string;
use sdset::{Set, SetBuf, SetOperation};
use slice_group_by::StrGroupBy;
use itertools::{EitherOrBoth, merge_join_by};
use fst::{IntoStreamer, Streamer};
use crate::database::MainT;
use crate::{store, DocumentId, DocIndex, MResult};
@ -183,8 +184,7 @@ pub fn create_query_tree(
query: &str,
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
{
let query = query.to_lowercase();
let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned);
let words = split_query_string(query).map(str::to_lowercase);
let words: Vec<_> = words.filter(|s| !s.contains(char::is_whitespace)).enumerate().collect();
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
@ -270,14 +270,22 @@ pub fn create_query_tree(
}
}
let mapping = mapper.mapping();
let operation = create_operation(ngrams, Operation::Or);
let mapping = mapper.mapping();
Ok((operation, mapping))
}
/// Key under which one postings list is stored in the [`Postings`] map.
///
/// `Eq` and `Hash` are derived so the struct can be used directly as a
/// `HashMap` key; two keys are equal only when all four fields are equal,
/// so an exact and a non-exact postings list for the same query and word
/// are kept as separate entries.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PostingsKey<'o> {
/// The query this postings list was retrieved for.
pub query: &'o Query,
/// Bytes of the word the postings list belongs to; may be empty
/// (one construction site passes `vec![]`).
pub input: Vec<u8>,
/// Distance associated with `input`: either the value returned by
/// `dfa.eval(input).to_u8()` or a literal `0`, depending on the call site.
pub distance: u8,
/// Whether the matches of this postings list must be considered exact.
pub is_exact: bool,
}
/// Distance value stored on a single byte.
pub type Distance = u8;
pub type Postings<'o, 'txn> = HashMap<(&'o Query, Vec<u8>, Distance), Cow<'txn, Set<DocIndex>>>;
/// Postings lists collected while traversing the query tree, indexed by [`PostingsKey`].
pub type Postings<'o, 'txn> = HashMap<PostingsKey<'o>, Cow<'txn, Set<DocIndex>>>;
/// Associates an [`Operation`] with a set of document ids.
// NOTE(review): presumably memoizes the result of already-evaluated operations — confirm against `traverse_query_tree`.
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
pub struct QueryResult<'o, 'txn> {
@ -392,18 +400,18 @@ pub fn traverse_query_tree<'o, 'txn>(
let mut docids = Vec::new();
// We retrieve the cached postings list for all
// We retrieve the cached postings lists for all
// the words that start with this short prefix.
let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
let distance = 0;
postings.insert((query, word.clone().into_bytes(), distance), result.matches);
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: false };
postings.insert(key, result.matches);
docids.extend_from_slice(&result.docids);
// We retrieve the exact postings list for the prefix,
// because we must consider these matches as exact.
if let Some(result) = ctx.postings_lists.postings_list(reader, word.as_bytes())? {
let distance = 0;
postings.insert((query, word.clone().into_bytes(), distance), result.matches);
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: true };
postings.insert(key, result.matches);
docids.extend_from_slice(&result.docids);
}
@ -426,10 +434,12 @@ pub fn traverse_query_tree<'o, 'txn>(
let before = Instant::now();
let mut docids = Vec::new();
while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
let distance = dfa.eval(input).to_u8();
let is_exact = *prefix == false && distance == 0 && input.len() == word.len();
docids.extend_from_slice(&result.docids);
postings.insert((query, input.to_owned(), distance), result.matches);
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
postings.insert(key, result.matches);
}
}
println!("{:3$}docids extend ({:?}) took {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
@ -454,10 +464,11 @@ pub fn traverse_query_tree<'o, 'txn>(
let mut docids = Vec::new();
while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
let distance = dfa.eval(input).to_u8();
docids.extend_from_slice(&result.docids);
postings.insert((query, input.to_owned(), distance), result.matches);
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: true };
postings.insert(key, result.matches);
}
}
@ -491,8 +502,8 @@ pub fn traverse_query_tree<'o, 'txn>(
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
let matches = Cow::Owned(SetBuf::new(matches).unwrap());
let distance = 0;
postings.insert((query, vec![], distance), matches);
let key = PostingsKey { query, input: vec![], distance: 0, is_exact: true };
postings.insert(key, matches);
Cow::Owned(docids)
} else {