mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-23 21:20:24 +01:00
Improve query parsing and interpretation
This commit is contained in:
parent
70d4f47f37
commit
3912d1ec4b
@ -28,7 +28,7 @@ use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||
use crate::raw_document::RawDocument;
|
||||
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
||||
use crate::{store, Document, DocumentId, MResult};
|
||||
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult};
|
||||
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult, PostingsKey};
|
||||
use crate::query_tree::Context as QTContext;
|
||||
use crate::store::Postings;
|
||||
|
||||
@ -98,7 +98,7 @@ where
|
||||
let mut bare_matches = Vec::new();
|
||||
mk_arena!(arena);
|
||||
|
||||
for ((query, input, distance), matches) in queries {
|
||||
for (PostingsKey{ query, input, distance, is_exact }, matches) in queries {
|
||||
|
||||
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
|
||||
let pllen = postings_list_view.len() as f32;
|
||||
@ -115,7 +115,7 @@ where
|
||||
document_id,
|
||||
query_index: query.id,
|
||||
distance,
|
||||
is_exact: true, // TODO where can I find this info?
|
||||
is_exact,
|
||||
postings_list: posting_list_index,
|
||||
};
|
||||
|
||||
@ -166,7 +166,6 @@ where
|
||||
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
||||
|
||||
let before_raw_documents_building = Instant::now();
|
||||
let mut prefiltered_documents = 0;
|
||||
let mut raw_documents = Vec::new();
|
||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
|
||||
|
@ -5,10 +5,11 @@ use std::ops::Range;
|
||||
use std::time::Instant;
|
||||
use std::{cmp, fmt, iter::once};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use itertools::{EitherOrBoth, merge_join_by};
|
||||
use meilisearch_tokenizer::split_query_string;
|
||||
use sdset::{Set, SetBuf, SetOperation};
|
||||
use slice_group_by::StrGroupBy;
|
||||
use itertools::{EitherOrBoth, merge_join_by};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
|
||||
use crate::database::MainT;
|
||||
use crate::{store, DocumentId, DocIndex, MResult};
|
||||
@ -183,8 +184,7 @@ pub fn create_query_tree(
|
||||
query: &str,
|
||||
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
||||
{
|
||||
let query = query.to_lowercase();
|
||||
let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned);
|
||||
let words = split_query_string(query).map(str::to_lowercase);
|
||||
let words: Vec<_> = words.filter(|s| !s.contains(char::is_whitespace)).enumerate().collect();
|
||||
|
||||
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
||||
@ -270,14 +270,22 @@ pub fn create_query_tree(
|
||||
}
|
||||
}
|
||||
|
||||
let mapping = mapper.mapping();
|
||||
let operation = create_operation(ngrams, Operation::Or);
|
||||
let mapping = mapper.mapping();
|
||||
|
||||
Ok((operation, mapping))
|
||||
}
|
||||
|
||||
/// Key identifying one postings list collected while traversing the query tree.
///
/// It is the key type of the `Postings` map: each entry associates a
/// `PostingsKey` with the set of `DocIndex` matches found for it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct PostingsKey<'o> {
|
||||
/// The query this postings list was fetched for; consumers read `query.id`
/// to fill the `query_index` of the matches they build.
pub query: &'o Query,
|
||||
/// Raw bytes of the word whose postings list was fetched. The traversal
/// stores an empty vector here for the matches it builds itself rather
/// than fetching them from a postings list.
pub input: Vec<u8>,
|
||||
/// Distance reported by the DFA for `input` (`dfa.eval(input).to_u8()`),
/// or `0` for direct prefix/word lookups.
/// NOTE(review): presumably the typo (edit) distance to the query word — confirm.
pub distance: u8,
|
||||
/// Whether matches from this postings list must be treated as exact
/// matches; copied into the `is_exact` field of the matches built from it.
pub is_exact: bool,
|
||||
}
|
||||
|
||||
/// Distance value stored alongside a postings list; a plain `u8`.
pub type Distance = u8;
|
||||
// Tuple-keyed form of the postings map: (query, input bytes, distance).
// NOTE(review): this appears to be the removed side of the diff, replaced by
// the `PostingsKey`-keyed alias just below — confirm.
pub type Postings<'o, 'txn> = HashMap<(&'o Query, Vec<u8>, Distance), Cow<'txn, Set<DocIndex>>>;
|
||||
/// Postings lists gathered during the query tree traversal, keyed by
/// `PostingsKey`; each value is a borrowed-or-owned set of `DocIndex`.
pub type Postings<'o, 'txn> = HashMap<PostingsKey<'o>, Cow<'txn, Set<DocIndex>>>;
|
||||
/// Maps an `Operation` of the query tree to a borrowed-or-owned set of
/// `DocumentId`.
/// NOTE(review): presumably memoizes the document ids already computed for an
/// operation — confirm against `traverse_query_tree`.
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
|
||||
|
||||
pub struct QueryResult<'o, 'txn> {
|
||||
@ -392,18 +400,18 @@ pub fn traverse_query_tree<'o, 'txn>(
|
||||
|
||||
let mut docids = Vec::new();
|
||||
|
||||
// We retrieve the cached postings list for all
|
||||
// We retrieve the cached postings lists for all
|
||||
// the words that start with this short prefix.
|
||||
let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
|
||||
let distance = 0;
|
||||
postings.insert((query, word.clone().into_bytes(), distance), result.matches);
|
||||
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: false };
|
||||
postings.insert(key, result.matches);
|
||||
docids.extend_from_slice(&result.docids);
|
||||
|
||||
// We retrieve the exact postings list for the prefix,
|
||||
// because we must consider these matches as exact.
|
||||
if let Some(result) = ctx.postings_lists.postings_list(reader, word.as_bytes())? {
|
||||
let distance = 0;
|
||||
postings.insert((query, word.clone().into_bytes(), distance), result.matches);
|
||||
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: true };
|
||||
postings.insert(key, result.matches);
|
||||
docids.extend_from_slice(&result.docids);
|
||||
}
|
||||
|
||||
@ -426,10 +434,12 @@ pub fn traverse_query_tree<'o, 'txn>(
|
||||
let before = Instant::now();
|
||||
let mut docids = Vec::new();
|
||||
while let Some(input) = stream.next() {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = *prefix == false && distance == 0 && input.len() == word.len();
|
||||
docids.extend_from_slice(&result.docids);
|
||||
postings.insert((query, input.to_owned(), distance), result.matches);
|
||||
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
|
||||
postings.insert(key, result.matches);
|
||||
}
|
||||
}
|
||||
println!("{:3$}docids extend ({:?}) took {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
||||
@ -454,10 +464,11 @@ pub fn traverse_query_tree<'o, 'txn>(
|
||||
|
||||
let mut docids = Vec::new();
|
||||
while let Some(input) = stream.next() {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
docids.extend_from_slice(&result.docids);
|
||||
postings.insert((query, input.to_owned(), distance), result.matches);
|
||||
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: true };
|
||||
postings.insert(key, result.matches);
|
||||
}
|
||||
}
|
||||
|
||||
@ -491,8 +502,8 @@ pub fn traverse_query_tree<'o, 'txn>(
|
||||
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
||||
|
||||
let matches = Cow::Owned(SetBuf::new(matches).unwrap());
|
||||
let distance = 0;
|
||||
postings.insert((query, vec![], distance), matches);
|
||||
let key = PostingsKey { query, input: vec![], distance: 0, is_exact: true };
|
||||
postings.insert(key, matches);
|
||||
|
||||
Cow::Owned(docids)
|
||||
} else {
|
||||
|
Loading…
x
Reference in New Issue
Block a user