mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
Improve query parsing and interpretation
This commit is contained in:
parent
70d4f47f37
commit
3912d1ec4b
@ -28,7 +28,7 @@ use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
|||||||
use crate::raw_document::RawDocument;
|
use crate::raw_document::RawDocument;
|
||||||
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
||||||
use crate::{store, Document, DocumentId, MResult};
|
use crate::{store, Document, DocumentId, MResult};
|
||||||
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult};
|
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult, PostingsKey};
|
||||||
use crate::query_tree::Context as QTContext;
|
use crate::query_tree::Context as QTContext;
|
||||||
use crate::store::Postings;
|
use crate::store::Postings;
|
||||||
|
|
||||||
@ -98,7 +98,7 @@ where
|
|||||||
let mut bare_matches = Vec::new();
|
let mut bare_matches = Vec::new();
|
||||||
mk_arena!(arena);
|
mk_arena!(arena);
|
||||||
|
|
||||||
for ((query, input, distance), matches) in queries {
|
for (PostingsKey{ query, input, distance, is_exact }, matches) in queries {
|
||||||
|
|
||||||
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
|
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
|
||||||
let pllen = postings_list_view.len() as f32;
|
let pllen = postings_list_view.len() as f32;
|
||||||
@ -115,7 +115,7 @@ where
|
|||||||
document_id,
|
document_id,
|
||||||
query_index: query.id,
|
query_index: query.id,
|
||||||
distance,
|
distance,
|
||||||
is_exact: true, // TODO where can I find this info?
|
is_exact,
|
||||||
postings_list: posting_list_index,
|
postings_list: posting_list_index,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -166,7 +166,6 @@ where
|
|||||||
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
||||||
|
|
||||||
let before_raw_documents_building = Instant::now();
|
let before_raw_documents_building = Instant::now();
|
||||||
let mut prefiltered_documents = 0;
|
|
||||||
let mut raw_documents = Vec::new();
|
let mut raw_documents = Vec::new();
|
||||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||||
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
|
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
|
||||||
|
@ -5,10 +5,11 @@ use std::ops::Range;
|
|||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
use std::{cmp, fmt, iter::once};
|
use std::{cmp, fmt, iter::once};
|
||||||
|
|
||||||
|
use fst::{IntoStreamer, Streamer};
|
||||||
|
use itertools::{EitherOrBoth, merge_join_by};
|
||||||
|
use meilisearch_tokenizer::split_query_string;
|
||||||
use sdset::{Set, SetBuf, SetOperation};
|
use sdset::{Set, SetBuf, SetOperation};
|
||||||
use slice_group_by::StrGroupBy;
|
use slice_group_by::StrGroupBy;
|
||||||
use itertools::{EitherOrBoth, merge_join_by};
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
|
||||||
|
|
||||||
use crate::database::MainT;
|
use crate::database::MainT;
|
||||||
use crate::{store, DocumentId, DocIndex, MResult};
|
use crate::{store, DocumentId, DocIndex, MResult};
|
||||||
@ -183,8 +184,7 @@ pub fn create_query_tree(
|
|||||||
query: &str,
|
query: &str,
|
||||||
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
||||||
{
|
{
|
||||||
let query = query.to_lowercase();
|
let words = split_query_string(query).map(str::to_lowercase);
|
||||||
let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned);
|
|
||||||
let words: Vec<_> = words.filter(|s| !s.contains(char::is_whitespace)).enumerate().collect();
|
let words: Vec<_> = words.filter(|s| !s.contains(char::is_whitespace)).enumerate().collect();
|
||||||
|
|
||||||
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
||||||
@ -270,14 +270,22 @@ pub fn create_query_tree(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mapping = mapper.mapping();
|
|
||||||
let operation = create_operation(ngrams, Operation::Or);
|
let operation = create_operation(ngrams, Operation::Or);
|
||||||
|
let mapping = mapper.mapping();
|
||||||
|
|
||||||
Ok((operation, mapping))
|
Ok((operation, mapping))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct PostingsKey<'o> {
|
||||||
|
pub query: &'o Query,
|
||||||
|
pub input: Vec<u8>,
|
||||||
|
pub distance: u8,
|
||||||
|
pub is_exact: bool,
|
||||||
|
}
|
||||||
|
|
||||||
pub type Distance = u8;
|
pub type Distance = u8;
|
||||||
pub type Postings<'o, 'txn> = HashMap<(&'o Query, Vec<u8>, Distance), Cow<'txn, Set<DocIndex>>>;
|
pub type Postings<'o, 'txn> = HashMap<PostingsKey<'o>, Cow<'txn, Set<DocIndex>>>;
|
||||||
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
|
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
|
||||||
|
|
||||||
pub struct QueryResult<'o, 'txn> {
|
pub struct QueryResult<'o, 'txn> {
|
||||||
@ -392,18 +400,18 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
|
|
||||||
let mut docids = Vec::new();
|
let mut docids = Vec::new();
|
||||||
|
|
||||||
// We retrieve the cached postings list for all
|
// We retrieve the cached postings lists for all
|
||||||
// the words that starts with this short prefix.
|
// the words that starts with this short prefix.
|
||||||
let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
|
let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
|
||||||
let distance = 0;
|
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: false };
|
||||||
postings.insert((query, word.clone().into_bytes(), distance), result.matches);
|
postings.insert(key, result.matches);
|
||||||
docids.extend_from_slice(&result.docids);
|
docids.extend_from_slice(&result.docids);
|
||||||
|
|
||||||
// We retrieve the exact postings list for the prefix,
|
// We retrieve the exact postings list for the prefix,
|
||||||
// because we must consider these matches as exact.
|
// because we must consider these matches as exact.
|
||||||
if let Some(result) = ctx.postings_lists.postings_list(reader, word.as_bytes())? {
|
if let Some(result) = ctx.postings_lists.postings_list(reader, word.as_bytes())? {
|
||||||
let distance = 0;
|
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: true };
|
||||||
postings.insert((query, word.clone().into_bytes(), distance), result.matches);
|
postings.insert(key, result.matches);
|
||||||
docids.extend_from_slice(&result.docids);
|
docids.extend_from_slice(&result.docids);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -426,10 +434,12 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
let before = Instant::now();
|
let before = Instant::now();
|
||||||
let mut docids = Vec::new();
|
let mut docids = Vec::new();
|
||||||
while let Some(input) = stream.next() {
|
while let Some(input) = stream.next() {
|
||||||
let distance = dfa.eval(input).to_u8();
|
|
||||||
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
||||||
|
let distance = dfa.eval(input).to_u8();
|
||||||
|
let is_exact = *prefix == false && distance == 0 && input.len() == word.len();
|
||||||
docids.extend_from_slice(&result.docids);
|
docids.extend_from_slice(&result.docids);
|
||||||
postings.insert((query, input.to_owned(), distance), result.matches);
|
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
|
||||||
|
postings.insert(key, result.matches);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
println!("{:3$}docids extend ({:?}) took {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
println!("{:3$}docids extend ({:?}) took {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
||||||
@ -454,10 +464,11 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
|
|
||||||
let mut docids = Vec::new();
|
let mut docids = Vec::new();
|
||||||
while let Some(input) = stream.next() {
|
while let Some(input) = stream.next() {
|
||||||
let distance = dfa.eval(input).to_u8();
|
|
||||||
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
||||||
|
let distance = dfa.eval(input).to_u8();
|
||||||
docids.extend_from_slice(&result.docids);
|
docids.extend_from_slice(&result.docids);
|
||||||
postings.insert((query, input.to_owned(), distance), result.matches);
|
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: true };
|
||||||
|
postings.insert(key, result.matches);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -491,8 +502,8 @@ pub fn traverse_query_tree<'o, 'txn>(
|
|||||||
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
||||||
|
|
||||||
let matches = Cow::Owned(SetBuf::new(matches).unwrap());
|
let matches = Cow::Owned(SetBuf::new(matches).unwrap());
|
||||||
let distance = 0;
|
let key = PostingsKey { query, input: vec![], distance: 0, is_exact: true };
|
||||||
postings.insert((query, vec![], distance), matches);
|
postings.insert(key, matches);
|
||||||
|
|
||||||
Cow::Owned(docids)
|
Cow::Owned(docids)
|
||||||
} else {
|
} else {
|
||||||
|
Loading…
Reference in New Issue
Block a user