From 3912d1ec4b39bb7cc71c1e3e5f12453074ca2b50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 16 Jan 2020 14:11:17 +0100
Subject: [PATCH] Improve query parsing and interpretation

---
 meilisearch-core/src/bucket_sort.rs |  7 ++---
 meilisearch-core/src/query_tree.rs  | 45 ++++++++++++++++++-----------
 2 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index bebfa5a5f..bd3aac6fd 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -28,7 +28,7 @@ use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::raw_document::RawDocument;
 use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
 use crate::{store, Document, DocumentId, MResult};
-use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult};
+use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult, PostingsKey};
 use crate::query_tree::Context as QTContext;
 use crate::store::Postings;
 
@@ -98,7 +98,7 @@ where
     let mut bare_matches = Vec::new();
     mk_arena!(arena);
 
-    for ((query, input, distance), matches) in queries {
+    for (PostingsKey{ query, input, distance, is_exact }, matches) in queries {
 
         let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
         let pllen = postings_list_view.len() as f32;
@@ -115,7 +115,7 @@ where
                 document_id,
                 query_index: query.id,
                 distance,
-                is_exact: true, // TODO where can I find this info?
+                is_exact,
                 postings_list: posting_list_index,
             };
 
@@ -166,7 +166,6 @@ where
     debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
 
     let before_raw_documents_building = Instant::now();
-    let mut prefiltered_documents = 0;
    let mut raw_documents = Vec::new();
     for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
         let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
index 079c2c0eb..d3a1ad0ec 100644
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@@ -5,10 +5,11 @@ use std::ops::Range;
 use std::time::Instant;
 use std::{cmp, fmt, iter::once};
 
+use fst::{IntoStreamer, Streamer};
+use itertools::{EitherOrBoth, merge_join_by};
+use meilisearch_tokenizer::split_query_string;
 use sdset::{Set, SetBuf, SetOperation};
 use slice_group_by::StrGroupBy;
-use itertools::{EitherOrBoth, merge_join_by};
-use fst::{IntoStreamer, Streamer};
 
 use crate::database::MainT;
 use crate::{store, DocumentId, DocIndex, MResult};
@@ -183,8 +184,7 @@ pub fn create_query_tree(
     query: &str,
 ) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
 {
-    let query = query.to_lowercase();
-    let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned);
+    let words = split_query_string(query).map(str::to_lowercase);
     let words: Vec<_> = words.filter(|s| !s.contains(char::is_whitespace)).enumerate().collect();
 
     let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
@@ -270,14 +270,22 @@
         }
     }
 
-    let mapping = mapper.mapping();
     let operation = create_operation(ngrams, Operation::Or);
+    let mapping = mapper.mapping();
 
     Ok((operation, mapping))
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct PostingsKey<'o> {
+    pub query: &'o Query,
+    pub input: Vec<u8>,
+    pub distance: u8,
+    pub is_exact: bool,
+}
+
 pub type Distance = u8;
-pub type Postings<'o, 'txn> = HashMap<(&'o Query, Vec<u8>, Distance), Cow<'txn, Set<DocIndex>>>;
+pub type Postings<'o, 'txn> = HashMap<PostingsKey<'o>, Cow<'txn, Set<DocIndex>>>;
 pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
 
 pub struct QueryResult<'o, 'txn> {
@@ -392,18 +400,18 @@ pub fn traverse_query_tree<'o, 'txn>(
 
                     let mut docids = Vec::new();
 
-                    // We retrieve the cached postings list for all
+                    // We retrieve the cached postings lists for all
                     // the words that starts with this short prefix.
                     let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
-                    let distance = 0;
-                    postings.insert((query, word.clone().into_bytes(), distance), result.matches);
+                    let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: false };
+                    postings.insert(key, result.matches);
                     docids.extend_from_slice(&result.docids);
 
                     // We retrieve the exact postings list for the prefix,
                     // because we must consider these matches as exact.
                     if let Some(result) = ctx.postings_lists.postings_list(reader, word.as_bytes())? {
-                        let distance = 0;
-                        postings.insert((query, word.clone().into_bytes(), distance), result.matches);
+                        let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: true };
+                        postings.insert(key, result.matches);
                         docids.extend_from_slice(&result.docids);
                     }
 
@@ -426,10 +434,12 @@ pub fn traverse_query_tree<'o, 'txn>(
                     let before = Instant::now();
                     let mut docids = Vec::new();
                     while let Some(input) = stream.next() {
-                        let distance = dfa.eval(input).to_u8();
                         if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
+                            let distance = dfa.eval(input).to_u8();
+                            let is_exact = *prefix == false && distance == 0 && input.len() == word.len();
                             docids.extend_from_slice(&result.docids);
-                            postings.insert((query, input.to_owned(), distance), result.matches);
+                            let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
+                            postings.insert(key, result.matches);
                         }
                     }
                     println!("{:3$}docids extend ({:?}) took {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
@@ -454,10 +464,11 @@ pub fn traverse_query_tree<'o, 'txn>(
 
                 let mut docids = Vec::new();
                 while let Some(input) = stream.next() {
-                    let distance = dfa.eval(input).to_u8();
                     if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
+                        let distance = dfa.eval(input).to_u8();
                         docids.extend_from_slice(&result.docids);
-                        postings.insert((query, input.to_owned(), distance), result.matches);
+                        let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: true };
+                        postings.insert(key, result.matches);
                     }
                 }
 
@@ -491,8 +502,8 @@ pub fn traverse_query_tree<'o, 'txn>(
                 println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
 
                 let matches = Cow::Owned(SetBuf::new(matches).unwrap());
-                let distance = 0;
-                postings.insert((query, vec![], distance), matches);
+                let key = PostingsKey { query, input: vec![], distance: 0, is_exact: true };
+                postings.insert(key, matches);
 
                 Cow::Owned(docids)
             } else {