mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 05:14:27 +01:00
Merge pull request #97 from meilisearch/criteria
Introduce all the criteria
This commit is contained in:
commit
2924ed31f3
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -866,6 +866,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"byte-unit",
|
||||
"heed",
|
||||
"jemallocator",
|
||||
"milli",
|
||||
"stderrlog",
|
||||
"structopt",
|
||||
|
@ -32,7 +32,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use milli::facet::FacetValue;
|
||||
use milli::update::UpdateIndexingStep::*;
|
||||
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
|
||||
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
|
||||
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition};
|
||||
|
||||
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
|
||||
|
||||
@ -132,7 +132,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
|
||||
Self { analyzer }
|
||||
}
|
||||
|
||||
fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value {
|
||||
fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value {
|
||||
match value {
|
||||
Value::Null => Value::Null,
|
||||
Value::Bool(boolean) => Value::Bool(boolean),
|
||||
@ -142,7 +142,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
|
||||
let analyzed = self.analyzer.analyze(&old_string);
|
||||
for (word, token) in analyzed.reconstruct() {
|
||||
if token.is_word() {
|
||||
let to_highlight = words_to_highlight.contains(token.text());
|
||||
let to_highlight = matching_words.matches(token.text());
|
||||
if to_highlight { string.push_str("<mark>") }
|
||||
string.push_str(word);
|
||||
if to_highlight { string.push_str("</mark>") }
|
||||
@ -154,12 +154,12 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
|
||||
},
|
||||
Value::Array(values) => {
|
||||
Value::Array(values.into_iter()
|
||||
.map(|v| self.highlight_value(v, words_to_highlight))
|
||||
.map(|v| self.highlight_value(v, matching_words))
|
||||
.collect())
|
||||
},
|
||||
Value::Object(object) => {
|
||||
Value::Object(object.into_iter()
|
||||
.map(|(k, v)| (k, self.highlight_value(v, words_to_highlight)))
|
||||
.map(|(k, v)| (k, self.highlight_value(v, matching_words)))
|
||||
.collect())
|
||||
},
|
||||
}
|
||||
@ -168,14 +168,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
|
||||
fn highlight_record(
|
||||
&self,
|
||||
object: &mut Map<String, Value>,
|
||||
words_to_highlight: &HashSet<String>,
|
||||
matching_words: &MatchingWords,
|
||||
attributes_to_highlight: &HashSet<String>,
|
||||
) {
|
||||
// TODO do we need to create a string for element that are not and needs to be highlight?
|
||||
for (key, value) in object.iter_mut() {
|
||||
if attributes_to_highlight.contains(key) {
|
||||
let old_value = mem::take(value);
|
||||
*value = self.highlight_value(old_value, words_to_highlight);
|
||||
*value = self.highlight_value(old_value, matching_words);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -722,7 +722,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
search.facet_condition(condition);
|
||||
}
|
||||
|
||||
let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap();
|
||||
let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap();
|
||||
|
||||
let number_of_candidates = candidates.len();
|
||||
let facets = if query.facet_distribution == Some(true) {
|
||||
@ -748,7 +748,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
|
||||
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
|
||||
if !disable_highlighting {
|
||||
highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight);
|
||||
highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight);
|
||||
}
|
||||
|
||||
documents.push(object);
|
||||
|
@ -598,7 +598,7 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect();
|
||||
|
||||
let iter: Box<Iterator<Item = _>> = if internal_ids.is_empty() {
|
||||
let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
|
||||
Box::new(index.documents.iter(rtxn)?.map(|result| {
|
||||
result.map(|(_id, obkv)| obkv)
|
||||
}))
|
||||
|
@ -3,8 +3,6 @@
|
||||
mod criterion;
|
||||
mod external_documents_ids;
|
||||
mod fields_ids_map;
|
||||
mod mdfs;
|
||||
mod query_tokens;
|
||||
mod search;
|
||||
mod update_store;
|
||||
pub mod facet;
|
||||
@ -28,7 +26,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
|
||||
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
|
||||
pub use self::update_store::UpdateStore;
|
||||
|
||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||
|
@ -1,163 +0,0 @@
|
||||
use std::collections::hash_map::Entry::{Occupied, Vacant};
|
||||
use std::collections::HashMap;
|
||||
use std::mem;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
use crate::Index;
|
||||
|
||||
/// A mana depth first search implementation.
|
||||
pub struct Mdfs<'a> {
|
||||
index: &'a Index,
|
||||
rtxn: &'a heed::RoTxn<'a>,
|
||||
words: &'a [(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
|
||||
union_cache: HashMap<(usize, u8), RoaringBitmap>,
|
||||
candidates: RoaringBitmap,
|
||||
mana: u32,
|
||||
max_mana: u32,
|
||||
}
|
||||
|
||||
impl<'a> Mdfs<'a> {
|
||||
pub fn new(
|
||||
index: &'a Index,
|
||||
rtxn: &'a heed::RoTxn,
|
||||
words: &'a [(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
|
||||
candidates: RoaringBitmap,
|
||||
) -> Mdfs<'a>
|
||||
{
|
||||
// Compute the number of pairs (windows) we have for this list of words.
|
||||
let mana = words.len().saturating_sub(1) as u32;
|
||||
let max_mana = mana * 8;
|
||||
Mdfs { index, rtxn, words, union_cache: HashMap::new(), candidates, mana, max_mana }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Mdfs<'a> {
|
||||
type Item = anyhow::Result<(u32, RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// If there is less or only one word therefore the only
|
||||
// possible documents that we can return are the candidates.
|
||||
if self.words.len() <= 1 {
|
||||
if self.candidates.is_empty() { return None }
|
||||
return Some(Ok((0, mem::take(&mut self.candidates))));
|
||||
}
|
||||
|
||||
while self.mana <= self.max_mana {
|
||||
let mut answer = RoaringBitmap::new();
|
||||
let result = mdfs_step(
|
||||
&self.index,
|
||||
&self.rtxn,
|
||||
self.mana,
|
||||
self.words,
|
||||
&self.candidates,
|
||||
&self.candidates,
|
||||
&mut self.union_cache,
|
||||
&mut answer,
|
||||
);
|
||||
|
||||
match result {
|
||||
Ok(()) => {
|
||||
// We always increase the mana for the next loop.
|
||||
let proximity = self.mana;
|
||||
self.mana += 1;
|
||||
|
||||
// If no documents were found we must not return and continue
|
||||
// the search with more mana.
|
||||
if !answer.is_empty() {
|
||||
|
||||
// We remove the answered documents from the list of
|
||||
// candidates to be sure we don't search for them again.
|
||||
self.candidates.difference_with(&answer);
|
||||
|
||||
// We return the answer.
|
||||
return Some(Ok((proximity, answer)));
|
||||
}
|
||||
},
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn mdfs_step(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
mana: u32,
|
||||
words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
|
||||
candidates: &RoaringBitmap,
|
||||
parent_docids: &RoaringBitmap,
|
||||
union_cache: &mut HashMap<(usize, u8), RoaringBitmap>,
|
||||
answer: &mut RoaringBitmap,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
use std::cmp::{min, max};
|
||||
|
||||
let (words1, words2) = (&words[0].0, &words[1].0);
|
||||
let pairs = words_pair_combinations(words1, words2);
|
||||
let tail = &words[1..];
|
||||
let nb_children = tail.len() as u32 - 1;
|
||||
|
||||
// The minimum amount of mana that you must consume is at least 1 and the
|
||||
// amount of mana that your children can consume. Because the last child must
|
||||
// consume the remaining mana, it is mandatory that there not too much at the end.
|
||||
let min_proximity = max(1, mana.saturating_sub(nb_children * 8)) as u8;
|
||||
|
||||
// The maximum amount of mana that you can use is 8 or the remaining amount of
|
||||
// mana minus your children, as you can't just consume all the mana,
|
||||
// your children must have at least 1 mana.
|
||||
let max_proximity = min(8, mana - nb_children) as u8;
|
||||
|
||||
for proximity in min_proximity..=max_proximity {
|
||||
let mut docids = match union_cache.entry((words.len(), proximity)) {
|
||||
Occupied(entry) => entry.get().clone(),
|
||||
Vacant(entry) => {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
if proximity == 8 {
|
||||
docids = candidates.clone();
|
||||
} else {
|
||||
for (w1, w2) in pairs.iter().cloned() {
|
||||
let key = (w1, w2, proximity);
|
||||
if let Some(di) = index.word_pair_proximity_docids.get(rtxn, &key)? {
|
||||
docids.union_with(&di);
|
||||
}
|
||||
}
|
||||
}
|
||||
entry.insert(docids).clone()
|
||||
}
|
||||
};
|
||||
|
||||
// We must be sure that we only return docids that are present in the candidates.
|
||||
docids.intersect_with(parent_docids);
|
||||
|
||||
if !docids.is_empty() {
|
||||
let mana = mana.checked_sub(proximity as u32).unwrap();
|
||||
if tail.len() < 2 {
|
||||
// We are the last pair, we return without recuring as we don't have any child.
|
||||
answer.union_with(&docids);
|
||||
return Ok(());
|
||||
} else {
|
||||
return mdfs_step(index, rtxn, mana, tail, candidates, &docids, union_cache, answer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn words_pair_combinations<'h>(
|
||||
w1: &'h HashMap<String, (u8, RoaringBitmap)>,
|
||||
w2: &'h HashMap<String, (u8, RoaringBitmap)>,
|
||||
) -> Vec<(&'h str, &'h str)>
|
||||
{
|
||||
let mut pairs = Vec::new();
|
||||
for (w1, (_typos, docids1)) in w1 {
|
||||
for (w2, (_typos, docids2)) in w2 {
|
||||
if !docids1.is_disjoint(&docids2) {
|
||||
pairs.push((w1.as_str(), w2.as_str()));
|
||||
}
|
||||
}
|
||||
}
|
||||
pairs
|
||||
}
|
@ -1,217 +0,0 @@
|
||||
use meilisearch_tokenizer::{Token, TokenKind};
|
||||
|
||||
/// Whether the tokens currently being read are inside a quoted section.
#[derive(Debug)]
enum State {
    Free,
    Quoted,
}

impl State {
    /// Toggles between `Free` and `Quoted`.
    fn swap(&mut self) {
        *self = match self {
            State::Free => State::Quoted,
            State::Quoted => State::Free,
        };
    }
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum QueryToken<'a> {
|
||||
Free(Token<'a>),
|
||||
Quoted(Token<'a>),
|
||||
}
|
||||
|
||||
pub fn query_tokens<'a>(mut tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = QueryToken<'a>> {
|
||||
let mut state = State::Free;
|
||||
let f = move || {
|
||||
loop {
|
||||
let token = tokens.next()?;
|
||||
match token.kind() {
|
||||
_ if token.text().trim() == "\"" => state.swap(),
|
||||
TokenKind::Word => {
|
||||
let token = match state {
|
||||
State::Quoted => QueryToken::Quoted(token),
|
||||
State::Free => QueryToken::Free(token),
|
||||
};
|
||||
return Some(token);
|
||||
},
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
};
|
||||
std::iter::from_fn(f)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use QueryToken::{Quoted, Free};
    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
    use fst::Set;

    /// Asserts that a `QueryToken` has the expected variant and text.
    macro_rules! assert_eq_query_token {
        ($test:expr, Quoted($val:literal)) => {
            match $test {
                Free(token) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, token.text()),
                Quoted(token) => assert_eq!(token.text(), $val),
            }
        };

        ($test:expr, Free($val:literal)) => {
            match $test {
                Free(token) => assert_eq!(token.text(), $val),
                Quoted(token) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, token.text()),
            }
        };
    }

    /// Analyzes the query with the default analyzer (no stop words) and checks
    /// that `query_tokens` yields exactly the listed tokens, in order.
    macro_rules! assert_query_tokens {
        ($query:expr, [$($kind:ident($val:tt)),* $(,)?]) => {{
            let stop_words = Set::default();
            let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
            let analyzed = analyzer.analyze($query);
            let mut iter = query_tokens(analyzed.tokens());
            $( assert_eq_query_token!(iter.next().unwrap(), $kind($val)); )*
            assert!(iter.next().is_none());
        }};
    }

    #[test]
    fn empty() {
        assert_query_tokens!("", []);
        assert_query_tokens!(" ", []);
    }

    #[test]
    fn one_quoted_string() {
        assert_query_tokens!("\"hello\"", [Quoted("hello")]);
    }

    #[test]
    fn one_pending_quoted_string() {
        assert_query_tokens!("\"hello", [Quoted("hello")]);
    }

    #[test]
    fn one_non_quoted_string() {
        assert_query_tokens!("hello", [Free("hello")]);
    }

    #[test]
    fn quoted_directly_followed_by_free_strings() {
        assert_query_tokens!("\"hello\"world", [Quoted("hello"), Free("world")]);
    }

    #[test]
    fn free_directly_followed_by_quoted_strings() {
        assert_query_tokens!("hello\"world\"", [Free("hello"), Quoted("world")]);
    }

    #[test]
    fn free_followed_by_quoted_strings() {
        assert_query_tokens!("hello \"world\"", [Free("hello"), Quoted("world")]);
    }

    #[test]
    fn multiple_spaces_separated_strings() {
        assert_query_tokens!("hello world ", [Free("hello"), Free("world")]);
    }

    #[test]
    fn multi_interleaved_quoted_free_strings() {
        assert_query_tokens!(
            "hello \"world\" coucou \"monde\"",
            [Free("hello"), Quoted("world"), Free("coucou"), Quoted("monde")]
        );
    }

    #[test]
    fn multi_quoted_strings() {
        assert_query_tokens!(
            "\"hello world\" coucou \"monde est beau\"",
            [
                Quoted("hello"),
                Quoted("world"),
                Free("coucou"),
                Quoted("monde"),
                Quoted("est"),
                Quoted("beau"),
            ]
        );
    }

    #[test]
    fn chinese() {
        assert_query_tokens!("汽车男生", [Free("汽车"), Free("男生")]);
    }
}
|
282
milli/src/search/criteria/asc_desc.rs
Normal file
282
milli/src/search/criteria/asc_desc.rs
Normal file
@ -0,0 +1,282 @@
|
||||
use std::collections::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use anyhow::bail;
|
||||
use itertools::Itertools;
|
||||
use log::debug;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec};
|
||||
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
|
||||
use crate::search::facet::FacetIter;
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::{FieldId, Index};
|
||||
use super::{Criterion, CriterionResult};
|
||||
|
||||
pub struct AscDesc<'t> {
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
ascending: bool,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: RoaringBitmap,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
faceted_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
}
|
||||
|
||||
impl<'t> AscDesc<'t> {
|
||||
pub fn initial_asc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, true)
|
||||
}
|
||||
|
||||
pub fn initial_desc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, false)
|
||||
}
|
||||
|
||||
pub fn asc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
Self::new(index, rtxn, parent, field_id, facet_type, true)
|
||||
}
|
||||
|
||||
pub fn desc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
Self::new(index, rtxn, parent, field_id, facet_type, false)
|
||||
}
|
||||
|
||||
fn initial(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
ascending: bool,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?;
|
||||
let candidates = match &query_tree {
|
||||
Some(qt) => {
|
||||
let context = CriteriaBuilder::new(rtxn, index)?;
|
||||
let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new())?;
|
||||
if let Some(candidates) = candidates {
|
||||
qt_candidates.intersect_with(&candidates);
|
||||
}
|
||||
qt_candidates
|
||||
},
|
||||
None => candidates.unwrap_or(faceted_candidates.clone()),
|
||||
};
|
||||
|
||||
Ok(AscDesc {
|
||||
index,
|
||||
rtxn,
|
||||
field_id,
|
||||
facet_type,
|
||||
ascending,
|
||||
query_tree,
|
||||
candidates,
|
||||
faceted_candidates,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn new(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
ascending: bool,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
Ok(AscDesc {
|
||||
index,
|
||||
rtxn,
|
||||
field_id,
|
||||
facet_type,
|
||||
ascending,
|
||||
query_tree: None,
|
||||
candidates: RoaringBitmap::new(),
|
||||
faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for AscDesc<'t> {
|
||||
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
|
||||
loop {
|
||||
debug!("Facet {} iteration ({:?})",
|
||||
if self.ascending { "Asc" } else { "Desc" }, self.candidates,
|
||||
);
|
||||
|
||||
match &mut self.candidates {
|
||||
candidates if candidates.is_empty() => {
|
||||
let query_tree = self.query_tree.take();
|
||||
let candidates = take(&mut self.candidates);
|
||||
let bucket_candidates = take(&mut self.bucket_candidates);
|
||||
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next()? {
|
||||
Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => {
|
||||
self.query_tree = query_tree;
|
||||
candidates.intersect_with(&self.faceted_candidates);
|
||||
self.candidates = candidates;
|
||||
self.bucket_candidates = bucket_candidates;
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => if query_tree.is_none() && bucket_candidates.is_empty() {
|
||||
return Ok(None)
|
||||
},
|
||||
}
|
||||
|
||||
return Ok(Some(CriterionResult { query_tree, candidates, bucket_candidates }));
|
||||
},
|
||||
candidates => {
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => candidates.clone(),
|
||||
};
|
||||
|
||||
let found_candidates = facet_ordered(
|
||||
self.index,
|
||||
self.rtxn,
|
||||
self.field_id,
|
||||
self.facet_type,
|
||||
self.ascending,
|
||||
candidates.clone(),
|
||||
)?;
|
||||
|
||||
candidates.difference_with(&found_candidates);
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: found_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn facet_ordered(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
ascending: bool,
|
||||
candidates: RoaringBitmap,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
match facet_type {
|
||||
FacetType::Float => {
|
||||
if candidates.len() <= 1000 {
|
||||
let db = index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
|
||||
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
|
||||
for docid in candidates.iter() {
|
||||
let left = (field_id, docid, f64::MIN);
|
||||
let right = (field_id, docid, f64::MAX);
|
||||
let mut iter = db.range(rtxn, &(left..=right))?;
|
||||
let entry = if ascending { iter.next() } else { iter.last() };
|
||||
if let Some(((_, _, value), ())) = entry.transpose()? {
|
||||
docids_values.push((docid, OrderedFloat(value)));
|
||||
}
|
||||
}
|
||||
docids_values.sort_unstable_by_key(|(_, value)| *value);
|
||||
let iter = docids_values.into_iter();
|
||||
let iter = if ascending {
|
||||
Box::new(iter) as Box<dyn Iterator<Item = _>>
|
||||
} else {
|
||||
Box::new(iter.rev())
|
||||
};
|
||||
match iter.group_by(|(_, v)| *v).into_iter().next() {
|
||||
Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()),
|
||||
None => Ok(RoaringBitmap::new())
|
||||
}
|
||||
} else {
|
||||
let facet_fn = if ascending {
|
||||
FacetIter::<f64, FacetLevelValueF64Codec>::new_reducing
|
||||
} else {
|
||||
FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse_reducing
|
||||
};
|
||||
|
||||
let mut iter = facet_fn(rtxn, index, field_id, candidates)?;
|
||||
Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default())
|
||||
}
|
||||
},
|
||||
FacetType::Integer => {
|
||||
if candidates.len() <= 1000 {
|
||||
let db = index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetI64Codec>();
|
||||
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
|
||||
for docid in candidates.iter() {
|
||||
let left = (field_id, docid, i64::MIN);
|
||||
let right = (field_id, docid, i64::MAX);
|
||||
let mut iter = db.range(rtxn, &(left..=right))?;
|
||||
let entry = if ascending { iter.next() } else { iter.last() };
|
||||
if let Some(((_, _, value), ())) = entry.transpose()? {
|
||||
docids_values.push((docid, value));
|
||||
}
|
||||
}
|
||||
docids_values.sort_unstable_by_key(|(_, value)| *value);
|
||||
let iter = docids_values.into_iter();
|
||||
let iter = if ascending {
|
||||
Box::new(iter) as Box<dyn Iterator<Item = _>>
|
||||
} else {
|
||||
Box::new(iter.rev())
|
||||
};
|
||||
match iter.group_by(|(_, v)| *v).into_iter().next() {
|
||||
Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()),
|
||||
None => Ok(RoaringBitmap::new())
|
||||
}
|
||||
} else {
|
||||
let facet_fn = if ascending {
|
||||
FacetIter::<i64, FacetLevelValueI64Codec>::new_reducing
|
||||
} else {
|
||||
FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse_reducing
|
||||
};
|
||||
|
||||
let mut iter = facet_fn(rtxn, index, field_id, candidates)?;
|
||||
Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default())
|
||||
}
|
||||
},
|
||||
FacetType::String => bail!("criteria facet type must be a number"),
|
||||
}
|
||||
}
|
113
milli/src/search/criteria/fetcher.rs
Normal file
113
milli/src/search/criteria/fetcher.rs
Normal file
@ -0,0 +1,113 @@
|
||||
use std::collections::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::query_tree::Operation;
|
||||
use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context};
|
||||
|
||||
pub struct Fetcher<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Candidates,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
should_get_documents_ids: bool,
|
||||
}
|
||||
|
||||
impl<'t> Fetcher<'t> {
|
||||
pub fn initial(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
) -> Self
|
||||
{
|
||||
Fetcher {
|
||||
ctx,
|
||||
query_tree,
|
||||
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
|
||||
parent: None,
|
||||
should_get_documents_ids: true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
ctx: &'t dyn Context,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
) -> Self
|
||||
{
|
||||
Fetcher {
|
||||
ctx,
|
||||
query_tree: None,
|
||||
candidates: Candidates::default(),
|
||||
parent: Some(parent),
|
||||
should_get_documents_ids: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Fetcher<'t> {
|
||||
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
|
||||
use Candidates::{Allowed, Forbidden};
|
||||
loop {
|
||||
debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})",
|
||||
self.should_get_documents_ids, self.candidates,
|
||||
);
|
||||
|
||||
let should_get_documents_ids = take(&mut self.should_get_documents_ids);
|
||||
match &mut self.candidates {
|
||||
Allowed(_) => {
|
||||
let candidates = take(&mut self.candidates).into_inner();
|
||||
let candidates = match &self.query_tree {
|
||||
Some(qt) if should_get_documents_ids => {
|
||||
let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?;
|
||||
docids.intersect_with(&candidates);
|
||||
docids
|
||||
},
|
||||
_ => candidates,
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
Forbidden(_) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next()? {
|
||||
Some(result) => return Ok(Some(result)),
|
||||
None => if should_get_documents_ids {
|
||||
let candidates = match &self.query_tree {
|
||||
Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?,
|
||||
None => self.ctx.documents_ids()?,
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
}
|
||||
},
|
||||
None => if should_get_documents_ids {
|
||||
let candidates = match &self.query_tree {
|
||||
Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?,
|
||||
None => self.ctx.documents_ids()?,
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
}
|
||||
return Ok(None);
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
483
milli/src/search/criteria/mod.rs
Normal file
483
milli/src/search/criteria/mod.rs
Normal file
@ -0,0 +1,483 @@
|
||||
use std::collections::HashMap;
|
||||
use std::borrow::Cow;
|
||||
|
||||
use anyhow::{bail, Context as _};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::search::word_derivations;
|
||||
use crate::{Index, FieldId};
|
||||
|
||||
use super::query_tree::{Operation, Query, QueryKind};
|
||||
use self::typo::Typo;
|
||||
use self::words::Words;
|
||||
use self::asc_desc::AscDesc;
|
||||
use self::proximity::Proximity;
|
||||
use self::fetcher::Fetcher;
|
||||
|
||||
pub mod typo;
|
||||
pub mod words;
|
||||
pub mod asc_desc;
|
||||
pub mod proximity;
|
||||
pub mod fetcher;
|
||||
|
||||
pub trait Criterion {
|
||||
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>>;
|
||||
}
|
||||
|
||||
/// The result of a call to the parent criterion.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct CriterionResult {
|
||||
/// The query tree that must be used by the children criterion to fetch candidates.
|
||||
pub query_tree: Option<Operation>,
|
||||
/// The candidates that this criterion is allowed to return subsets of.
|
||||
pub candidates: RoaringBitmap,
|
||||
/// Candidates that comes from the current bucket of the initial criterion.
|
||||
pub bucket_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
/// Either a set of candidates that defines the candidates
|
||||
/// that are allowed to be returned,
|
||||
/// or the candidates that must never be returned.
|
||||
#[derive(Debug)]
|
||||
enum Candidates {
|
||||
Allowed(RoaringBitmap),
|
||||
Forbidden(RoaringBitmap)
|
||||
}
|
||||
|
||||
impl Candidates {
|
||||
fn into_inner(self) -> RoaringBitmap {
|
||||
match self {
|
||||
Self::Allowed(inner) => inner,
|
||||
Self::Forbidden(inner) => inner,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Candidates {
|
||||
fn default() -> Self {
|
||||
Self::Forbidden(RoaringBitmap::new())
|
||||
}
|
||||
}
|
||||
pub trait Context {
|
||||
fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
|
||||
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
||||
fn in_prefix_cache(&self, word: &str) -> bool;
|
||||
}
|
||||
pub struct CriteriaBuilder<'t> {
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
index: &'t Index,
|
||||
words_fst: fst::Set<Cow<'t, [u8]>>,
|
||||
words_prefixes_fst: fst::Set<Cow<'t, [u8]>>,
|
||||
}
|
||||
|
||||
impl<'a> Context for CriteriaBuilder<'a> {
|
||||
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
|
||||
self.index.documents_ids(self.rtxn)
|
||||
}
|
||||
|
||||
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
|
||||
self.index.word_docids.get(self.rtxn, &word)
|
||||
}
|
||||
|
||||
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
|
||||
self.index.word_prefix_docids.get(self.rtxn, &word)
|
||||
}
|
||||
|
||||
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left, right, proximity);
|
||||
self.index.word_pair_proximity_docids.get(self.rtxn, &key)
|
||||
}
|
||||
|
||||
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left, right, proximity);
|
||||
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)
|
||||
}
|
||||
|
||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
|
||||
&self.words_fst
|
||||
}
|
||||
|
||||
fn in_prefix_cache(&self, word: &str) -> bool {
|
||||
self.words_prefixes_fst.contains(word)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> CriteriaBuilder<'t> {
|
||||
pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result<Self> {
|
||||
let words_fst = index.words_fst(rtxn)?;
|
||||
let words_prefixes_fst = index.words_prefixes_fst(rtxn)?;
|
||||
Ok(Self { rtxn, index, words_fst, words_prefixes_fst })
|
||||
}
|
||||
|
||||
pub fn build(
|
||||
&'t self,
|
||||
mut query_tree: Option<Operation>,
|
||||
mut facet_candidates: Option<RoaringBitmap>,
|
||||
) -> anyhow::Result<Fetcher<'t>>
|
||||
{
|
||||
use crate::criterion::Criterion as Name;
|
||||
|
||||
let fields_ids_map = self.index.fields_ids_map(&self.rtxn)?;
|
||||
let faceted_fields = self.index.faceted_fields(&self.rtxn)?;
|
||||
let field_id_facet_type = |field: &str| -> anyhow::Result<(FieldId, FacetType)> {
|
||||
let id = fields_ids_map.id(field).with_context(|| {
|
||||
format!("field {:?} isn't registered", field)
|
||||
})?;
|
||||
let facet_type = faceted_fields.get(field).with_context(|| {
|
||||
format!("field {:?} isn't faceted", field)
|
||||
})?;
|
||||
Ok((id, *facet_type))
|
||||
};
|
||||
|
||||
let mut criterion = None as Option<Box<dyn Criterion>>;
|
||||
for name in self.index.criteria(&self.rtxn)? {
|
||||
criterion = Some(match criterion.take() {
|
||||
Some(father) => match name {
|
||||
Name::Typo => Box::new(Typo::new(self, father)),
|
||||
Name::Words => Box::new(Words::new(self, father)),
|
||||
Name::Proximity => Box::new(Proximity::new(self, father)),
|
||||
Name::Asc(field) => {
|
||||
let (id, facet_type) = field_id_facet_type(&field)?;
|
||||
Box::new(AscDesc::asc(&self.index, &self.rtxn, father, id, facet_type)?)
|
||||
},
|
||||
Name::Desc(field) => {
|
||||
let (id, facet_type) = field_id_facet_type(&field)?;
|
||||
Box::new(AscDesc::desc(&self.index, &self.rtxn, father, id, facet_type)?)
|
||||
},
|
||||
_otherwise => father,
|
||||
},
|
||||
None => match name {
|
||||
Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())),
|
||||
Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())),
|
||||
Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())),
|
||||
Name::Asc(field) => {
|
||||
let (id, facet_type) = field_id_facet_type(&field)?;
|
||||
Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?)
|
||||
},
|
||||
Name::Desc(field) => {
|
||||
let (id, facet_type) = field_id_facet_type(&field)?;
|
||||
Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?)
|
||||
},
|
||||
_otherwise => continue,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
match criterion {
|
||||
Some(criterion) => Ok(Fetcher::new(self, criterion)),
|
||||
None => Ok(Fetcher::initial(self, query_tree, facet_candidates)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn resolve_query_tree<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
fn resolve_operation<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or, Query};
|
||||
|
||||
match query_tree {
|
||||
And(ops) => {
|
||||
let mut ops = ops.iter().map(|op| {
|
||||
resolve_operation(ctx, op, cache)
|
||||
}).collect::<anyhow::Result<Vec<_>>>()?;
|
||||
|
||||
ops.sort_unstable_by_key(|cds| cds.len());
|
||||
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_loop = true;
|
||||
for docids in ops {
|
||||
if first_loop {
|
||||
candidates = docids;
|
||||
first_loop = false;
|
||||
} else {
|
||||
candidates.intersect_with(&docids);
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
Consecutive(ops) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_loop = true;
|
||||
for slice in ops.windows(2) {
|
||||
match (&slice[0], &slice[1]) {
|
||||
(Operation::Query(left), Operation::Query(right)) => {
|
||||
match query_pair_proximity_docids(ctx, left, right, 1)? {
|
||||
pair_docids if pair_docids.is_empty() => {
|
||||
return Ok(RoaringBitmap::new())
|
||||
},
|
||||
pair_docids if first_loop => {
|
||||
candidates = pair_docids;
|
||||
first_loop = false;
|
||||
},
|
||||
pair_docids => {
|
||||
candidates.intersect_with(&pair_docids);
|
||||
},
|
||||
}
|
||||
},
|
||||
_ => bail!("invalid consecutive query type"),
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
Or(_, ops) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
for op in ops {
|
||||
let docids = resolve_operation(ctx, op, cache)?;
|
||||
candidates.union_with(&docids);
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
Query(q) => Ok(query_docids(ctx, q)?),
|
||||
}
|
||||
}
|
||||
|
||||
resolve_operation(ctx, query_tree, cache)
|
||||
}
|
||||
|
||||
|
||||
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
|
||||
ctx: &dyn Context,
|
||||
left_words: &[(T, u8)],
|
||||
right_words: &[(U, u8)],
|
||||
proximity: u8
|
||||
) -> anyhow::Result<RoaringBitmap> {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (left, _l_typo) in left_words {
|
||||
for (right, _r_typo) in right_words {
|
||||
let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default();
|
||||
docids.union_with(¤t_docids);
|
||||
}
|
||||
}
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result<RoaringBitmap> {
|
||||
match &query.kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if query.prefix && ctx.in_prefix_cache(&word) {
|
||||
Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default())
|
||||
} else if query.prefix {
|
||||
let words = word_derivations(&word, true, 0, ctx.words_fst())?;
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (word, _typo) in words {
|
||||
let current_docids = ctx.word_docids(&word)?.unwrap_or_default();
|
||||
docids.union_with(¤t_docids);
|
||||
}
|
||||
Ok(docids)
|
||||
} else {
|
||||
Ok(ctx.word_docids(&word)?.unwrap_or_default())
|
||||
}
|
||||
},
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst())?;
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (word, _typo) in words {
|
||||
let current_docids = ctx.word_docids(&word)?.unwrap_or_default();
|
||||
docids.union_with(¤t_docids);
|
||||
}
|
||||
Ok(docids)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, proximity: u8) -> anyhow::Result<RoaringBitmap> {
|
||||
if proximity >= 8 {
|
||||
let mut candidates = query_docids(ctx, left)?;
|
||||
let right_candidates = query_docids(ctx, right)?;
|
||||
candidates.intersect_with(&right_candidates);
|
||||
return Ok(candidates);
|
||||
}
|
||||
|
||||
let prefix = right.prefix;
|
||||
match (&left.kind, &right.kind) {
|
||||
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
|
||||
if prefix && ctx.in_prefix_cache(&right) {
|
||||
Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default())
|
||||
} else if prefix {
|
||||
let r_words = word_derivations(&right, true, 0, ctx.words_fst())?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
} else {
|
||||
Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default())
|
||||
}
|
||||
},
|
||||
(QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => {
|
||||
let l_words = word_derivations(&left, false, *typo, ctx.words_fst())?;
|
||||
if prefix && ctx.in_prefix_cache(&right) {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (left, _) in l_words {
|
||||
let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default();
|
||||
docids.union_with(¤t_docids);
|
||||
}
|
||||
Ok(docids)
|
||||
} else if prefix {
|
||||
let r_words = word_derivations(&right, true, 0, ctx.words_fst())?;
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
|
||||
} else {
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
||||
}
|
||||
},
|
||||
(QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
|
||||
let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst())?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
},
|
||||
(QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => {
|
||||
let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst())?;
|
||||
let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst())?;
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
pub mod test {
    use maplit::hashmap;
    use rand::{Rng, SeedableRng, rngs::StdRng};

    use super::*;
    use std::collections::HashMap;

    /// Shorthand turning a literal into an owned `String` key.
    fn s(text: &str) -> String { text.to_string() }

    /// An in-memory [`Context`] backed by hash maps, for the criteria tests.
    pub struct TestContext<'t> {
        words_fst: fst::Set<Cow<'t, [u8]>>,
        word_docids: HashMap<String, RoaringBitmap>,
        word_prefix_docids: HashMap<String, RoaringBitmap>,
        word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
        word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
    }

    impl<'a> Context for TestContext<'a> {
        /// Every document id is the union of all the word postings.
        fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
            let mut all = RoaringBitmap::new();
            for docids in self.word_docids.values() {
                all.union_with(docids);
            }
            Ok(all)
        }

        fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
            Ok(self.word_docids.get(word).cloned())
        }

        fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
            Ok(self.word_prefix_docids.get(word).cloned())
        }

        fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
            let key = (s(left), s(right), i32::from(proximity));
            Ok(self.word_pair_proximity_docids.get(&key).cloned())
        }

        fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
            let key = (s(left), s(right), i32::from(proximity));
            Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
        }

        fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
            &self.words_fst
        }

        /// The prefix cache is exactly the set of keys of `word_prefix_docids`.
        fn in_prefix_cache(&self, word: &str) -> bool {
            self.word_prefix_docids.contains_key(word)
        }
    }

    impl<'a> Default for TestContext<'a> {
        /// Builds a deterministic fixture: postings are drawn from a seeded RNG,
        /// in the order the words are listed below.
        fn default() -> TestContext<'a> {
            let mut rng = StdRng::seed_from_u64(102);
            let rng = &mut rng;

            /// Draws `len` random document ids and packs them into a bitmap.
            fn random_postings<R: Rng>(rng: &mut R, len: usize) -> RoaringBitmap {
                let mut values: Vec<u32> = (0..len).map(|_| rng.gen()).collect();
                values.sort_unstable();

                RoaringBitmap::from_sorted_iter(values.into_iter())
            }

            let word_docids = hashmap!{
                s("hello")   => random_postings(rng,   1500),
                s("hi")      => random_postings(rng,   4000),
                s("word")    => random_postings(rng,   2500),
                s("split")   => random_postings(rng,    400),
                s("ngrams")  => random_postings(rng,   1400),
                s("world")   => random_postings(rng, 15_000),
                s("earth")   => random_postings(rng,   8000),
                s("2021")    => random_postings(rng,    100),
                s("2020")    => random_postings(rng,    500),
                s("is")      => random_postings(rng, 50_000),
                s("this")    => random_postings(rng, 50_000),
                s("good")    => random_postings(rng,   1250),
                s("morning") => random_postings(rng,    125),
            };

            let word_prefix_docids = hashmap!{
                s("h")   => &word_docids[&s("hello")] | &word_docids[&s("hi")],
                s("wor") => &word_docids[&s("word")]  | &word_docids[&s("world")],
                s("20")  => &word_docids[&s("2020")]  | &word_docids[&s("2021")],
            };

            // "hello world" documents, split in two halves (proximity 1 and 4).
            let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")];
            let hello_world_split = (hello_world.len() / 2) as usize;
            let hello_world_1 = hello_world.iter().take(hello_world_split).collect();
            let hello_world_2 = hello_world.iter().skip(hello_world_split).collect();

            // "hello word" documents, split in a half and two quarters (proximity 4, 6 and 7).
            let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")];
            let hello_word_split = (hello_word.len() / 2) as usize;
            let hello_word_4 = hello_word.iter().take(hello_word_split).collect();
            let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect();
            let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect();

            let word_pair_proximity_docids = hashmap!{
                (s("good"), s("morning"), 1)   => &word_docids[&s("good")] & &word_docids[&s("morning")],
                (s("hello"), s("world"), 1)    => hello_world_1,
                (s("hello"), s("world"), 4)    => hello_world_2,
                (s("this"), s("is"), 1)        => &word_docids[&s("this")] & &word_docids[&s("is")],
                (s("is"), s("2021"), 1)        => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
                (s("is"), s("2020"), 1)        => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
                (s("this"), s("2021"), 2)      => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
                (s("this"), s("2020"), 2)      => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
                (s("word"), s("split"), 1)     => &word_docids[&s("word")] & &word_docids[&s("split")],
                (s("world"), s("split"), 1)    => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")],
                (s("hello"), s("word"), 4)     => hello_word_4,
                (s("hello"), s("word"), 6)     => hello_word_6,
                (s("hello"), s("word"), 7)     => hello_word_7,
                (s("split"), s("ngrams"), 3)   => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")],
                (s("split"), s("ngrams"), 5)   => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
                (s("this"), s("ngrams"), 1)    => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")],
                (s("this"), s("ngrams"), 2)    => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
            };

            // Prefix entries are derived from the word pairs above; a missing pair panics.
            let word_prefix_pair_proximity_docids = hashmap!{
                (s("hello"), s("wor"), 1) => word_pair_proximity_docids[&(s("hello"), s("world"), 1)].clone(),
                (s("hello"), s("wor"), 4) => &word_pair_proximity_docids[&(s("hello"), s("world"), 4)] | &word_pair_proximity_docids[&(s("hello"), s("word"), 4)],
                (s("hello"), s("wor"), 6) => word_pair_proximity_docids[&(s("hello"), s("word"), 6)].clone(),
                (s("hello"), s("wor"), 7) => word_pair_proximity_docids[&(s("hello"), s("word"), 7)].clone(),
                (s("is"), s("20"), 1)     => &word_pair_proximity_docids[&(s("is"), s("2020"), 1)] | &word_pair_proximity_docids[&(s("is"), s("2021"), 1)],
                (s("this"), s("20"), 2)   => &word_pair_proximity_docids[&(s("this"), s("2020"), 2)] | &word_pair_proximity_docids[&(s("this"), s("2021"), 2)],
            };

            // The FST builder needs its keys in sorted order.
            let mut keys = word_docids.keys().collect::<Vec<_>>();
            keys.sort_unstable();
            let words_fst = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();

            TestContext {
                words_fst,
                word_docids,
                word_prefix_docids,
                word_pair_proximity_docids,
                word_prefix_pair_proximity_docids,
            }
        }
    }
}
|
291
milli/src/search/criteria/proximity.rs
Normal file
291
milli/src/search/criteria/proximity.rs
Normal file
@ -0,0 +1,291 @@
|
||||
use std::collections::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
use log::debug;
|
||||
|
||||
use crate::search::query_tree::{maximum_proximity, Operation, Query};
|
||||
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids};
|
||||
|
||||
pub struct Proximity<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<(usize, Operation)>,
|
||||
proximity: u8,
|
||||
candidates: Candidates,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
||||
}
|
||||
|
||||
impl<'t> Proximity<'t> {
|
||||
pub fn initial(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
) -> Self
|
||||
{
|
||||
Proximity {
|
||||
ctx,
|
||||
query_tree: query_tree.map(|op| (maximum_proximity(&op), op)),
|
||||
proximity: 0,
|
||||
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
candidates_cache: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Proximity {
|
||||
ctx,
|
||||
query_tree: None,
|
||||
proximity: 0,
|
||||
candidates: Candidates::default(),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
candidates_cache: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Proximity<'t> {
|
||||
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
|
||||
use Candidates::{Allowed, Forbidden};
|
||||
loop {
|
||||
debug!("Proximity at iteration {} (max {:?}) ({:?})",
|
||||
self.proximity,
|
||||
self.query_tree.as_ref().map(|(mp, _)| mp),
|
||||
self.candidates,
|
||||
);
|
||||
|
||||
match (&mut self.query_tree, &mut self.candidates) {
|
||||
(_, Allowed(candidates)) if candidates.is_empty() => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take().map(|(_, qt)| qt),
|
||||
candidates: take(&mut self.candidates).into_inner(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(Some((max_prox, query_tree)), Allowed(candidates)) => {
|
||||
if self.proximity as usize > *max_prox {
|
||||
self.query_tree = None;
|
||||
self.candidates = Candidates::default();
|
||||
} else {
|
||||
let mut new_candidates = resolve_candidates(
|
||||
self.ctx,
|
||||
&query_tree,
|
||||
self.proximity,
|
||||
&mut self.candidates_cache,
|
||||
)?;
|
||||
|
||||
new_candidates.intersect_with(&candidates);
|
||||
candidates.difference_with(&new_candidates);
|
||||
self.proximity += 1;
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => new_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree.clone()),
|
||||
candidates: new_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
},
|
||||
(Some((max_prox, query_tree)), Forbidden(candidates)) => {
|
||||
if self.proximity as usize > *max_prox {
|
||||
self.query_tree = None;
|
||||
self.candidates = Candidates::default();
|
||||
} else {
|
||||
let mut new_candidates = resolve_candidates(
|
||||
self.ctx,
|
||||
&query_tree,
|
||||
self.proximity,
|
||||
&mut self.candidates_cache,
|
||||
)?;
|
||||
|
||||
new_candidates.difference_with(&candidates);
|
||||
candidates.union_with(&new_candidates);
|
||||
self.proximity += 1;
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => new_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree.clone()),
|
||||
candidates: new_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
},
|
||||
(None, Allowed(_)) => {
|
||||
let candidates = take(&mut self.candidates).into_inner();
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
(None, Forbidden(_)) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next()? {
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
|
||||
self.proximity = 0;
|
||||
self.candidates = Candidates::Allowed(candidates);
|
||||
self.bucket_candidates.union_with(&bucket_candidates);
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_candidates<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
proximity: u8,
|
||||
cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
fn resolve_operation<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
proximity: u8,
|
||||
cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
||||
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or, Query};
|
||||
|
||||
let result = match query_tree {
|
||||
And(ops) => mdfs(ctx, ops, proximity, cache)?,
|
||||
Consecutive(ops) => if proximity == 0 {
|
||||
mdfs(ctx, ops, 0, cache)?
|
||||
} else {
|
||||
Default::default()
|
||||
},
|
||||
Or(_, ops) => {
|
||||
let mut output = Vec::new();
|
||||
for op in ops {
|
||||
let result = resolve_operation(ctx, op, proximity, cache)?;
|
||||
output.extend(result);
|
||||
}
|
||||
output
|
||||
},
|
||||
Query(q) => if proximity == 0 {
|
||||
let candidates = query_docids(ctx, q)?;
|
||||
vec![(q.clone(), q.clone(), candidates)]
|
||||
} else {
|
||||
Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn mdfs_pair<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
left: &Operation,
|
||||
right: &Operation,
|
||||
proximity: u8,
|
||||
cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
||||
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
|
||||
{
|
||||
fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
|
||||
(0..=mana.min(left_max)).map(move |m| (m, mana - m))
|
||||
}
|
||||
|
||||
let pair_max_proximity = 7;
|
||||
|
||||
let mut output = Vec::new();
|
||||
|
||||
for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) {
|
||||
for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) {
|
||||
let left_key = (left.clone(), left_p);
|
||||
if !cache.contains_key(&left_key) {
|
||||
let candidates = resolve_operation(ctx, left, left_p, cache)?;
|
||||
cache.insert(left_key.clone(), candidates);
|
||||
}
|
||||
|
||||
let right_key = (right.clone(), right_p);
|
||||
if !cache.contains_key(&right_key) {
|
||||
let candidates = resolve_operation(ctx, right, right_p, cache)?;
|
||||
cache.insert(right_key.clone(), candidates);
|
||||
}
|
||||
|
||||
let lefts = cache.get(&left_key).unwrap();
|
||||
let rights = cache.get(&right_key).unwrap();
|
||||
|
||||
for (ll, lr, lcandidates) in lefts {
|
||||
for (rl, rr, rcandidates) in rights {
|
||||
let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1)?;
|
||||
if lcandidates.len() < rcandidates.len() {
|
||||
candidates.intersect_with(lcandidates);
|
||||
candidates.intersect_with(rcandidates);
|
||||
} else {
|
||||
candidates.intersect_with(rcandidates);
|
||||
candidates.intersect_with(lcandidates);
|
||||
}
|
||||
if !candidates.is_empty() {
|
||||
output.push((ll.clone(), rr.clone(), candidates));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
fn mdfs<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
branches: &[Operation],
|
||||
proximity: u8,
|
||||
cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
||||
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
|
||||
{
|
||||
// Extract the first two elements but gives the tail
|
||||
// that is just after the first element.
|
||||
let next = branches.split_first().map(|(h1, t)| {
|
||||
(h1, t.split_first().map(|(h2, _)| (h2, t)))
|
||||
});
|
||||
|
||||
match next {
|
||||
Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache),
|
||||
Some((head1, Some((head2, tail)))) => {
|
||||
let mut output = Vec::new();
|
||||
for p in 0..=proximity {
|
||||
for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache)? {
|
||||
if !head_candidates.is_empty() {
|
||||
for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache)? {
|
||||
candidates.intersect_with(&head_candidates);
|
||||
if !candidates.is_empty() {
|
||||
output.push((lhead.clone(), rtail, candidates));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(output)
|
||||
},
|
||||
Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache),
|
||||
None => return Ok(Default::default()),
|
||||
}
|
||||
}
|
||||
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache)? {
|
||||
candidates.union_with(&cds);
|
||||
}
|
||||
Ok(candidates)
|
||||
}
|
482
milli/src/search/criteria/typo.rs
Normal file
482
milli/src/search/criteria/typo.rs
Normal file
@ -0,0 +1,482 @@
|
||||
use std::{borrow::Cow, collections::HashMap, mem::take};
|
||||
|
||||
use anyhow::bail;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
|
||||
use crate::search::word_derivations;
|
||||
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids};
|
||||
|
||||
pub struct Typo<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<(usize, Operation)>,
|
||||
number_typos: u8,
|
||||
candidates: Candidates,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
|
||||
typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>,
|
||||
}
|
||||
|
||||
impl<'t> Typo<'t> {
|
||||
pub fn initial(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
) -> Self
|
||||
{
|
||||
Typo {
|
||||
ctx,
|
||||
query_tree: query_tree.map(|op| (maximum_typo(&op), op)),
|
||||
number_typos: 0,
|
||||
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
candidates_cache: HashMap::new(),
|
||||
typo_cache: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Typo {
|
||||
ctx,
|
||||
query_tree: None,
|
||||
number_typos: 0,
|
||||
candidates: Candidates::default(),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
candidates_cache: HashMap::new(),
|
||||
typo_cache: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Typo<'t> {
|
||||
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
|
||||
use Candidates::{Allowed, Forbidden};
|
||||
loop {
|
||||
debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates);
|
||||
|
||||
match (&mut self.query_tree, &mut self.candidates) {
|
||||
(_, Allowed(candidates)) if candidates.is_empty() => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take().map(|(_, qt)| qt),
|
||||
candidates: take(&mut self.candidates).into_inner(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(Some((max_typos, query_tree)), Allowed(candidates)) => {
|
||||
if self.number_typos as usize > *max_typos {
|
||||
self.query_tree = None;
|
||||
self.candidates = Candidates::default();
|
||||
} else {
|
||||
let fst = self.ctx.words_fst();
|
||||
let new_query_tree = if self.number_typos < 2 {
|
||||
alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?
|
||||
} else if self.number_typos == 2 {
|
||||
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?;
|
||||
query_tree.clone()
|
||||
} else {
|
||||
query_tree.clone()
|
||||
};
|
||||
|
||||
let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?;
|
||||
new_candidates.intersect_with(&candidates);
|
||||
candidates.difference_with(&new_candidates);
|
||||
self.number_typos += 1;
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => new_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(new_query_tree),
|
||||
candidates: new_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
},
|
||||
(Some((max_typos, query_tree)), Forbidden(candidates)) => {
|
||||
if self.number_typos as usize > *max_typos {
|
||||
self.query_tree = None;
|
||||
self.candidates = Candidates::default();
|
||||
} else {
|
||||
let fst = self.ctx.words_fst();
|
||||
let new_query_tree = if self.number_typos < 2 {
|
||||
alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?
|
||||
} else if self.number_typos == 2 {
|
||||
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?;
|
||||
query_tree.clone()
|
||||
} else {
|
||||
query_tree.clone()
|
||||
};
|
||||
|
||||
let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?;
|
||||
new_candidates.difference_with(&candidates);
|
||||
candidates.union_with(&new_candidates);
|
||||
self.number_typos += 1;
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => new_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(new_query_tree),
|
||||
candidates: new_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
},
|
||||
(None, Allowed(_)) => {
|
||||
let candidates = take(&mut self.candidates).into_inner();
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
(None, Forbidden(_)) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next()? {
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
self.query_tree = query_tree.map(|op| (maximum_typo(&op), op));
|
||||
self.number_typos = 0;
|
||||
self.candidates = Candidates::Allowed(candidates);
|
||||
self.bucket_candidates.union_with(&bucket_candidates);
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Modify the query tree by replacing every tolerant query by an Or operation
|
||||
/// containing all of the corresponding exact words in the words FST. Each tolerant
|
||||
/// query will only be replaced by exact query with up to `number_typos` maximum typos.
|
||||
fn alterate_query_tree(
|
||||
words_fst: &fst::Set<Cow<[u8]>>,
|
||||
mut query_tree: Operation,
|
||||
number_typos: u8,
|
||||
typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>,
|
||||
) -> anyhow::Result<Operation>
|
||||
{
|
||||
fn recurse(
|
||||
words_fst: &fst::Set<Cow<[u8]>>,
|
||||
operation: &mut Operation,
|
||||
number_typos: u8,
|
||||
typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or};
|
||||
|
||||
match operation {
|
||||
And(ops) | Consecutive(ops) | Or(_, ops) => {
|
||||
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache))
|
||||
},
|
||||
Operation::Query(q) => {
|
||||
// TODO may be optimized when number_typos == 0
|
||||
if let QueryKind::Tolerant { typo, word } = &q.kind {
|
||||
// if no typo is allowed we don't call word_derivations function,
|
||||
// and directly create an Exact query
|
||||
if number_typos == 0 {
|
||||
*operation = Operation::Query(Query {
|
||||
prefix: q.prefix,
|
||||
kind: QueryKind::Exact { original_typo: 0, word: word.clone() },
|
||||
});
|
||||
} else {
|
||||
let typo = *typo.min(&number_typos);
|
||||
let cache_key = (word.clone(), q.prefix, typo);
|
||||
let words = if let Some(derivations) = typo_cache.get(&cache_key) {
|
||||
derivations.clone()
|
||||
} else {
|
||||
let derivations = word_derivations(word, q.prefix, typo, words_fst)?;
|
||||
typo_cache.insert(cache_key, derivations.clone());
|
||||
derivations
|
||||
};
|
||||
|
||||
let queries = words.into_iter().map(|(word, typo)| {
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::Exact { original_typo: typo, word },
|
||||
})
|
||||
}).collect();
|
||||
|
||||
*operation = Operation::or(false, queries);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
recurse(words_fst, &mut query_tree, number_typos, typo_cache)?;
|
||||
Ok(query_tree)
|
||||
}
|
||||
|
||||
fn resolve_candidates<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
number_typos: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
fn resolve_operation<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
number_typos: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or, Query};
|
||||
|
||||
match query_tree {
|
||||
And(ops) => {
|
||||
mdfs(ctx, ops, number_typos, cache)
|
||||
},
|
||||
Consecutive(ops) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_loop = true;
|
||||
for slice in ops.windows(2) {
|
||||
match (&slice[0], &slice[1]) {
|
||||
(Operation::Query(left), Operation::Query(right)) => {
|
||||
match query_pair_proximity_docids(ctx, left, right, 1)? {
|
||||
pair_docids if pair_docids.is_empty() => {
|
||||
return Ok(RoaringBitmap::new())
|
||||
},
|
||||
pair_docids if first_loop => {
|
||||
candidates = pair_docids;
|
||||
first_loop = false;
|
||||
},
|
||||
pair_docids => {
|
||||
candidates.intersect_with(&pair_docids);
|
||||
},
|
||||
}
|
||||
},
|
||||
_ => bail!("invalid consecutive query type"),
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
Or(_, ops) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
for op in ops {
|
||||
let docids = resolve_operation(ctx, op, number_typos, cache)?;
|
||||
candidates.union_with(&docids);
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
Query(q) => if q.kind.typo() == number_typos {
|
||||
Ok(query_docids(ctx, q)?)
|
||||
} else {
|
||||
Ok(RoaringBitmap::new())
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn mdfs<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
branches: &[Operation],
|
||||
mana: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
match branches.split_first() {
|
||||
Some((head, [])) => {
|
||||
let cache_key = (head.clone(), mana);
|
||||
if let Some(candidates) = cache.get(&cache_key) {
|
||||
Ok(candidates.clone())
|
||||
} else {
|
||||
let candidates = resolve_operation(ctx, head, mana, cache)?;
|
||||
cache.insert(cache_key, candidates.clone());
|
||||
Ok(candidates)
|
||||
}
|
||||
},
|
||||
Some((head, tail)) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
|
||||
for m in 0..=mana {
|
||||
let mut head_candidates = {
|
||||
let cache_key = (head.clone(), m);
|
||||
if let Some(candidates) = cache.get(&cache_key) {
|
||||
candidates.clone()
|
||||
} else {
|
||||
let candidates = resolve_operation(ctx, head, m, cache)?;
|
||||
cache.insert(cache_key, candidates.clone());
|
||||
candidates
|
||||
}
|
||||
};
|
||||
if !head_candidates.is_empty() {
|
||||
let tail_candidates = mdfs(ctx, tail, mana - m, cache)?;
|
||||
head_candidates.intersect_with(&tail_candidates);
|
||||
candidates.union_with(&head_candidates);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(candidates)
|
||||
},
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
resolve_operation(ctx, query_tree, number_typos, cache)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;
    use super::super::test::TestContext;

    /// Wraps a query kind into a non-prefix query leaf.
    fn leaf(kind: QueryKind) -> Operation {
        Operation::Query(Query { prefix: false, kind })
    }

    /// Query tree given to the criterion: "split" and "this" are exact,
    /// "world" tolerates one typo.
    fn input_query_tree() -> Operation {
        Operation::Or(false, vec![
            Operation::And(vec![
                leaf(QueryKind::exact("split".to_string())),
                leaf(QueryKind::exact("this".to_string())),
                leaf(QueryKind::tolerant(1, "world".to_string())),
            ])
        ])
    }

    /// Query tree expected for the bucket without any typo.
    fn zero_typo_query_tree() -> Operation {
        Operation::Or(false, vec![
            Operation::And(vec![
                leaf(QueryKind::exact("split".to_string())),
                leaf(QueryKind::exact("this".to_string())),
                leaf(QueryKind::exact("world".to_string())),
            ]),
        ])
    }

    /// Query tree expected for the bucket with one typo.
    fn one_typo_query_tree() -> Operation {
        Operation::Or(false, vec![
            Operation::And(vec![
                leaf(QueryKind::exact("split".to_string())),
                leaf(QueryKind::exact("this".to_string())),
                Operation::Or(false, vec![
                    leaf(QueryKind::exact_with_typo(1, "word".to_string())),
                    leaf(QueryKind::exact("world".to_string())),
                ]),
            ]),
        ])
    }

    #[test]
    fn initial_placeholder_no_facets() {
        let context = TestContext::default();
        let query_tree = None;
        let facet_candidates = None;

        let mut criteria = Typo::initial(&context, query_tree, facet_candidates);

        assert!(criteria.next().unwrap().is_none());
    }

    #[test]
    fn initial_query_tree_no_facets() {
        let context = TestContext::default();
        let docids = |word: &str| context.word_docids(word).unwrap().unwrap();
        let facet_candidates = None;

        let mut criteria = Typo::initial(&context, Some(input_query_tree()), facet_candidates);

        // first bucket: documents matching without any typo
        let candidates_1 = docids("split") & docids("this") & docids("world");
        let expected_1 = CriterionResult {
            query_tree: Some(zero_typo_query_tree()),
            candidates: candidates_1.clone(),
            bucket_candidates: candidates_1,
        };

        assert_eq!(criteria.next().unwrap(), Some(expected_1));

        // second bucket: documents matching with one typo, minus the exact matches
        let candidates_2 = (docids("split") & docids("this") & docids("word")) - docids("world");
        let expected_2 = CriterionResult {
            query_tree: Some(one_typo_query_tree()),
            candidates: candidates_2.clone(),
            bucket_candidates: candidates_2,
        };

        assert_eq!(criteria.next().unwrap(), Some(expected_2));
    }

    #[test]
    fn initial_placeholder_with_facets() {
        let context = TestContext::default();
        let query_tree = None;
        let facet_candidates = context.word_docids("earth").unwrap().unwrap();

        let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone()));

        let expected = CriterionResult {
            query_tree: None,
            candidates: facet_candidates.clone(),
            bucket_candidates: facet_candidates,
        };

        // first iteration, returns the facet candidates
        assert_eq!(criteria.next().unwrap(), Some(expected));

        // second iteration, returns None because there is nothing more to do
        assert!(criteria.next().unwrap().is_none());
    }

    #[test]
    fn initial_query_tree_with_facets() {
        let context = TestContext::default();
        let docids = |word: &str| context.word_docids(word).unwrap().unwrap();
        let facet_candidates = docids("earth");

        let mut criteria = Typo::initial(&context, Some(input_query_tree()), Some(facet_candidates.clone()));

        // first bucket: exact matches restricted to the facet candidates
        let candidates_1 = docids("split") & docids("this") & docids("world");
        let expected_1 = CriterionResult {
            query_tree: Some(zero_typo_query_tree()),
            candidates: &candidates_1 & &facet_candidates,
            bucket_candidates: candidates_1 & &facet_candidates,
        };

        assert_eq!(criteria.next().unwrap(), Some(expected_1));

        // second bucket: one typo matches restricted to the facet candidates
        let candidates_2 = (docids("split") & docids("this") & docids("word")) - docids("world");
        let expected_2 = CriterionResult {
            query_tree: Some(one_typo_query_tree()),
            candidates: &candidates_2 & &facet_candidates,
            bucket_candidates: candidates_2 & &facet_candidates,
        };

        assert_eq!(criteria.next().unwrap(), Some(expected_2));
    }
}
|
128
milli/src/search/criteria/words.rs
Normal file
128
milli/src/search/criteria/words.rs
Normal file
@ -0,0 +1,128 @@
|
||||
use std::collections::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::query_tree::Operation;
|
||||
use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context};
|
||||
|
||||
pub struct Words<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
query_trees: Vec<Operation>,
|
||||
candidates: Candidates,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
|
||||
}
|
||||
|
||||
impl<'t> Words<'t> {
|
||||
pub fn initial(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
) -> Self
|
||||
{
|
||||
Words {
|
||||
ctx,
|
||||
query_trees: query_tree.map(explode_query_tree).unwrap_or_default(),
|
||||
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
candidates_cache: HashMap::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Words {
|
||||
ctx,
|
||||
query_trees: Vec::default(),
|
||||
candidates: Candidates::default(),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
candidates_cache: HashMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Words<'t> {
|
||||
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
|
||||
use Candidates::{Allowed, Forbidden};
|
||||
loop {
|
||||
debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
|
||||
|
||||
match (self.query_trees.pop(), &mut self.candidates) {
|
||||
(query_tree, Allowed(candidates)) if candidates.is_empty() => {
|
||||
self.query_trees = Vec::new();
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree,
|
||||
candidates: take(&mut self.candidates).into_inner(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(Some(qt), Allowed(candidates)) => {
|
||||
let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?;
|
||||
found_candidates.intersect_with(&candidates);
|
||||
candidates.difference_with(&found_candidates);
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => found_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(qt),
|
||||
candidates: found_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
(Some(qt), Forbidden(candidates)) => {
|
||||
let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?;
|
||||
found_candidates.difference_with(&candidates);
|
||||
candidates.union_with(&found_candidates);
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => found_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(qt),
|
||||
candidates: found_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
(None, Allowed(_)) => {
|
||||
let candidates = take(&mut self.candidates).into_inner();
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
(None, Forbidden(_)) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next()? {
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default();
|
||||
self.candidates = Candidates::Allowed(candidates);
|
||||
self.bucket_candidates.union_with(&bucket_candidates);
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn explode_query_tree(query_tree: Operation) -> Vec<Operation> {
|
||||
match query_tree {
|
||||
Operation::Or(true, ops) => ops,
|
||||
otherwise => vec![otherwise],
|
||||
}
|
||||
}
|
@ -1,27 +1,21 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use fst::{IntoStreamer, Streamer, Set};
|
||||
use levenshtein_automata::DFA;
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
|
||||
use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
|
||||
use log::debug;
|
||||
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
|
||||
use once_cell::sync::Lazy;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::bitmap::RoaringBitmap;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
|
||||
use crate::mdfs::Mdfs;
|
||||
use crate::query_tokens::{query_tokens, QueryToken};
|
||||
use crate::{Index, FieldId, DocumentId, Criterion};
|
||||
use crate::search::criteria::{Criterion, CriterionResult};
|
||||
use crate::{Index, DocumentId};
|
||||
|
||||
pub use self::facet::FacetIter;
|
||||
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
|
||||
pub use self::facet::{FacetIter};
|
||||
pub use self::query_tree::MatchingWords;
|
||||
use self::query_tree::QueryTreeBuilder;
|
||||
|
||||
// Building these factories is not free.
|
||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||
@ -30,6 +24,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
||||
|
||||
mod facet;
|
||||
mod query_tree;
|
||||
mod criteria;
|
||||
|
||||
pub struct Search<'a> {
|
||||
query: Option<String>,
|
||||
@ -65,208 +60,23 @@ impl<'a> Search<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
/// Extracts the query words from the query string and returns the DFAs accordingly.
|
||||
/// TODO introduce settings for the number of typos regarding the words lengths.
|
||||
fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> {
|
||||
let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2);
|
||||
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let words: Vec<_> = query_tokens(tokens).collect();
|
||||
|
||||
let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let number_of_words = words.len();
|
||||
|
||||
words.into_iter().enumerate().map(|(i, word)| {
|
||||
let (word, quoted) = match word {
|
||||
QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3),
|
||||
QueryToken::Quoted(token) => (token.text().to_string(), true),
|
||||
};
|
||||
let is_last = i + 1 == number_of_words;
|
||||
let is_prefix = is_last && !ends_with_whitespace && !quoted;
|
||||
let lev = match word.len() {
|
||||
0..=4 => if quoted { lev0 } else { lev0 },
|
||||
5..=8 => if quoted { lev0 } else { lev1 },
|
||||
_ => if quoted { lev0 } else { lev2 },
|
||||
};
|
||||
|
||||
let dfa = if is_prefix {
|
||||
lev.build_prefix_dfa(&word)
|
||||
} else {
|
||||
lev.build_dfa(&word)
|
||||
};
|
||||
|
||||
(word, is_prefix, dfa)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Fetch the words from the given FST related to the given DFAs along with
|
||||
/// the associated documents ids.
|
||||
fn fetch_words_docids(
|
||||
&self,
|
||||
fst: &fst::Set<Cow<[u8]>>,
|
||||
dfas: Vec<(String, bool, DFA)>,
|
||||
) -> anyhow::Result<Vec<(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)>>
|
||||
{
|
||||
// A Vec storing all the derived words from the original query words, associated
|
||||
// with the distance from the original word and the docids where the words appears.
|
||||
let mut derived_words = Vec::<(HashMap::<String, (u8, RoaringBitmap)>, RoaringBitmap)>::with_capacity(dfas.len());
|
||||
|
||||
for (_word, _is_prefix, dfa) in dfas {
|
||||
|
||||
let mut acc_derived_words = HashMap::new();
|
||||
let mut unions_docids = RoaringBitmap::new();
|
||||
let mut stream = fst.search_with_state(&dfa).into_stream();
|
||||
while let Some((word, state)) = stream.next() {
|
||||
|
||||
let word = std::str::from_utf8(word)?;
|
||||
let docids = self.index.word_docids.get(self.rtxn, word)?.unwrap();
|
||||
let distance = dfa.distance(state);
|
||||
unions_docids.union_with(&docids);
|
||||
acc_derived_words.insert(word.to_string(), (distance.to_u8(), docids));
|
||||
}
|
||||
derived_words.push((acc_derived_words, unions_docids));
|
||||
}
|
||||
|
||||
Ok(derived_words)
|
||||
}
|
||||
|
||||
/// Returns the set of docids that contains all of the query words.
|
||||
fn compute_candidates(
|
||||
derived_words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
|
||||
) -> RoaringBitmap
|
||||
{
|
||||
// We sort the derived words by inverse popularity, this way intersections are faster.
|
||||
let mut derived_words: Vec<_> = derived_words.iter().collect();
|
||||
derived_words.sort_unstable_by_key(|(_, docids)| docids.len());
|
||||
|
||||
// we do a union between all the docids of each of the derived words,
|
||||
// we got N unions (the number of original query words), we then intersect them.
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
|
||||
for (i, (_, union_docids)) in derived_words.iter().enumerate() {
|
||||
if i == 0 {
|
||||
candidates = union_docids.clone();
|
||||
} else {
|
||||
candidates.intersect_with(&union_docids);
|
||||
}
|
||||
}
|
||||
|
||||
candidates
|
||||
}
|
||||
|
||||
fn facet_ordered(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
ascending: bool,
|
||||
mut documents_ids: RoaringBitmap,
|
||||
limit: usize,
|
||||
) -> anyhow::Result<Vec<DocumentId>>
|
||||
{
|
||||
let mut output: Vec<_> = match facet_type {
|
||||
FacetType::Float => {
|
||||
if documents_ids.len() <= 1000 {
|
||||
let db = self.index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
|
||||
let mut docids_values = Vec::with_capacity(documents_ids.len() as usize);
|
||||
for docid in documents_ids.iter() {
|
||||
let left = (field_id, docid, f64::MIN);
|
||||
let right = (field_id, docid, f64::MAX);
|
||||
let mut iter = db.range(self.rtxn, &(left..=right))?;
|
||||
let entry = if ascending { iter.next() } else { iter.last() };
|
||||
if let Some(((_, _, value), ())) = entry.transpose()? {
|
||||
docids_values.push((docid, OrderedFloat(value)));
|
||||
}
|
||||
}
|
||||
docids_values.sort_unstable_by_key(|(_, value)| *value);
|
||||
let iter = docids_values.into_iter().map(|(id, _)| id);
|
||||
if ascending {
|
||||
iter.take(limit).collect()
|
||||
} else {
|
||||
iter.rev().take(limit).collect()
|
||||
}
|
||||
} else {
|
||||
let facet_fn = if ascending {
|
||||
FacetIter::<f64, FacetLevelValueF64Codec>::new_reducing
|
||||
} else {
|
||||
FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse_reducing
|
||||
};
|
||||
let mut limit_tmp = limit;
|
||||
let mut output = Vec::new();
|
||||
for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? {
|
||||
let (_val, docids) = result?;
|
||||
limit_tmp = limit_tmp.saturating_sub(docids.len() as usize);
|
||||
output.push(docids);
|
||||
if limit_tmp == 0 { break }
|
||||
}
|
||||
output.into_iter().flatten().take(limit).collect()
|
||||
}
|
||||
},
|
||||
FacetType::Integer => {
|
||||
if documents_ids.len() <= 1000 {
|
||||
let db = self.index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetI64Codec>();
|
||||
let mut docids_values = Vec::with_capacity(documents_ids.len() as usize);
|
||||
for docid in documents_ids.iter() {
|
||||
let left = (field_id, docid, i64::MIN);
|
||||
let right = (field_id, docid, i64::MAX);
|
||||
let mut iter = db.range(self.rtxn, &(left..=right))?;
|
||||
let entry = if ascending { iter.next() } else { iter.last() };
|
||||
if let Some(((_, _, value), ())) = entry.transpose()? {
|
||||
docids_values.push((docid, value));
|
||||
}
|
||||
}
|
||||
docids_values.sort_unstable_by_key(|(_, value)| *value);
|
||||
let iter = docids_values.into_iter().map(|(id, _)| id);
|
||||
if ascending {
|
||||
iter.take(limit).collect()
|
||||
} else {
|
||||
iter.rev().take(limit).collect()
|
||||
}
|
||||
} else {
|
||||
let facet_fn = if ascending {
|
||||
FacetIter::<i64, FacetLevelValueI64Codec>::new_reducing
|
||||
} else {
|
||||
FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse_reducing
|
||||
};
|
||||
let mut limit_tmp = limit;
|
||||
let mut output = Vec::new();
|
||||
for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? {
|
||||
let (_val, docids) = result?;
|
||||
limit_tmp = limit_tmp.saturating_sub(docids.len() as usize);
|
||||
output.push(docids);
|
||||
if limit_tmp == 0 { break }
|
||||
}
|
||||
output.into_iter().flatten().take(limit).collect()
|
||||
}
|
||||
},
|
||||
FacetType::String => bail!("criteria facet type must be a number"),
|
||||
};
|
||||
|
||||
// if there isn't enough documents to return we try to complete that list
|
||||
// with documents that are maybe not faceted under this field and therefore
|
||||
// not returned by the previous facet iteration.
|
||||
if output.len() < limit {
|
||||
output.iter().for_each(|n| { documents_ids.remove(*n); });
|
||||
let remaining = documents_ids.iter().take(limit - output.len());
|
||||
output.extend(remaining);
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
pub fn execute(&self) -> anyhow::Result<SearchResult> {
|
||||
let limit = self.limit;
|
||||
let fst = self.index.words_fst(self.rtxn)?;
|
||||
|
||||
// Construct the DFAs related to the query words.
|
||||
let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) {
|
||||
Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?),
|
||||
_otherwise => None,
|
||||
// We create the query tree by spliting the query into tokens.
|
||||
let before = Instant::now();
|
||||
let query_tree = match self.query.as_ref() {
|
||||
Some(query) => {
|
||||
let builder = QueryTreeBuilder::new(self.rtxn, self.index);
|
||||
let stop_words = &Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
|
||||
let result = analyzer.analyze(query);
|
||||
let tokens = result.tokens();
|
||||
builder.build(tokens)?
|
||||
},
|
||||
None => None,
|
||||
};
|
||||
|
||||
debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
|
||||
|
||||
// We create the original candidates with the facet conditions results.
|
||||
let before = Instant::now();
|
||||
let facet_candidates = match &self.facet_condition {
|
||||
@ -276,100 +86,42 @@ impl<'a> Search<'a> {
|
||||
|
||||
debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed());
|
||||
|
||||
let order_by_facet = {
|
||||
let criteria = self.index.criteria(self.rtxn)?;
|
||||
let result = criteria.into_iter().flat_map(|criterion| {
|
||||
match criterion {
|
||||
Criterion::Asc(fid) => Some((fid, true)),
|
||||
Criterion::Desc(fid) => Some((fid, false)),
|
||||
_ => None
|
||||
}
|
||||
}).next();
|
||||
match result {
|
||||
Some((attr_name, is_ascending)) => {
|
||||
let field_id_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?;
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?;
|
||||
let ftype = *faceted_fields.get(&fid)
|
||||
.with_context(|| format!("{:?} not found in the faceted fields.", attr_name))
|
||||
.expect("corrupted data: ");
|
||||
Some((fid, ftype, is_ascending))
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
let matching_words = match query_tree.as_ref() {
|
||||
Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
|
||||
None => MatchingWords::default(),
|
||||
};
|
||||
|
||||
let before = Instant::now();
|
||||
let (candidates, derived_words) = match (facet_candidates, derived_words) {
|
||||
(Some(mut facet_candidates), Some(derived_words)) => {
|
||||
let words_candidates = Self::compute_candidates(&derived_words);
|
||||
facet_candidates.intersect_with(&words_candidates);
|
||||
(facet_candidates, derived_words)
|
||||
},
|
||||
(None, Some(derived_words)) => {
|
||||
(Self::compute_candidates(&derived_words), derived_words)
|
||||
},
|
||||
(Some(facet_candidates), None) => {
|
||||
// If the query is not set or results in no DFAs but
|
||||
// there is some facet conditions we return a placeholder.
|
||||
let documents_ids = match order_by_facet {
|
||||
Some((fid, ftype, is_ascending)) => {
|
||||
self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
|
||||
},
|
||||
None => facet_candidates.iter().take(limit).collect(),
|
||||
};
|
||||
return Ok(SearchResult {
|
||||
documents_ids,
|
||||
candidates: facet_candidates,
|
||||
..Default::default()
|
||||
})
|
||||
},
|
||||
(None, None) => {
|
||||
// If the query is not set or results in no DFAs we return a placeholder.
|
||||
let all_docids = self.index.documents_ids(self.rtxn)?;
|
||||
let documents_ids = match order_by_facet {
|
||||
Some((fid, ftype, is_ascending)) => {
|
||||
self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)?
|
||||
},
|
||||
None => all_docids.iter().take(limit).collect(),
|
||||
};
|
||||
return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() })
|
||||
},
|
||||
};
|
||||
let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
|
||||
let mut criteria = criteria_builder.build(query_tree, facet_candidates)?;
|
||||
|
||||
debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed());
|
||||
let mut offset = self.offset;
|
||||
let mut limit = self.limit;
|
||||
let mut documents_ids = Vec::new();
|
||||
let mut initial_candidates = RoaringBitmap::new();
|
||||
while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next()? {
|
||||
|
||||
// The mana depth first search is a revised DFS that explore
|
||||
// solutions in the order of their proximities.
|
||||
let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone());
|
||||
let mut documents = Vec::new();
|
||||
debug!("Number of candidates found {}", candidates.len());
|
||||
|
||||
// We execute the Mdfs iterator until we find enough documents.
|
||||
while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 {
|
||||
match mdfs.next().transpose()? {
|
||||
Some((proximity, answer)) => {
|
||||
debug!("answer with a proximity of {}: {:?}", proximity, answer);
|
||||
documents.push(answer);
|
||||
},
|
||||
None => break,
|
||||
let mut len = candidates.len() as usize;
|
||||
let mut candidates = candidates.into_iter();
|
||||
|
||||
initial_candidates.union_with(&bucket_candidates);
|
||||
|
||||
if offset != 0 {
|
||||
candidates.by_ref().skip(offset).for_each(drop);
|
||||
offset = offset.saturating_sub(len.min(offset));
|
||||
len = len.saturating_sub(len.min(offset));
|
||||
}
|
||||
|
||||
if len != 0 {
|
||||
documents_ids.extend(candidates.take(limit));
|
||||
limit = limit.saturating_sub(len.min(limit));
|
||||
}
|
||||
|
||||
if limit == 0 { break }
|
||||
}
|
||||
|
||||
let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
|
||||
let documents_ids = match order_by_facet {
|
||||
Some((fid, ftype, order)) => {
|
||||
let mut ordered_documents = Vec::new();
|
||||
for documents_ids in documents {
|
||||
let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?;
|
||||
ordered_documents.push(docids);
|
||||
if ordered_documents.iter().map(Vec::len).sum::<usize>() >= limit { break }
|
||||
}
|
||||
ordered_documents.into_iter().flatten().take(limit).collect()
|
||||
},
|
||||
None => documents.into_iter().flatten().take(limit).collect(),
|
||||
};
|
||||
|
||||
Ok(SearchResult { found_words, candidates, documents_ids })
|
||||
Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
|
||||
}
|
||||
}
|
||||
|
||||
@ -387,28 +139,21 @@ impl fmt::Debug for Search<'_> {
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SearchResult {
|
||||
pub found_words: HashSet<String>,
|
||||
pub matching_words: MatchingWords,
|
||||
pub candidates: RoaringBitmap,
|
||||
// TODO those documents ids should be associated with their criteria scores.
|
||||
pub documents_ids: Vec<DocumentId>,
|
||||
}
|
||||
|
||||
pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<[u8]>>) -> anyhow::Result<Vec<(String, u8)>> {
|
||||
let dfa = {
|
||||
let lev = match max_typo {
|
||||
0 => &LEVDIST0,
|
||||
1 => &LEVDIST1,
|
||||
_ => &LEVDIST2,
|
||||
};
|
||||
|
||||
if is_prefix {
|
||||
lev.build_prefix_dfa(&word)
|
||||
} else {
|
||||
lev.build_dfa(&word)
|
||||
}
|
||||
};
|
||||
|
||||
pub fn word_derivations(
|
||||
word: &str,
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
fst: &fst::Set<Cow<[u8]>>,
|
||||
) -> anyhow::Result<Vec<(String, u8)>>
|
||||
{
|
||||
let mut derived_words = Vec::new();
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
let mut stream = fst.search_with_state(&dfa).into_stream();
|
||||
|
||||
while let Some((word, state)) = stream.next() {
|
||||
@ -419,3 +164,17 @@ pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<
|
||||
|
||||
Ok(derived_words)
|
||||
}
|
||||
|
||||
pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
|
||||
let lev = match typos {
|
||||
0 => &LEVDIST0,
|
||||
1 => &LEVDIST1,
|
||||
_ => &LEVDIST2,
|
||||
};
|
||||
|
||||
if is_prefix {
|
||||
lev.build_prefix_dfa(word)
|
||||
} else {
|
||||
lev.build_dfa(word)
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,13 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::{fmt, cmp, mem};
|
||||
|
||||
use levenshtein_automata::{DFA, Distance};
|
||||
use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
|
||||
use roaring::RoaringBitmap;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::Index;
|
||||
use super::build_dfa;
|
||||
|
||||
type IsOptionalWord = bool;
|
||||
type IsPrefix = bool;
|
||||
@ -81,6 +80,13 @@ impl Operation {
|
||||
Self::Consecutive(ops)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn query(&self) -> Option<&Query> {
|
||||
match self {
|
||||
Operation::Query(query) => Some(query),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Eq, PartialEq, Hash)]
|
||||
@ -96,14 +102,26 @@ pub enum QueryKind {
|
||||
}
|
||||
|
||||
impl QueryKind {
|
||||
fn exact(word: String) -> Self {
|
||||
pub fn exact(word: String) -> Self {
|
||||
QueryKind::Exact { original_typo: 0, word }
|
||||
}
|
||||
|
||||
fn tolerant(typo: u8, word: String) -> Self {
|
||||
pub fn exact_with_typo(original_typo: u8, word: String) -> Self {
|
||||
QueryKind::Exact { original_typo, word }
|
||||
}
|
||||
|
||||
pub fn tolerant(typo: u8, word: String) -> Self {
|
||||
QueryKind::Tolerant { typo, word }
|
||||
}
|
||||
|
||||
pub fn is_tolerant(&self) -> bool {
|
||||
matches!(self, QueryKind::Tolerant { .. })
|
||||
}
|
||||
|
||||
pub fn is_exact(&self) -> bool {
|
||||
matches!(self, QueryKind::Exact { .. })
|
||||
}
|
||||
|
||||
pub fn typo(&self) -> u8 {
|
||||
match self {
|
||||
QueryKind::Tolerant { typo, .. } => *typo,
|
||||
@ -266,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
|
||||
}
|
||||
|
||||
/// The query tree builder is the interface to build a query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
inner: BTreeMap<String, IsPrefix>
|
||||
dfas: Vec<(DFA, u8)>,
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
/// List all words which can be considered as a match for the query tree.
|
||||
pub fn from_query_tree(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> Self {
|
||||
Self { inner: fetch_words(tree, fst).into_iter().collect() }
|
||||
pub fn from_query_tree(tree: &Operation) -> Self {
|
||||
Self {
|
||||
dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true if the word match.
|
||||
pub fn is_match(&self, word: &str) -> bool {
|
||||
fn first_char(s: &str) -> Option<&str> {
|
||||
s.chars().next().map(|c| &s[..c.len_utf8()])
|
||||
}
|
||||
|
||||
match first_char(word) {
|
||||
Some(first) => {
|
||||
let left = first.to_owned();
|
||||
let right = word.to_owned();
|
||||
self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word)
|
||||
},
|
||||
None => false
|
||||
}
|
||||
pub fn matches(&self, word: &str) -> bool {
|
||||
self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) {
|
||||
Distance::Exact(t) => t <= *typo,
|
||||
Distance::AtLeast(_) => false,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
type FetchedWords = Vec<(String, IsPrefix)>;
|
||||
|
||||
/// Lists all words which can be considered as a match for the query tree.
|
||||
fn fetch_words(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
|
||||
fn resolve_branch(tree: &[Operation], fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
|
||||
tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect()
|
||||
}
|
||||
|
||||
fn resolve_query(query: &Query, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
|
||||
match query.kind.clone() {
|
||||
QueryKind::Exact { word, .. } => vec![(word, query.prefix)],
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
if let Ok(words) = super::word_typos(&word, query.prefix, typo, fst) {
|
||||
words.into_iter().map(|(w, _)| (w, query.prefix)).collect()
|
||||
} else {
|
||||
vec![(word, query.prefix)]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_ops(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
|
||||
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
|
||||
fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
|
||||
match tree {
|
||||
Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
|
||||
resolve_branch(ops.as_slice(), fst)
|
||||
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
|
||||
},
|
||||
Operation::Query(ops) => {
|
||||
resolve_query(ops, fst)
|
||||
Operation::Query(Query { prefix, kind }) => {
|
||||
let typo = if kind.is_exact() { 0 } else { kind.typo() };
|
||||
out.insert((kind.word(), typo, *prefix));
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let mut words = resolve_ops(tree, fst);
|
||||
words.sort_unstable();
|
||||
words.dedup();
|
||||
words
|
||||
let mut queries = HashSet::new();
|
||||
resolve_ops(tree, &mut queries);
|
||||
queries
|
||||
}
|
||||
|
||||
/// Main function that creates the final query tree from the primitive query.
|
||||
@ -537,7 +531,10 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
|
||||
use Operation::{Or, And, Query, Consecutive};
|
||||
match operation {
|
||||
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
|
||||
And(ops) => ops.len().saturating_sub(1) * 8,
|
||||
And(ops) => {
|
||||
ops.iter().map(maximum_proximity).sum::<usize>()
|
||||
+ ops.len().saturating_sub(1) * 7
|
||||
},
|
||||
Query(_) | Consecutive(_) => 0,
|
||||
}
|
||||
}
|
||||
@ -547,7 +544,7 @@ mod test {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use fst::Set;
|
||||
use maplit::hashmap;
|
||||
use maplit::{hashmap, hashset};
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use rand::{Rng, SeedableRng, rngs::StdRng};
|
||||
|
||||
@ -958,26 +955,26 @@ mod test {
|
||||
let context = TestContext::default();
|
||||
let query_tree = context.build(false, true, tokens).unwrap().unwrap();
|
||||
|
||||
let expected = vec![
|
||||
("city".to_string(), false),
|
||||
("earth".to_string(), false),
|
||||
("nature".to_string(), false),
|
||||
("new".to_string(), false),
|
||||
("nyc".to_string(), false),
|
||||
("split".to_string(), false),
|
||||
("word".to_string(), false),
|
||||
("word".to_string(), true),
|
||||
("world".to_string(), true),
|
||||
("york".to_string(), false),
|
||||
|
||||
];
|
||||
let expected = hashset!{
|
||||
("word", 0, false),
|
||||
("nyc", 0, false),
|
||||
("wordsplit", 2, false),
|
||||
("wordsplitnycworld", 2, true),
|
||||
("nature", 0, false),
|
||||
("new", 0, false),
|
||||
("city", 0, false),
|
||||
("world", 1, true),
|
||||
("york", 0, false),
|
||||
("split", 0, false),
|
||||
("nycworld", 1, true),
|
||||
("earth", 0, false),
|
||||
("wordsplitnyc", 2, false),
|
||||
};
|
||||
|
||||
let mut keys = context.postings.keys().collect::<Vec<_>>();
|
||||
keys.sort_unstable();
|
||||
let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();
|
||||
|
||||
let words = fetch_words(&query_tree, &set);
|
||||
|
||||
let words = fetch_queries(&query_tree);
|
||||
assert_eq!(expected, words);
|
||||
}
|
||||
}
|
||||
|
@ -13,7 +13,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
|
||||
use heed::BytesEncode;
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
use log::{debug, info};
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
@ -274,13 +274,15 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
|
||||
|
||||
// We store document_id associated with all the words the record contains.
|
||||
for (word, _) in words_positions.drain() {
|
||||
self.insert_word_docid(&word, document_id)?;
|
||||
for (word, _) in words_positions.iter() {
|
||||
self.insert_word_docid(word, document_id)?;
|
||||
}
|
||||
|
||||
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
|
||||
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
|
||||
|
||||
words_positions.clear();
|
||||
|
||||
// We store document_id associated with all the field id and values.
|
||||
for (field, values) in facet_values.drain() {
|
||||
for value in values {
|
||||
@ -471,14 +473,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
};
|
||||
|
||||
let analyzed = self.analyzer.analyze(&content);
|
||||
let tokens = analyzed
|
||||
.tokens()
|
||||
.filter(|t| t.is_word())
|
||||
.map(|t| t.text().to_string());
|
||||
let tokens = process_tokens(analyzed.tokens());
|
||||
|
||||
for (pos, word) in tokens.enumerate().take(MAX_POSITION) {
|
||||
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
|
||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
|
||||
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -609,6 +608,36 @@ enum FacetValue {
|
||||
Integer(i64),
|
||||
}
|
||||
|
||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
||||
/// else we keep the standart proximity of 1 between words.
|
||||
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator().is_some())
|
||||
.scan((0, None), |(offset, prev_kind), token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
|
||||
use FacetValue::*;
|
||||
|
||||
|
@ -41,7 +41,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
|
||||
chunk_fusing_shrink_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
threshold: 0.01, // 1%
|
||||
threshold: 0.1 / 100.0, // 0.1%
|
||||
max_prefix_length: 4,
|
||||
_update_id: update_id,
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user