mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
WIP on split words and synonyms support
This commit is contained in:
parent
c27ea2677f
commit
1db152046e
@ -31,22 +31,27 @@ pub fn make_query_graph<'transaction>(
|
|||||||
query: &str,
|
query: &str,
|
||||||
) -> Result<QueryGraph> {
|
) -> Result<QueryGraph> {
|
||||||
assert!(!query.is_empty());
|
assert!(!query.is_empty());
|
||||||
let fst = index.words_fst(txn).unwrap();
|
let authorize_typos = index.authorize_typos(txn)?;
|
||||||
let query = LocatedQueryTerm::from_query(query.tokenize(), None, |word, is_prefix| {
|
let min_len_one_typo = index.min_word_len_one_typo(txn)?;
|
||||||
word_derivations(
|
let min_len_two_typos = index.min_word_len_two_typos(txn)?;
|
||||||
index,
|
|
||||||
txn,
|
let exact_words = index.exact_words(txn)?;
|
||||||
word,
|
let fst = index.words_fst(txn)?;
|
||||||
if word.len() < 4 {
|
|
||||||
|
// TODO: get rid of this closure
|
||||||
|
// also, ngrams can have one typo?
|
||||||
|
let query = LocatedQueryTerm::from_query(query.tokenize(), None, move |word, is_prefix| {
|
||||||
|
let typos = if !authorize_typos
|
||||||
|
|| word.len() < min_len_one_typo as usize
|
||||||
|
|| exact_words.as_ref().map_or(false, |fst| fst.contains(word))
|
||||||
|
{
|
||||||
0
|
0
|
||||||
} else if word.len() < 100 {
|
} else if word.len() < min_len_two_typos as usize {
|
||||||
1
|
1
|
||||||
} else {
|
} else {
|
||||||
2
|
2
|
||||||
},
|
};
|
||||||
is_prefix,
|
word_derivations(index, txn, word, typos, is_prefix, &fst)
|
||||||
&fst,
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let graph = QueryGraph::from_query(index, txn, db_cache, query)?;
|
let graph = QueryGraph::from_query(index, txn, db_cache, query)?;
|
||||||
|
@ -7,7 +7,7 @@ use super::db_cache::DatabaseCache;
|
|||||||
use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub enum QueryNode {
|
pub enum QueryNode {
|
||||||
Term(LocatedQueryTerm),
|
Term(LocatedQueryTerm),
|
||||||
Deleted,
|
Deleted,
|
||||||
@ -31,7 +31,7 @@ pub struct QueryGraph {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn _assert_sizes() {
|
fn _assert_sizes() {
|
||||||
let _: [u8; 112] = [0; std::mem::size_of::<QueryNode>()];
|
let _: [u8; 184] = [0; std::mem::size_of::<QueryNode>()];
|
||||||
let _: [u8; 48] = [0; std::mem::size_of::<Edges>()];
|
let _: [u8; 48] = [0; std::mem::size_of::<Edges>()];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -116,6 +116,8 @@ impl QueryGraph {
|
|||||||
one_typo: vec![],
|
one_typo: vec![],
|
||||||
two_typos: vec![],
|
two_typos: vec![],
|
||||||
use_prefix_db: false,
|
use_prefix_db: false,
|
||||||
|
synonyms: vec![], // TODO: ngram synonyms
|
||||||
|
split_words: None, // TODO: maybe ngram split words?
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
positions: ngram2_pos,
|
positions: ngram2_pos,
|
||||||
@ -141,6 +143,8 @@ impl QueryGraph {
|
|||||||
one_typo: vec![],
|
one_typo: vec![],
|
||||||
two_typos: vec![],
|
two_typos: vec![],
|
||||||
use_prefix_db: false,
|
use_prefix_db: false,
|
||||||
|
synonyms: vec![], // TODO: ngram synonyms
|
||||||
|
split_words: None, // TODO: maybe ngram split words?
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
positions: ngram3_pos,
|
positions: ngram3_pos,
|
||||||
@ -188,19 +192,20 @@ impl QueryGraph {
|
|||||||
Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() };
|
Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn remove_words_at_position(&mut self, position: i8) {
|
pub fn remove_words_at_position(&mut self, position: i8) -> bool {
|
||||||
let mut nodes_to_remove_keeping_edges = vec![];
|
let mut nodes_to_remove_keeping_edges = vec![];
|
||||||
for (node_idx, node) in self.nodes.iter().enumerate() {
|
for (node_idx, node) in self.nodes.iter().enumerate() {
|
||||||
let node_idx = node_idx as u32;
|
let node_idx = node_idx as u32;
|
||||||
let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue };
|
let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue };
|
||||||
if positions.start() == &position {
|
if positions.start() == &position {
|
||||||
nodes_to_remove_keeping_edges.push(node_idx)
|
nodes_to_remove_keeping_edges.push(node_idx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges);
|
self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges);
|
||||||
|
|
||||||
self.simplify();
|
self.simplify();
|
||||||
|
!nodes_to_remove_keeping_edges.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn simplify(&mut self) {
|
fn simplify(&mut self) {
|
||||||
@ -223,80 +228,3 @@ impl QueryGraph {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl Debug for QueryNode {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
match self {
|
|
||||||
QueryNode::Term(term @ LocatedQueryTerm { value, positions: _ }) => match value {
|
|
||||||
QueryTerm::Word {
|
|
||||||
derivations:
|
|
||||||
WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db },
|
|
||||||
} => {
|
|
||||||
if term.is_empty() {
|
|
||||||
write!(f, "\"{original} (∅)\"")
|
|
||||||
} else {
|
|
||||||
let derivations = std::iter::once(original.clone())
|
|
||||||
.chain(zero_typo.iter().map(|s| format!("T0 .. {s}")))
|
|
||||||
.chain(one_typo.iter().map(|s| format!("T1 .. {s}")))
|
|
||||||
.chain(two_typos.iter().map(|s| format!("T2 .. {s}")))
|
|
||||||
.collect::<Vec<String>>()
|
|
||||||
.join(" | ");
|
|
||||||
|
|
||||||
write!(f, "\"{derivations}")?;
|
|
||||||
if *use_prefix_db {
|
|
||||||
write!(f, " | +prefix_db")?;
|
|
||||||
}
|
|
||||||
write!(f, " | pos:{}..={}", term.positions.start(), term.positions.end())?;
|
|
||||||
write!(f, "\"")?;
|
|
||||||
/*
|
|
||||||
"beautiful" [label = "<f0> beautiful | beauiful | beautifol"]
|
|
||||||
*/
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
QueryTerm::Phrase(ws) => {
|
|
||||||
let joined =
|
|
||||||
ws.iter().filter_map(|x| x.clone()).collect::<Vec<String>>().join(" ");
|
|
||||||
let in_quotes = format!("\"{joined}\"");
|
|
||||||
let escaped = in_quotes.escape_default().collect::<String>();
|
|
||||||
write!(f, "\"{escaped}\"")
|
|
||||||
}
|
|
||||||
},
|
|
||||||
QueryNode::Start => write!(f, "\"START\""),
|
|
||||||
QueryNode::End => write!(f, "\"END\""),
|
|
||||||
QueryNode::Deleted => write!(f, "\"_deleted_\""),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl QueryGraph {
|
|
||||||
pub fn graphviz(&self) -> String {
|
|
||||||
let mut desc = String::new();
|
|
||||||
desc.push_str(
|
|
||||||
r#"
|
|
||||||
digraph G {
|
|
||||||
rankdir = LR;
|
|
||||||
node [shape = "record"]
|
|
||||||
"#,
|
|
||||||
);
|
|
||||||
|
|
||||||
for node in 0..self.nodes.len() {
|
|
||||||
if matches!(self.nodes[node], QueryNode::Deleted) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],));
|
|
||||||
if node == self.root_node as usize {
|
|
||||||
desc.push_str("[color = blue]");
|
|
||||||
} else if node == self.end_node as usize {
|
|
||||||
desc.push_str("[color = red]");
|
|
||||||
}
|
|
||||||
desc.push_str(";\n");
|
|
||||||
|
|
||||||
for edge in self.edges[node].successors.iter() {
|
|
||||||
desc.push_str(&format!("{node} -> {edge};\n"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
desc.push('}');
|
|
||||||
desc
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -10,14 +10,28 @@ use fst::automaton::Str;
|
|||||||
use fst::{Automaton, IntoStreamer, Streamer};
|
use fst::{Automaton, IntoStreamer, Streamer};
|
||||||
use heed::types::DecodeIgnore;
|
use heed::types::DecodeIgnore;
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||||
use crate::search::{build_dfa, get_first};
|
use crate::search::{build_dfa, get_first};
|
||||||
use crate::{Index, Result};
|
use crate::{CboRoaringBitmapLenCodec, Index, Result};
|
||||||
|
|
||||||
|
#[derive(Debug, Default, Clone)]
|
||||||
|
pub struct Phrase {
|
||||||
|
pub words: Vec<Option<String>>,
|
||||||
|
}
|
||||||
|
impl Phrase {
|
||||||
|
pub fn description(&self) -> String {
|
||||||
|
self.words.iter().flatten().join(" ")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct WordDerivations {
|
pub struct WordDerivations {
|
||||||
pub original: String,
|
pub original: String,
|
||||||
|
// TODO: pub prefix_of: Vec<String>,
|
||||||
|
pub synonyms: Vec<Phrase>,
|
||||||
|
pub split_words: Option<(String, String)>,
|
||||||
pub zero_typo: Vec<String>,
|
pub zero_typo: Vec<String>,
|
||||||
pub one_typo: Vec<String>,
|
pub one_typo: Vec<String>,
|
||||||
pub two_typos: Vec<String>,
|
pub two_typos: Vec<String>,
|
||||||
@ -114,19 +128,63 @@ pub fn word_derivations(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
let split_words = split_best_frequency(index, txn, word)?;
|
||||||
|
|
||||||
Ok(WordDerivations { original: word.to_owned(), zero_typo, one_typo, two_typos, use_prefix_db })
|
let synonyms = index.synonyms(txn)?;
|
||||||
|
let synonyms = synonyms
|
||||||
|
.get(&vec![word.to_owned()])
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_default()
|
||||||
|
.into_iter()
|
||||||
|
.map(|words| Phrase { words: words.into_iter().map(Some).collect() })
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Ok(WordDerivations {
|
||||||
|
original: word.to_owned(),
|
||||||
|
synonyms,
|
||||||
|
split_words,
|
||||||
|
zero_typo,
|
||||||
|
one_typo,
|
||||||
|
two_typos,
|
||||||
|
use_prefix_db,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn split_best_frequency(
|
||||||
|
index: &Index,
|
||||||
|
txn: &RoTxn,
|
||||||
|
original: &str,
|
||||||
|
) -> Result<Option<(String, String)>> {
|
||||||
|
let chars = original.char_indices().skip(1);
|
||||||
|
let mut best = None;
|
||||||
|
|
||||||
|
for (i, _) in chars {
|
||||||
|
let (left, right) = original.split_at(i);
|
||||||
|
|
||||||
|
let key = (1, left, right);
|
||||||
|
let frequency = index
|
||||||
|
.word_pair_proximity_docids
|
||||||
|
.remap_data_type::<CboRoaringBitmapLenCodec>()
|
||||||
|
.get(txn, &key)?
|
||||||
|
.unwrap_or(0);
|
||||||
|
|
||||||
|
if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) {
|
||||||
|
best = Some((frequency, left, right));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned())))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub enum QueryTerm {
|
pub enum QueryTerm {
|
||||||
Phrase(Vec<Option<String>>),
|
Phrase { phrase: Phrase },
|
||||||
Word { derivations: WordDerivations },
|
Word { derivations: WordDerivations },
|
||||||
}
|
}
|
||||||
impl QueryTerm {
|
impl QueryTerm {
|
||||||
pub fn original_single_word(&self) -> Option<&str> {
|
pub fn original_single_word(&self) -> Option<&str> {
|
||||||
match self {
|
match self {
|
||||||
QueryTerm::Phrase(_) => None,
|
QueryTerm::Phrase { phrase: _ } => None,
|
||||||
QueryTerm::Word { derivations } => {
|
QueryTerm::Word { derivations } => {
|
||||||
if derivations.is_empty() {
|
if derivations.is_empty() {
|
||||||
None
|
None
|
||||||
@ -140,14 +198,14 @@ impl QueryTerm {
|
|||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct LocatedQueryTerm {
|
pub struct LocatedQueryTerm {
|
||||||
pub value: QueryTerm, // value should be able to contain the word derivations as well
|
pub value: QueryTerm,
|
||||||
pub positions: RangeInclusive<i8>,
|
pub positions: RangeInclusive<i8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LocatedQueryTerm {
|
impl LocatedQueryTerm {
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn is_empty(&self) -> bool {
|
||||||
match &self.value {
|
match &self.value {
|
||||||
QueryTerm::Phrase(_) => false,
|
QueryTerm::Phrase { phrase: _ } => false,
|
||||||
QueryTerm::Word { derivations, .. } => derivations.is_empty(),
|
QueryTerm::Word { derivations, .. } => derivations.is_empty(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -156,6 +214,7 @@ impl LocatedQueryTerm {
|
|||||||
pub fn from_query(
|
pub fn from_query(
|
||||||
query: NormalizedTokenIter<Vec<u8>>,
|
query: NormalizedTokenIter<Vec<u8>>,
|
||||||
words_limit: Option<usize>,
|
words_limit: Option<usize>,
|
||||||
|
// TODO: use index + txn + ? instead of closure
|
||||||
derivations: impl Fn(&str, bool) -> Result<WordDerivations>,
|
derivations: impl Fn(&str, bool) -> Result<WordDerivations>,
|
||||||
) -> Result<Vec<LocatedQueryTerm>> {
|
) -> Result<Vec<LocatedQueryTerm>> {
|
||||||
let mut primitive_query = Vec::new();
|
let mut primitive_query = Vec::new();
|
||||||
@ -232,7 +291,9 @@ impl LocatedQueryTerm {
|
|||||||
&& (quote_count > 0 || separator_kind == SeparatorKind::Hard)
|
&& (quote_count > 0 || separator_kind == SeparatorKind::Hard)
|
||||||
{
|
{
|
||||||
let located_query_term = LocatedQueryTerm {
|
let located_query_term = LocatedQueryTerm {
|
||||||
value: QueryTerm::Phrase(mem::take(&mut phrase)),
|
value: QueryTerm::Phrase {
|
||||||
|
phrase: Phrase { words: mem::take(&mut phrase) },
|
||||||
|
},
|
||||||
positions: phrase_start..=phrase_end,
|
positions: phrase_start..=phrase_end,
|
||||||
};
|
};
|
||||||
primitive_query.push(located_query_term);
|
primitive_query.push(located_query_term);
|
||||||
@ -245,7 +306,7 @@ impl LocatedQueryTerm {
|
|||||||
// If a quote is never closed, we consider all of the end of the query as a phrase.
|
// If a quote is never closed, we consider all of the end of the query as a phrase.
|
||||||
if !phrase.is_empty() {
|
if !phrase.is_empty() {
|
||||||
let located_query_term = LocatedQueryTerm {
|
let located_query_term = LocatedQueryTerm {
|
||||||
value: QueryTerm::Phrase(mem::take(&mut phrase)),
|
value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } },
|
||||||
positions: phrase_start..=phrase_end,
|
positions: phrase_start..=phrase_end,
|
||||||
};
|
};
|
||||||
primitive_query.push(located_query_term);
|
primitive_query.push(located_query_term);
|
||||||
|
@ -5,9 +5,10 @@ use heed::{BytesDecode, RoTxn};
|
|||||||
use roaring::{MultiOps, RoaringBitmap};
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
|
|
||||||
use super::db_cache::DatabaseCache;
|
use super::db_cache::DatabaseCache;
|
||||||
use super::query_term::{QueryTerm, WordDerivations};
|
use super::query_term::{Phrase, QueryTerm, WordDerivations};
|
||||||
use super::QueryGraph;
|
use super::{QueryGraph, QueryNode};
|
||||||
use crate::{Index, Result, RoaringBitmapCodec};
|
|
||||||
|
use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
|
||||||
|
|
||||||
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
|
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@ -27,14 +28,19 @@ impl NodeDocIdsCache {
|
|||||||
return Ok(&self.cache[&node_idx]);
|
return Ok(&self.cache[&node_idx]);
|
||||||
};
|
};
|
||||||
let docids = match term {
|
let docids = match term {
|
||||||
QueryTerm::Phrase(_) => {
|
QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?,
|
||||||
todo!("resolve phrase")
|
|
||||||
}
|
|
||||||
QueryTerm::Word {
|
QueryTerm::Word {
|
||||||
derivations:
|
derivations:
|
||||||
WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db },
|
WordDerivations {
|
||||||
|
original,
|
||||||
|
zero_typo,
|
||||||
|
one_typo,
|
||||||
|
two_typos,
|
||||||
|
use_prefix_db,
|
||||||
|
synonyms,
|
||||||
|
split_words,
|
||||||
|
},
|
||||||
} => {
|
} => {
|
||||||
let derivations_docids = {
|
|
||||||
let mut or_docids = vec![];
|
let mut or_docids = vec![];
|
||||||
for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {
|
for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {
|
||||||
if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {
|
if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {
|
||||||
@ -48,12 +54,23 @@ impl NodeDocIdsCache {
|
|||||||
or_docids.push(prefix_docids);
|
or_docids.push(prefix_docids);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
or_docids
|
let mut docids = or_docids
|
||||||
};
|
|
||||||
let derivations_iter = derivations_docids
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap());
|
.map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap())
|
||||||
MultiOps::union(derivations_iter)
|
.collect::<Vec<_>>();
|
||||||
|
for synonym in synonyms {
|
||||||
|
// TODO: cache resolve_phrase?
|
||||||
|
docids.push(resolve_phrase(index, txn, db_cache, synonym)?);
|
||||||
|
}
|
||||||
|
if let Some((left, right)) = split_words {
|
||||||
|
if let Some(split_word_docids) =
|
||||||
|
db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)?
|
||||||
|
{
|
||||||
|
docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MultiOps::union(docids)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let _ = self.cache.insert(node_idx, docids);
|
let _ = self.cache.insert(node_idx, docids);
|
||||||
@ -90,19 +107,19 @@ pub fn resolve_query_graph<'transaction>(
|
|||||||
let predecessors_docids = MultiOps::union(predecessors_iter);
|
let predecessors_docids = MultiOps::union(predecessors_iter);
|
||||||
|
|
||||||
let n = &q.nodes[node as usize];
|
let n = &q.nodes[node as usize];
|
||||||
// println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}");
|
|
||||||
let node_docids = match n {
|
let node_docids = match n {
|
||||||
super::QueryNode::Term(located_term) => {
|
QueryNode::Term(located_term) => {
|
||||||
let term = &located_term.value;
|
let term = &located_term.value;
|
||||||
let derivations_docids =
|
let derivations_docids =
|
||||||
node_docids_cache.get_docids(index, txn, db_cache, term, node)?;
|
node_docids_cache.get_docids(index, txn, db_cache, term, node)?;
|
||||||
predecessors_docids & derivations_docids
|
predecessors_docids & derivations_docids
|
||||||
}
|
}
|
||||||
super::QueryNode::Deleted => {
|
QueryNode::Deleted => {
|
||||||
panic!()
|
panic!()
|
||||||
}
|
}
|
||||||
super::QueryNode::Start => universe.clone(),
|
QueryNode::Start => universe.clone(),
|
||||||
super::QueryNode::End => {
|
QueryNode::End => {
|
||||||
return Ok(predecessors_docids);
|
return Ok(predecessors_docids);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -125,3 +142,80 @@ pub fn resolve_query_graph<'transaction>(
|
|||||||
|
|
||||||
panic!()
|
panic!()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn resolve_phrase<'transaction>(
|
||||||
|
index: &Index,
|
||||||
|
txn: &'transaction RoTxn,
|
||||||
|
db_cache: &mut DatabaseCache<'transaction>,
|
||||||
|
phrase: &Phrase,
|
||||||
|
) -> Result<RoaringBitmap> {
|
||||||
|
let Phrase { words } = phrase;
|
||||||
|
let mut candidates = RoaringBitmap::new();
|
||||||
|
let mut first_iter = true;
|
||||||
|
let winsize = words.len().min(3);
|
||||||
|
|
||||||
|
if words.is_empty() {
|
||||||
|
return Ok(candidates);
|
||||||
|
}
|
||||||
|
|
||||||
|
for win in words.windows(winsize) {
|
||||||
|
// Get all the documents with the matching distance for each word pairs.
|
||||||
|
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
||||||
|
for (offset, s1) in win
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
|
||||||
|
{
|
||||||
|
for (dist, s2) in win
|
||||||
|
.iter()
|
||||||
|
.skip(offset + 1)
|
||||||
|
.enumerate()
|
||||||
|
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
|
||||||
|
{
|
||||||
|
if dist == 0 {
|
||||||
|
match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? {
|
||||||
|
Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
|
||||||
|
// If there are no documents for this pair, there will be no
|
||||||
|
// results for the phrase query.
|
||||||
|
None => return Ok(RoaringBitmap::new()),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let mut bitmap = RoaringBitmap::new();
|
||||||
|
for dist in 0..=dist {
|
||||||
|
if let Some(m) = db_cache.get_word_pair_proximity_docids(
|
||||||
|
index,
|
||||||
|
txn,
|
||||||
|
s1,
|
||||||
|
s2,
|
||||||
|
dist as u8 + 1,
|
||||||
|
)? {
|
||||||
|
bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if bitmap.is_empty() {
|
||||||
|
return Ok(bitmap);
|
||||||
|
} else {
|
||||||
|
bitmaps.push(bitmap);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We sort the bitmaps so that we perform the small intersections first, which is faster.
|
||||||
|
bitmaps.sort_unstable_by_key(|a| a.len());
|
||||||
|
|
||||||
|
for bitmap in bitmaps {
|
||||||
|
if first_iter {
|
||||||
|
candidates = bitmap;
|
||||||
|
first_iter = false;
|
||||||
|
} else {
|
||||||
|
candidates &= bitmap;
|
||||||
|
}
|
||||||
|
// There will be no match, return early
|
||||||
|
if candidates.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(candidates)
|
||||||
|
}
|
||||||
|
@ -99,14 +99,17 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
|
|||||||
)?;
|
)?;
|
||||||
|
|
||||||
let child_query_graph = query_graph.clone();
|
let child_query_graph = query_graph.clone();
|
||||||
// TODO: Check whether a position exists in the graph before removing it and
|
loop {
|
||||||
// returning the next bucket.
|
|
||||||
// while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() }
|
|
||||||
if self.positions_to_remove.is_empty() {
|
if self.positions_to_remove.is_empty() {
|
||||||
self.exhausted = true;
|
self.exhausted = true;
|
||||||
|
break;
|
||||||
} else {
|
} else {
|
||||||
let position_to_remove = self.positions_to_remove.pop().unwrap();
|
let position_to_remove = self.positions_to_remove.pop().unwrap();
|
||||||
query_graph.remove_words_at_position(position_to_remove);
|
let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove);
|
||||||
|
if did_delete_any_node {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))
|
Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))
|
||||||
|
Loading…
Reference in New Issue
Block a user