mirror of https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 14:54:27 +01:00

Intern all strings and phrases in the search logic

parent 3f1729a17f
commit e8c76cf7bf
@@ -1,51 +1,48 @@
-use std::collections::hash_map::Entry;
+use super::{interner::Interned, SearchContext};
+use crate::Result;
 use fxhash::FxHashMap;
 use heed::types::ByteSlice;
-use heed::RoTxn;
-use crate::{Index, Result};
+use std::collections::hash_map::Entry;

 #[derive(Default)]
-pub struct DatabaseCache<'transaction> {
-    pub word_pair_proximity_docids: FxHashMap<(u8, String, String), Option<&'transaction [u8]>>,
+pub struct DatabaseCache<'search> {
+    // TODO: interner for all database cache keys
+    pub word_pair_proximity_docids:
+        FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
     pub word_prefix_pair_proximity_docids:
-        FxHashMap<(u8, String, String), Option<&'transaction [u8]>>,
+        FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
     pub prefix_word_pair_proximity_docids:
-        FxHashMap<(u8, String, String), Option<&'transaction [u8]>>,
+        FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
-    pub word_docids: FxHashMap<String, Option<&'transaction [u8]>>,
+    pub word_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
-    pub exact_word_docids: FxHashMap<String, Option<&'transaction [u8]>>,
+    pub exact_word_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
-    pub word_prefix_docids: FxHashMap<String, Option<&'transaction [u8]>>,
+    pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
 }
-impl<'transaction> DatabaseCache<'transaction> {
-    pub fn get_word_docids(
-        &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        word: &str,
-    ) -> Result<Option<&'transaction [u8]>> {
-        let bitmap_ptr = match self.word_docids.entry(word.to_owned()) {
+impl<'search> SearchContext<'search> {
+    pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
+        let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
             Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
             Entry::Vacant(entry) => {
-                let bitmap_ptr = index.word_docids.remap_data_type::<ByteSlice>().get(txn, word)?;
+                let bitmap_ptr = self
+                    .index
+                    .word_docids
+                    .remap_data_type::<ByteSlice>()
+                    .get(self.txn, self.word_interner.get(word))?;
                 entry.insert(bitmap_ptr);
                 bitmap_ptr
             }
         };
         Ok(bitmap_ptr)
     }
-    pub fn get_prefix_docids(
-        &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        prefix: &str,
-    ) -> Result<Option<&'transaction [u8]>> {
+    pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
         // In the future, this will be a frozen roaring bitmap
-        let bitmap_ptr = match self.word_prefix_docids.entry(prefix.to_owned()) {
+        let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
             Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
             Entry::Vacant(entry) => {
-                let bitmap_ptr =
-                    index.word_prefix_docids.remap_data_type::<ByteSlice>().get(txn, prefix)?;
+                let bitmap_ptr = self
+                    .index
+                    .word_prefix_docids
+                    .remap_data_type::<ByteSlice>()
+                    .get(self.txn, self.word_interner.get(prefix))?;
                 entry.insert(bitmap_ptr);
                 bitmap_ptr
             }
@@ -55,14 +52,12 @@ impl<'transaction> DatabaseCache<'transaction> {

     pub fn get_word_pair_proximity_docids(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        word1: &str,
-        word2: &str,
+        word1: Interned<String>,
+        word2: Interned<String>,
         proximity: u8,
-    ) -> Result<Option<&'transaction [u8]>> {
-        let key = (proximity, word1.to_owned(), word2.to_owned());
-        match self.word_pair_proximity_docids.entry(key.clone()) {
+    ) -> Result<Option<&'search [u8]>> {
+        let key = (proximity, word1, word2);
+        match self.db_cache.word_pair_proximity_docids.entry(key) {
             Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
             Entry::Vacant(entry) => {
                 // We shouldn't greedily access this DB at all
@@ -86,10 +81,11 @@ impl<'transaction> DatabaseCache<'transaction> {
                 // output.push(word1, word2, proximities);
                 // }
                 // }
-                let bitmap_ptr = index
-                    .word_pair_proximity_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?;
+                let bitmap_ptr =
+                    self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
+                        self.txn,
+                        &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
+                    )?;
                 entry.insert(bitmap_ptr);
                 Ok(bitmap_ptr)
             }
@@ -98,20 +94,22 @@ impl<'transaction> DatabaseCache<'transaction> {

     pub fn get_word_prefix_pair_proximity_docids(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        word1: &str,
-        prefix2: &str,
+        word1: Interned<String>,
+        prefix2: Interned<String>,
         proximity: u8,
-    ) -> Result<Option<&'transaction [u8]>> {
-        let key = (proximity, word1.to_owned(), prefix2.to_owned());
-        match self.word_prefix_pair_proximity_docids.entry(key.clone()) {
+    ) -> Result<Option<&'search [u8]>> {
+        let key = (proximity, word1, prefix2);
+        match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
             Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
             Entry::Vacant(entry) => {
-                let bitmap_ptr = index
+                let bitmap_ptr = self
+                    .index
                     .word_prefix_pair_proximity_docids
                     .remap_data_type::<ByteSlice>()
-                    .get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?;
+                    .get(
+                        self.txn,
+                        &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
+                    )?;
                 entry.insert(bitmap_ptr);
                 Ok(bitmap_ptr)
             }
@@ -119,20 +117,26 @@ impl<'transaction> DatabaseCache<'transaction> {
     }
     pub fn get_prefix_word_pair_proximity_docids(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        left_prefix: &str,
-        right: &str,
+        left_prefix: Interned<String>,
+        right: Interned<String>,
         proximity: u8,
-    ) -> Result<Option<&'transaction [u8]>> {
-        let key = (proximity, left_prefix.to_owned(), right.to_owned());
-        match self.prefix_word_pair_proximity_docids.entry(key) {
+    ) -> Result<Option<&'search [u8]>> {
+        let key = (proximity, left_prefix, right);
+        match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
             Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
             Entry::Vacant(entry) => {
-                let bitmap_ptr = index
+                let bitmap_ptr = self
+                    .index
                     .prefix_word_pair_proximity_docids
                     .remap_data_type::<ByteSlice>()
-                    .get(txn, &(proximity, left_prefix, right))?;
+                    .get(
+                        self.txn,
+                        &(
+                            proximity,
+                            self.word_interner.get(left_prefix),
+                            self.word_interner.get(right),
+                        ),
+                    )?;
                 entry.insert(bitmap_ptr);
                 Ok(bitmap_ptr)
             }
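
Note on the hunks above: every DatabaseCache key that used to own a String (or a (u8, String, String) tuple) is now a copyable Interned<String> handle, so lookups no longer pay for `word.to_owned()` or `key.clone()`, and hashing a key reduces to hashing small integers. A minimal sketch of the idea, using a hypothetical `Interned` stand-in rather than the real milli types:

    use std::collections::HashMap;

    // Stand-in for Interned<String>: a Copy handle into an interner.
    #[derive(Clone, Copy, PartialEq, Eq, Hash)]
    struct Interned(u32);

    struct Cache {
        // Before: HashMap<String, Option<Vec<u8>>>, where every lookup cloned the word.
        // After: the key is a 4-byte handle, cheap to hash and copy.
        word_docids: HashMap<Interned, Option<Vec<u8>>>,
    }

    impl Cache {
        fn get(&mut self, word: Interned) -> Option<&[u8]> {
            // `word` is Copy, so neither branch needs to_owned()/clone().
            self.word_docids
                .entry(word)
                .or_insert_with(|| None /* the real code fetches from LMDB here */)
                .as_deref()
        }
    }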
@@ -1,15 +1,11 @@
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
 use super::ranking_rule_graph::EdgeDocidsCache;
 use super::ranking_rule_graph::EmptyPathsCache;
 use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait};
+use super::SearchContext;
 use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput};
-use crate::{Index, Result};
+use crate::Result;
+use roaring::RoaringBitmap;

 pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
     id: String,
@@ -29,12 +25,10 @@ pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
     cur_distance_idx: usize,
 }

-fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>(
+fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
+    ctx: &mut SearchContext<'search>,
     graph: &mut RankingRuleGraph<G>,
     edge_docids_cache: &mut EdgeDocidsCache<G>,
-    index: &Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
     universe: &RoaringBitmap,
     empty_paths_cache: &mut EmptyPathsCache,
 ) -> Result<()> {
@@ -42,8 +36,7 @@ fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>(
         if graph.all_edges[edge_index as usize].is_none() {
             continue;
         }
-        let docids = edge_docids_cache
-            .get_edge_docids(index, txn, db_cache, edge_index, &*graph, universe)?;
+        let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?;
         match docids {
             BitmapOrAllRef::Bitmap(bitmap) => {
                 if bitmap.is_disjoint(universe) {
@@ -59,7 +52,7 @@ fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>(
     Ok(())
 }

-impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph>
+impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
     for GraphBasedRankingRule<G>
 {
     fn id(&self) -> String {
@@ -67,24 +60,20 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
     }
     fn start_iteration(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
         universe: &RoaringBitmap,
         query_graph: &QueryGraph,
     ) -> Result<()> {
         // TODO: update old state instead of starting from scratch
-        let mut graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?;
+        let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?;
         let mut edge_docids_cache = EdgeDocidsCache::default();
         let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len());

         remove_empty_edges(
+            ctx,
             &mut graph,
             &mut edge_docids_cache,
-            index,
-            txn,
-            db_cache,
             universe,
             &mut empty_paths_cache,
         )?;
@@ -105,20 +94,16 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap

     fn next_bucket(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<QueryGraph>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
         assert!(universe.len() > 1);
         let mut state = self.state.take().unwrap();
         remove_empty_edges(
+            ctx,
             &mut state.graph,
             &mut state.edge_docids_cache,
-            index,
-            txn,
-            db_cache,
             universe,
             &mut state.empty_paths_cache,
         )?;
@@ -151,9 +136,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
         );

         let bucket = state.graph.resolve_paths(
-            index,
-            txn,
-            db_cache,
+            ctx,
             &mut state.edge_docids_cache,
             &mut state.empty_paths_cache,
             universe,
@@ -169,9 +152,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap

     fn end_iteration(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
     ) {
         self.state = None;
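
The same refactor repeats through this whole ranking-rule file: the (index, txn, db_cache) parameter triple that was threaded through every trait method collapses into one `ctx: &mut SearchContext<'search>` borrow, and the `'transaction` lifetime is renamed `'search`. The control flow is untouched; only the plumbing changes. Schematically (signatures abridged, not the exact trait definition):

    // Before:
    // fn next_bucket(&mut self, index: &Index, txn: &'t RoTxn,
    //                db_cache: &mut DatabaseCache<'t>, ...) -> Result<...>
    //
    // After:
    // fn next_bucket(&mut self, ctx: &mut SearchContext<'search>, ...) -> Result<...>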
milli/src/search/new/interner.rs (new file, 78 lines)
@@ -0,0 +1,78 @@
+use fxhash::FxHashMap;
+use std::hash::Hash;
+use std::marker::PhantomData;
+
+pub struct Interned<T> {
+    idx: u32,
+    _phantom: PhantomData<T>,
+}
+
+impl<T> Interned<T> {
+    fn new(idx: u32) -> Self {
+        Self { idx, _phantom: PhantomData }
+    }
+}
+
+pub struct Interner<T> {
+    stable_store: Vec<T>,
+    lookup: FxHashMap<T, Interned<T>>,
+}
+impl<T> Default for Interner<T> {
+    fn default() -> Self {
+        Self { stable_store: Default::default(), lookup: Default::default() }
+    }
+}
+
+impl<T> Interner<T>
+where
+    T: Clone + Eq + Hash,
+{
+    pub fn insert(&mut self, s: T) -> Interned<T> {
+        if let Some(interned) = self.lookup.get(&s) {
+            *interned
+        } else {
+            self.stable_store.push(s.clone());
+            let interned = Interned::new(self.stable_store.len() as u32 - 1);
+            self.lookup.insert(s, interned);
+            interned
+        }
+    }
+    pub fn get(&self, interned: Interned<T>) -> &T {
+        &self.stable_store[interned.idx as usize]
+    }
+}
+
+// Interned<T> boilerplate implementations
+
+impl<T> Hash for Interned<T> {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.idx.hash(state);
+    }
+}
+
+impl<T: Ord> Ord for Interned<T> {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.idx.cmp(&other.idx)
+    }
+}
+
+impl<T> PartialOrd for Interned<T> {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        self.idx.partial_cmp(&other.idx)
+    }
+}
+
+impl<T> Eq for Interned<T> {}
+
+impl<T> PartialEq for Interned<T> {
+    fn eq(&self, other: &Self) -> bool {
+        self.idx == other.idx
+    }
+}
+impl<T> Clone for Interned<T> {
+    fn clone(&self) -> Self {
+        Self { idx: self.idx, _phantom: PhantomData }
+    }
+}
+
+impl<T> Copy for Interned<T> {}
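
The interner in one picture: `insert` deduplicates, so equal strings intern to the same handle, and all later equality checks and hash-map operations work on a u32. An illustrative use of the module above:

    let mut words: Interner<String> = Interner::default();
    let a = words.insert("sunflower".to_owned());
    let b = words.insert("sunflower".to_owned());
    assert!(a == b);                        // deduplicated: same index, no string compare
    assert_eq!(words.get(a), "sunflower");  // resolve the text only when it is needed

Implementing Hash/Eq/Ord on the index alone is only sound because a given interner never hands out two different indexes for equal values, which is exactly what the `lookup` map guarantees.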
@@ -6,7 +6,7 @@ use std::time::Instant;
 use std::{io::Write, path::PathBuf};

 use crate::new::ranking_rule_graph::TypoGraph;
-use crate::new::{QueryNode, QueryGraph};
+use crate::new::{QueryNode, QueryGraph, SearchContext};
 use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
 use crate::new::ranking_rule_graph::EmptyPathsCache;
 use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait};
@@ -176,7 +176,7 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
 }

 impl DetailedSearchLogger {
-    pub fn write_d2_description(&self) {
+    pub fn write_d2_description(&self, ctx: &mut SearchContext) {
         let mut prev_time = self.initial_query_time.unwrap();
         let mut timestamp = vec![];
         fn activated_id(timestamp: &[usize]) -> String {
@@ -193,12 +193,12 @@ impl DetailedSearchLogger {
         writeln!(&mut file, "direction: right").unwrap();
         writeln!(&mut file, "Initial Query Graph: {{").unwrap();
         let initial_query_graph = self.initial_query.as_ref().unwrap();
-        Self::query_graph_d2_description(initial_query_graph, &mut file);
+        Self::query_graph_d2_description(ctx, initial_query_graph, &mut file);
         writeln!(&mut file, "}}").unwrap();

         writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap();
         let query_graph_for_universe = self.query_for_universe.as_ref().unwrap();
-        Self::query_graph_d2_description(query_graph_for_universe, &mut file);
+        Self::query_graph_d2_description(ctx, query_graph_for_universe, &mut file);
         writeln!(&mut file, "}}").unwrap();

         let initial_universe = self.initial_universe.as_ref().unwrap();
@@ -308,7 +308,7 @@ results.{random} {{
         let id = format!("{cur_ranking_rule}.{cur_activated_id}");
         let new_file_path = self.folder_path.join(format!("{id}.d2"));
         let mut new_file = std::fs::File::create(new_file_path).unwrap();
-        Self::query_graph_d2_description(query_graph, &mut new_file);
+        Self::query_graph_d2_description(ctx, query_graph, &mut new_file);
         writeln!(
             &mut file,
             "{id} {{
@@ -323,7 +323,7 @@ results.{random} {{
         let id = format!("{cur_ranking_rule}.{cur_activated_id}");
         let new_file_path = self.folder_path.join(format!("{id}.d2"));
         let mut new_file = std::fs::File::create(new_file_path).unwrap();
-        Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
+        Self::ranking_rule_graph_d2_description(ctx, graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
         writeln!(
             &mut file,
             "{id} {{
@@ -339,7 +339,7 @@ results.{random} {{
         let id = format!("{cur_ranking_rule}.{cur_activated_id}");
         let new_file_path = self.folder_path.join(format!("{id}.d2"));
         let mut new_file = std::fs::File::create(new_file_path).unwrap();
-        Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
+        Self::ranking_rule_graph_d2_description(ctx, graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
         writeln!(
             &mut file,
             "{id} {{
@@ -352,31 +352,40 @@ results.{random} {{
         writeln!(&mut file, "}}").unwrap();
     }

-    fn query_node_d2_desc(node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) {
+    fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) {
         match &node {
             QueryNode::Term(LocatedQueryTerm { value, .. }) => {
                 match value {
                     QueryTerm::Phrase { phrase } => {
-                        let phrase_str = phrase.description();
+                        let phrase = ctx.phrase_interner.get(*phrase);
+                        let phrase_str = phrase.description(&ctx.word_interner);
                         writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap();
                     },
                     QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => {
+                        let original = ctx.word_interner.get(*original);
                         writeln!(file,"{node_idx} : \"{original}\" {{
 shape: class").unwrap();
-                        for w in zero_typo {
+                        for w in zero_typo.iter().copied() {
+                            let w = ctx.word_interner.get(w);
                             writeln!(file, "\"{w}\" : 0").unwrap();
                         }
-                        for w in one_typo {
+                        for w in one_typo.iter().copied() {
+                            let w = ctx.word_interner.get(w);
                             writeln!(file, "\"{w}\" : 1").unwrap();
                         }
-                        for w in two_typos {
+                        for w in two_typos.iter().copied() {
+                            let w = ctx.word_interner.get(w);
                             writeln!(file, "\"{w}\" : 2").unwrap();
                         }
-                        if let Some((left, right)) = split_words {
-                            writeln!(file, "\"{left} {right}\" : split_words").unwrap();
+                        if let Some(split_words) = split_words {
+                            let phrase = ctx.phrase_interner.get(*split_words);
+                            let phrase_str = phrase.description(&ctx.word_interner);
+                            writeln!(file, "\"{phrase_str}\" : split_words").unwrap();
                         }
-                        for synonym in synonyms {
-                            writeln!(file, "\"{}\" : synonym", synonym.description()).unwrap();
+                        for synonym in synonyms.iter().copied() {
+                            let phrase = ctx.phrase_interner.get(synonym);
+                            let phrase_str = phrase.description(&ctx.word_interner);
+                            writeln!(file, "\"{phrase_str}\" : synonym").unwrap();
                         }
                         if *use_prefix_db {
                             writeln!(file, "use prefix DB : true").unwrap();
@@ -398,20 +407,20 @@ shape: class").unwrap();
             },
         }
     }
-    fn query_graph_d2_description(query_graph: &QueryGraph, file: &mut File) {
+    fn query_graph_d2_description(ctx: &mut SearchContext, query_graph: &QueryGraph, file: &mut File) {
         writeln!(file,"direction: right").unwrap();
         for node in 0..query_graph.nodes.len() {
             if matches!(query_graph.nodes[node], QueryNode::Deleted) {
                 continue;
             }
-            Self::query_node_d2_desc(node, &query_graph.nodes[node], &[], file);
+            Self::query_node_d2_desc(ctx, node, &query_graph.nodes[node], &[], file);

             for edge in query_graph.edges[node].successors.iter() {
                 writeln!(file, "{node} -> {edge};\n").unwrap();
             }
         }
     }
-    fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], _empty_paths_cache: &EmptyPathsCache, distances: Vec<Vec<u64>>, file: &mut File) {
+    fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext, graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], _empty_paths_cache: &EmptyPathsCache, distances: Vec<Vec<u64>>, file: &mut File) {
         writeln!(file,"direction: right").unwrap();

         writeln!(file, "Proximity Graph {{").unwrap();
@@ -420,7 +429,7 @@ shape: class").unwrap();
                 continue;
             }
             let distances = &distances[node_idx];
-            Self::query_node_d2_desc(node_idx, node, distances.as_slice(), file);
+            Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file);
         }
         for edge in graph.all_edges.iter().flatten() {
             let Edge { from_node, to_node, details, .. } = edge;
@@ -449,7 +458,7 @@ shape: class").unwrap();


         writeln!(file, "Shortest Paths {{").unwrap();
-        Self::paths_d2_description(graph, paths, file);
+        Self::paths_d2_description(ctx, graph, paths, file);
         writeln!(file, "}}").unwrap();

         // writeln!(file, "Empty Edge Couples {{").unwrap();
@@ -468,15 +477,18 @@ shape: class").unwrap();
         // }
         // writeln!(file, "}}").unwrap();
     }
-    fn edge_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, edge_idx: u32, file: &mut File) {
+    fn edge_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext, graph: &RankingRuleGraph<R>, edge_idx: u32, file: &mut File) {
         let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap();
         let from_node = &graph.query_graph.nodes[*from_node as usize];
         let from_node_desc = match from_node {
             QueryNode::Term(term) => match &term.value {
                 QueryTerm::Phrase { phrase } => {
-                    phrase.description()
+                    let phrase = ctx.phrase_interner.get(*phrase);
+                    phrase.description(&ctx.word_interner)
                 },
-                QueryTerm::Word { derivations } => derivations.original.clone(),
+                QueryTerm::Word { derivations } => {
+                    ctx.word_interner.get(derivations.original).to_owned()
+                },
             },
             QueryNode::Deleted => panic!(),
             QueryNode::Start => "START".to_owned(),
@@ -485,8 +497,11 @@ shape: class").unwrap();
         let to_node = &graph.query_graph.nodes[*to_node as usize];
         let to_node_desc = match to_node {
             QueryNode::Term(term) => match &term.value {
-                QueryTerm::Phrase { phrase } => phrase.description(),
-                QueryTerm::Word { derivations } => derivations.original.clone(),
+                QueryTerm::Phrase { phrase } => {
+                    let phrase = ctx.phrase_interner.get(*phrase);
+                    phrase.description(&ctx.word_interner)
+                },
+                QueryTerm::Word { derivations } => ctx.word_interner.get(derivations.original).to_owned(),
             },
             QueryNode::Deleted => panic!(),
             QueryNode::Start => "START".to_owned(),
@@ -496,11 +511,11 @@ shape: class").unwrap();
 shape: class
 }}").unwrap();
     }
-    fn paths_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], file: &mut File) {
+    fn paths_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext, graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], file: &mut File) {
         for (path_idx, edge_indexes) in paths.iter().enumerate() {
             writeln!(file, "{path_idx} {{").unwrap();
             for edge_idx in edge_indexes.iter() {
-                Self::edge_d2_description(graph, *edge_idx, file);
+                Self::edge_d2_description(ctx, graph, *edge_idx, file);
             }
             for couple_edges in edge_indexes.windows(2) {
                 let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() };
@@ -1,5 +1,6 @@
 mod db_cache;
 mod graph_based_ranking_rule;
+mod interner;
 mod logger;
 mod query_graph;
 mod query_term;
@@ -26,7 +27,9 @@ use query_graph::{QueryGraph, QueryNode};
 use roaring::RoaringBitmap;

 use self::{
+    interner::Interner,
     logger::SearchLogger,
+    query_term::Phrase,
     resolve_query_graph::{resolve_query_graph, NodeDocIdsCache},
 };

@@ -35,14 +38,32 @@ pub enum BitmapOrAllRef<'s> {
     All,
 }

+pub struct SearchContext<'search> {
+    pub index: &'search Index,
+    pub txn: &'search RoTxn<'search>,
+    pub db_cache: DatabaseCache<'search>,
+    pub word_interner: Interner<String>,
+    pub phrase_interner: Interner<Phrase>,
+    pub node_docids_cache: NodeDocIdsCache,
+}
+impl<'search> SearchContext<'search> {
+    pub fn new(index: &'search Index, txn: &'search RoTxn<'search>) -> Self {
+        Self {
+            index,
+            txn,
+            db_cache: <_>::default(),
+            word_interner: <_>::default(),
+            phrase_interner: <_>::default(),
+            node_docids_cache: <_>::default(),
+        }
+    }
+}
+
 #[allow(clippy::too_many_arguments)]
-pub fn resolve_maximally_reduced_query_graph<'transaction>(
-    index: &Index,
-    txn: &'transaction heed::RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
+pub fn resolve_maximally_reduced_query_graph<'search>(
+    ctx: &mut SearchContext<'search>,
     universe: &RoaringBitmap,
     query_graph: &QueryGraph,
-    node_docids_cache: &mut NodeDocIdsCache,
     matching_strategy: TermsMatchingStrategy,
     logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<RoaringBitmap> {
@@ -73,16 +94,14 @@ pub fn resolve_maximally_reduced_query_graph<'transaction>(
         }
     }
     logger.query_for_universe(&graph);
-    let docids = resolve_query_graph(index, txn, db_cache, node_docids_cache, &graph, universe)?;
+    let docids = resolve_query_graph(ctx, &graph, universe)?;

     Ok(docids)
 }

 #[allow(clippy::too_many_arguments)]
-pub fn execute_search<'transaction>(
-    index: &Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
+pub fn execute_search<'search>(
+    ctx: &mut SearchContext<'search>,
     query: &str,
     filters: Option<Filter>,
     from: usize,
@@ -90,26 +109,21 @@ pub fn execute_search<'transaction>(
     logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<Vec<u32>> {
     assert!(!query.is_empty());
-    let query_terms = located_query_terms_from_string(index, txn, query.tokenize(), None).unwrap();
-    let graph = QueryGraph::from_query(index, txn, db_cache, query_terms)?;
+    let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None).unwrap();
+    let graph = QueryGraph::from_query(ctx, query_terms)?;

     logger.initial_query(&graph);

     let universe = if let Some(filters) = filters {
-        filters.evaluate(txn, index)?
+        filters.evaluate(ctx.txn, ctx.index)?
     } else {
-        index.documents_ids(txn)?
+        ctx.index.documents_ids(ctx.txn)?
     };

-    let mut node_docids_cache = NodeDocIdsCache::default();
-
     let universe = resolve_maximally_reduced_query_graph(
-        index,
-        txn,
-        db_cache,
+        ctx,
         &universe,
         &graph,
-        &mut node_docids_cache,
         TermsMatchingStrategy::Last,
         logger,
     )?;
@@ -117,5 +131,5 @@ pub fn execute_search<'transaction>(

     logger.initial_universe(&universe);

-    apply_ranking_rules(index, txn, db_cache, &graph, &universe, from, length, logger)
+    apply_ranking_rules(ctx, &graph, &universe, from, length, logger)
 }
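
With SearchContext in place, a caller builds one context per search and threads it everywhere; the word and phrase interners then live exactly as long as the search, which is what the `'search` lifetime expresses. A sketch of the intended call site (argument values invented; `read_txn` and the logger are assumed to come from the surrounding milli code):

    let txn = index.read_txn()?;
    let mut ctx = SearchContext::new(&index, &txn);
    let docids = execute_search(&mut ctx, "sunflower field", None, 0, 20, &mut logger)?;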
@@ -1,13 +1,10 @@
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
-use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations};
-use crate::{Index, Result};
+use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations};
+use super::SearchContext;
+use crate::Result;
+use roaring::RoaringBitmap;
 use std::fmt::Debug;

-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub enum QueryNode {
     Term(LocatedQueryTerm),
     Deleted,
@@ -22,7 +19,7 @@ pub struct Edges {
     pub successors: RoaringBitmap,
 }

-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub struct QueryGraph {
     pub root_node: u32,
     pub end_node: u32,
@@ -31,8 +28,8 @@ pub struct QueryGraph {
 }

 fn _assert_sizes() {
-    // TODO: QueryNodes are too big now, 184B is an unreasonable size
-    let _: [u8; 184] = [0; std::mem::size_of::<QueryNode>()];
+    // TODO: QueryNodes are too big now, 88B is a bit too big
+    let _: [u8; 88] = [0; std::mem::size_of::<QueryNode>()];
     let _: [u8; 48] = [0; std::mem::size_of::<Edges>()];
 }

@@ -72,19 +69,14 @@ impl QueryGraph {

 impl QueryGraph {
     // TODO: return the list of all matching words here as well
-    pub fn from_query<'transaction>(
-        index: &Index,
-        txn: &RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
-        terms: Vec<LocatedQueryTerm>,
-    ) -> Result<QueryGraph> {
+    pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
         // TODO: maybe empty nodes should not be removed here, to compute
         // the score of the `words` ranking rule correctly
         // it is very easy to traverse the graph and remove afterwards anyway
         // Still, I'm keeping this here as a demo
         let mut empty_nodes = vec![];

-        let word_set = index.words_fst(txn)?;
+        let word_set = ctx.index.words_fst(ctx.txn)?;
         let mut graph = QueryGraph::default();

         let (mut prev2, mut prev1, mut prev0): (Vec<u32>, Vec<u32>, Vec<u32>) =
@@ -105,19 +97,19 @@ impl QueryGraph {

         if !prev1.is_empty() {
             if let Some((ngram2_str, ngram2_pos)) =
-                query_term::ngram2(&query[length - 2], &query[length - 1])
+                query_term::ngram2(ctx, &query[length - 2], &query[length - 1])
             {
-                if word_set.contains(ngram2_str.as_bytes()) {
+                if word_set.contains(ctx.word_interner.get(ngram2_str)) {
                     let ngram2 = LocatedQueryTerm {
                         value: QueryTerm::Word {
                             derivations: WordDerivations {
-                                original: ngram2_str.clone(),
+                                original: ngram2_str,
                                 // TODO: could add a typo if it's an ngram?
-                                zero_typo: vec![ngram2_str],
-                                one_typo: vec![],
-                                two_typos: vec![],
+                                zero_typo: Box::new([ngram2_str]),
+                                one_typo: Box::new([]),
+                                two_typos: Box::new([]),
                                 use_prefix_db: false,
-                                synonyms: vec![], // TODO: ngram synonyms
+                                synonyms: Box::new([]), // TODO: ngram synonyms
                                 split_words: None, // TODO: maybe ngram split words?
                             },
                         },
@@ -129,20 +121,23 @@ impl QueryGraph {
             }
         }
         if !prev2.is_empty() {
-            if let Some((ngram3_str, ngram3_pos)) =
-                query_term::ngram3(&query[length - 3], &query[length - 2], &query[length - 1])
-            {
-                if word_set.contains(ngram3_str.as_bytes()) {
+            if let Some((ngram3_str, ngram3_pos)) = query_term::ngram3(
+                ctx,
+                &query[length - 3],
+                &query[length - 2],
+                &query[length - 1],
+            ) {
+                if word_set.contains(ctx.word_interner.get(ngram3_str)) {
                     let ngram3 = LocatedQueryTerm {
                         value: QueryTerm::Word {
                             derivations: WordDerivations {
-                                original: ngram3_str.clone(),
+                                original: ngram3_str,
                                 // TODO: could add a typo if it's an ngram?
-                                zero_typo: vec![ngram3_str],
-                                one_typo: vec![],
-                                two_typos: vec![],
+                                zero_typo: Box::new([ngram3_str]),
+                                one_typo: Box::new([]),
+                                two_typos: Box::new([]),
                                 use_prefix_db: false,
-                                synonyms: vec![], // TODO: ngram synonyms
+                                synonyms: Box::new([]), // TODO: ngram synonyms
                                 split_words: None, // TODO: maybe ngram split words?
                                 // would be nice for typos like su nflower
                             },
                         },
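
The updated `_assert_sizes` above also measures this commit's payoff: interning shrinks QueryNode from 184 bytes to 88. The function is a poor man's static assertion: `[0; std::mem::size_of::<QueryNode>()]` only type-checks against `[u8; 88]` when the two lengths agree, so any change that grows QueryNode again fails to compile instead of silently bloating every graph node. The same trick for an arbitrary type:

    fn _assert_size_of_u64() {
        // Compiles only while size_of::<u64>() == 8.
        let _: [u8; 8] = [0; std::mem::size_of::<u64>()];
    }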
@@ -16,30 +16,35 @@ use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
 use crate::search::{build_dfa, get_first};
 use crate::{CboRoaringBitmapLenCodec, Index, Result};

-#[derive(Debug, Default, Clone)]
+use super::interner::{Interned, Interner};
+use super::SearchContext;
+
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
 pub struct Phrase {
-    pub words: Vec<Option<String>>,
+    pub words: Vec<Option<Interned<String>>>,
 }
 impl Phrase {
-    pub fn description(&self) -> String {
-        self.words.iter().flatten().join(" ")
+    pub fn description(&self, interner: &Interner<String>) -> String {
+        self.words.iter().flatten().map(|w| interner.get(*w)).join(" ")
     }
 }

-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub struct WordDerivations {
-    pub original: String,
+    pub original: Interned<String>,
     // TODO: pub prefix_of: Vec<String>,
-    pub synonyms: Vec<Phrase>,
-    pub split_words: Option<(String, String)>,
-    pub zero_typo: Vec<String>,
-    pub one_typo: Vec<String>,
-    pub two_typos: Vec<String>,
+    pub synonyms: Box<[Interned<Phrase>]>,
+    pub split_words: Option<Interned<Phrase>>,
+    pub zero_typo: Box<[Interned<String>]>,
+    pub one_typo: Box<[Interned<String>]>,
+    pub two_typos: Box<[Interned<String>]>,
     pub use_prefix_db: bool,
 }
 impl WordDerivations {
-    pub fn all_derivations_except_prefix_db(&self) -> impl Iterator<Item = &String> + Clone {
-        self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter())
+    pub fn all_derivations_except_prefix_db(
+        &'_ self,
+    ) -> impl Iterator<Item = Interned<String>> + Clone + '_ {
+        self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()).copied()
     }
     fn is_empty(&self) -> bool {
         self.zero_typo.is_empty()
@@ -50,15 +55,21 @@ impl WordDerivations {
 }

 pub fn word_derivations(
-    index: &Index,
-    txn: &RoTxn,
+    ctx: &mut SearchContext,
     word: &str,
     max_typo: u8,
     is_prefix: bool,
     fst: &fst::Set<Cow<[u8]>>,
 ) -> Result<WordDerivations> {
+    let word_interned = ctx.word_interner.insert(word.to_owned());
+
     let use_prefix_db = is_prefix
-        && index.word_prefix_docids.remap_data_type::<DecodeIgnore>().get(txn, word)?.is_some();
+        && ctx
+            .index
+            .word_prefix_docids
+            .remap_data_type::<DecodeIgnore>()
+            .get(ctx.txn, word)?
+            .is_some();

     let mut zero_typo = vec![];
     let mut one_typo = vec![];
@@ -70,11 +81,12 @@ pub fn word_derivations(
         let mut stream = fst.search(prefix).into_stream();

         while let Some(word) = stream.next() {
-            let word = std::str::from_utf8(word)?;
-            zero_typo.push(word.to_string());
+            let word = std::str::from_utf8(word)?.to_owned();
+            let word_interned = ctx.word_interner.insert(word);
+            zero_typo.push(word_interned);
         }
     } else if fst.contains(word) {
-        zero_typo.push(word.to_string());
+        zero_typo.push(word_interned);
     }
 } else if max_typo == 1 {
     let dfa = build_dfa(word, 1, is_prefix);
@@ -83,13 +95,14 @@ pub fn word_derivations(

         while let Some((word, state)) = stream.next() {
             let word = std::str::from_utf8(word)?;
+            let word_interned = ctx.word_interner.insert(word.to_owned());
             let d = dfa.distance(state.1);
             match d.to_u8() {
                 0 => {
-                    zero_typo.push(word.to_string());
+                    zero_typo.push(word_interned);
                 }
                 1 => {
-                    one_typo.push(word.to_string());
+                    one_typo.push(word_interned);
                 }
                 _ => panic!(),
             }
@@ -105,47 +118,56 @@ pub fn word_derivations(

         while let Some((found_word, state)) = stream.next() {
             let found_word = std::str::from_utf8(found_word)?;
+            let found_word_interned = ctx.word_interner.insert(found_word.to_owned());
             // in the case the typo is on the first letter, we know the number of typo
             // is two
             if get_first(found_word) != get_first(word) {
-                two_typos.push(found_word.to_string());
+                two_typos.push(found_word_interned);
             } else {
                 // Else, we know that it is the second dfa that matched and compute the
                 // correct distance
                 let d = second_dfa.distance((state.1).0);
                 match d.to_u8() {
                     0 => {
-                        zero_typo.push(found_word.to_string());
+                        zero_typo.push(found_word_interned);
                     }
                     1 => {
-                        one_typo.push(found_word.to_string());
+                        one_typo.push(found_word_interned);
                     }
                     2 => {
-                        two_typos.push(found_word.to_string());
+                        two_typos.push(found_word_interned);
                     }
                     _ => panic!(),
                 }
             }
         }
     }
-    let split_words = split_best_frequency(index, txn, word)?;
+    let split_words = split_best_frequency(ctx.index, ctx.txn, word)?.map(|(l, r)| {
+        ctx.phrase_interner.insert(Phrase {
+            words: vec![Some(ctx.word_interner.insert(l)), Some(ctx.word_interner.insert(r))],
+        })
+    });
+
+    let synonyms = ctx.index.synonyms(ctx.txn)?;

-    let synonyms = index.synonyms(txn)?;
     let synonyms = synonyms
         .get(&vec![word.to_owned()])
         .cloned()
         .unwrap_or_default()
         .into_iter()
-        .map(|words| Phrase { words: words.into_iter().map(Some).collect() })
+        .map(|words| {
+            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
+            ctx.phrase_interner.insert(Phrase { words })
+        })
         .collect();

     Ok(WordDerivations {
-        original: word.to_owned(),
+        original: ctx.word_interner.insert(word.to_owned()),
         synonyms,
         split_words,
-        zero_typo,
-        one_typo,
-        two_typos,
+        zero_typo: zero_typo.into_boxed_slice(),
+        one_typo: one_typo.into_boxed_slice(),
+        two_typos: two_typos.into_boxed_slice(),
         use_prefix_db,
     })
 }
@@ -176,33 +198,36 @@ fn split_best_frequency(
     Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned())))
 }

-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub enum QueryTerm {
     // TODO: should there be SplitWord, NGram2, and NGram3 variants?
     // NGram2 can have 1 typo and synonyms
     // NGram3 cannot have typos but can have synonyms
     // SplitWords are a phrase
     // Can NGrams be prefixes?
-    Phrase { phrase: Phrase },
+    Phrase { phrase: Interned<Phrase> },
     Word { derivations: WordDerivations },
 }

 impl QueryTerm {
-    pub fn original_single_word(&self) -> Option<&str> {
+    pub fn original_single_word<'interner>(
+        &self,
+        word_interner: &'interner Interner<String>,
+    ) -> Option<&'interner str> {
         match self {
             QueryTerm::Phrase { phrase: _ } => None,
             QueryTerm::Word { derivations } => {
                 if derivations.is_empty() {
                     None
                 } else {
-                    Some(derivations.original.as_str())
+                    Some(word_interner.get(derivations.original))
                 }
             }
         }
     }
 }

-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub struct LocatedQueryTerm {
     pub value: QueryTerm,
     pub positions: RangeInclusive<i8>,
@@ -217,18 +242,17 @@ impl LocatedQueryTerm {
     }
 }

-pub fn located_query_terms_from_string<'transaction>(
-    index: &Index,
-    txn: &'transaction RoTxn,
+pub fn located_query_terms_from_string<'search>(
+    ctx: &mut SearchContext<'search>,
     query: NormalizedTokenIter<Vec<u8>>,
     words_limit: Option<usize>,
 ) -> Result<Vec<LocatedQueryTerm>> {
-    let authorize_typos = index.authorize_typos(txn)?;
-    let min_len_one_typo = index.min_word_len_one_typo(txn)?;
-    let min_len_two_typos = index.min_word_len_two_typos(txn)?;
+    let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
+    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
+    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;

-    let exact_words = index.exact_words(txn)?;
-    let fst = index.words_fst(txn)?;
+    let exact_words = ctx.index.exact_words(ctx.txn)?;
+    let fst = ctx.index.words_fst(ctx.txn)?;

     let nbr_typos = |word: &str| {
         if !authorize_typos
@@ -243,10 +267,6 @@ pub fn located_query_terms_from_string<'transaction>(
         }
     };

-    let derivations = |word: &str, is_prefix: bool| {
-        word_derivations(index, txn, word, nbr_typos(word), is_prefix, &fst)
-    };
-
     let mut primitive_query = Vec::new();
     let mut phrase = Vec::new();

@@ -279,14 +299,17 @@ pub fn located_query_terms_from_string<'transaction>(
             if let TokenKind::StopWord = token.kind {
                 phrase.push(None);
             } else {
+                let word = ctx.word_interner.insert(token.lemma().to_string());
                 // TODO: in a phrase, check that every word exists
                 // otherwise return WordDerivations::Empty
-                phrase.push(Some(token.lemma().to_string()));
+                phrase.push(Some(word));
             }
         } else if peekable.peek().is_some() {
             match token.kind {
                 TokenKind::Word => {
-                    let derivations = derivations(token.lemma(), false)?;
+                    let word = token.lemma();
+                    let derivations =
+                        word_derivations(ctx, word, nbr_typos(word), false, &fst)?;
                     let located_term = LocatedQueryTerm {
                         value: QueryTerm::Word { derivations },
                         positions: position..=position,
@@ -296,7 +319,8 @@ pub fn located_query_terms_from_string<'transaction>(
                 TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
             }
         } else {
-            let derivations = derivations(token.lemma(), true)?;
+            let word = token.lemma();
+            let derivations = word_derivations(ctx, word, nbr_typos(word), true, &fst)?;
             let located_term = LocatedQueryTerm {
                 value: QueryTerm::Word { derivations },
                 positions: position..=position,
@@ -323,7 +347,9 @@ pub fn located_query_terms_from_string<'transaction>(
     {
         let located_query_term = LocatedQueryTerm {
             value: QueryTerm::Phrase {
-                phrase: Phrase { words: mem::take(&mut phrase) },
+                phrase: ctx
+                    .phrase_interner
+                    .insert(Phrase { words: mem::take(&mut phrase) }),
             },
             positions: phrase_start..=phrase_end,
         };
@@ -337,7 +363,9 @@ pub fn located_query_terms_from_string<'transaction>(
     // If a quote is never closed, we consider all of the end of the query as a phrase.
     if !phrase.is_empty() {
         let located_query_term = LocatedQueryTerm {
-            value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } },
+            value: QueryTerm::Phrase {
+                phrase: ctx.phrase_interner.insert(Phrase { words: mem::take(&mut phrase) }),
+            },
             positions: phrase_start..=phrase_end,
         };
         primitive_query.push(located_query_term);
@@ -347,35 +375,49 @@ pub fn located_query_terms_from_string<'transaction>(
 }

 // TODO: return a word derivations instead?
-pub fn ngram2(x: &LocatedQueryTerm, y: &LocatedQueryTerm) -> Option<(String, RangeInclusive<i8>)> {
+pub fn ngram2(
+    ctx: &mut SearchContext,
+    x: &LocatedQueryTerm,
+    y: &LocatedQueryTerm,
+) -> Option<(Interned<String>, RangeInclusive<i8>)> {
     if *x.positions.end() != y.positions.start() - 1 {
         return None;
     }
-    match (&x.value.original_single_word(), &y.value.original_single_word()) {
+    match (
+        &x.value.original_single_word(&ctx.word_interner),
+        &y.value.original_single_word(&ctx.word_interner),
+    ) {
         (Some(w1), Some(w2)) => {
-            let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end());
+            let term = (
+                ctx.word_interner.insert(format!("{w1}{w2}")),
+                *x.positions.start()..=*y.positions.end(),
+            );
             Some(term)
         }
         _ => None,
     }
 }
 pub fn ngram3(
+    ctx: &mut SearchContext,
||||||
x: &LocatedQueryTerm,
|
x: &LocatedQueryTerm,
|
||||||
y: &LocatedQueryTerm,
|
y: &LocatedQueryTerm,
|
||||||
z: &LocatedQueryTerm,
|
z: &LocatedQueryTerm,
|
||||||
) -> Option<(String, RangeInclusive<i8>)> {
|
) -> Option<(Interned<String>, RangeInclusive<i8>)> {
|
||||||
if *x.positions.end() != y.positions.start() - 1
|
if *x.positions.end() != y.positions.start() - 1
|
||||||
|| *y.positions.end() != z.positions.start() - 1
|
|| *y.positions.end() != z.positions.start() - 1
|
||||||
{
|
{
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
match (
|
match (
|
||||||
&x.value.original_single_word(),
|
&x.value.original_single_word(&ctx.word_interner),
|
||||||
&y.value.original_single_word(),
|
&y.value.original_single_word(&ctx.word_interner),
|
||||||
&z.value.original_single_word(),
|
&z.value.original_single_word(&ctx.word_interner),
|
||||||
) {
|
) {
|
||||||
(Some(w1), Some(w2), Some(w3)) => {
|
(Some(w1), Some(w2), Some(w3)) => {
|
||||||
let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end());
|
let term = (
|
||||||
|
ctx.word_interner.insert(format!("{w1}{w2}{w3}")),
|
||||||
|
*x.positions.start()..=*z.positions.end(),
|
||||||
|
);
|
||||||
Some(term)
|
Some(term)
|
||||||
}
|
}
|
||||||
_ => None,
|
_ => None,
|
||||||
|
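
The pattern driving all of these changes: every owned `String` or `Phrase` becomes a small `Interned<T>` handle resolved through `ctx.word_interner` / `ctx.phrase_interner`. The interner module itself is not part of this excerpt, so the following is only a minimal sketch of what the call sites above assume: `insert` deduplicates, `get` resolves, and the handle is `Copy`. The field and method names below are illustrative, not the commit's actual definitions.

use std::collections::HashMap;
use std::hash::Hash;
use std::marker::PhantomData;

// A typed index into the interner's arena; copying it copies a u32.
pub struct Interned<T> {
    idx: u32,
    _phantom: PhantomData<T>,
}
impl<T> Clone for Interned<T> {
    fn clone(&self) -> Self {
        *self
    }
}
impl<T> Copy for Interned<T> {}

// Deduplicating store: inserting an equal value twice returns the same id,
// so equality checks on interned values become integer comparisons.
pub struct Interner<T: Eq + Hash + Clone> {
    stable_store: Vec<T>,
    lookup: HashMap<T, u32>,
}
impl<T: Eq + Hash + Clone> Interner<T> {
    pub fn new() -> Self {
        Self { stable_store: Vec::new(), lookup: HashMap::new() }
    }
    pub fn insert(&mut self, value: T) -> Interned<T> {
        let idx = match self.lookup.get(&value) {
            Some(&idx) => idx,
            None => {
                let idx = self.stable_store.len() as u32;
                self.stable_store.push(value.clone());
                self.lookup.insert(value, idx);
                idx
            }
        };
        Interned { idx, _phantom: PhantomData }
    }
    pub fn get(&self, interned: Interned<T>) -> &T {
        &self.stable_store[interned.idx as usize]
    }
}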
@@ -1,18 +1,10 @@
-use heed::RoTxn;
+use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
+use crate::new::{QueryGraph, SearchContext};
+use crate::Result;
 use roaring::RoaringBitmap;

-use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
-use crate::new::db_cache::DatabaseCache;
-use crate::new::QueryGraph;
-use crate::{Index, Result};
-
 impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
-    pub fn build<'db_cache, 'transaction: 'db_cache>(
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
-        query_graph: QueryGraph,
-    ) -> Result<Self> {
+    pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result<Self> {
         let mut ranking_rule_graph =
             Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] };

@@ -22,12 +14,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
             let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap();
             let new_successors = ranking_rule_graph.successors.last_mut().unwrap();

-            let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue };
+            let Some(from_node_data) = G::build_visit_from_node(ctx, node)? else { continue };

             for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() {
                 let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize];
-                let mut edges =
-                    G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)?;
+                let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?;
                 if edges.is_empty() {
                     continue;
                 }
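
Every `(index, txn, db_cache, …)` parameter list in this commit collapses into a single `ctx: &mut SearchContext`. The struct's definition is outside this excerpt; judging from the field accesses in the diff (`ctx.index`, `ctx.txn`, `ctx.word_interner`, `ctx.phrase_interner`, plus the `db_cache` and `node_docids_cache` methods moved onto it), a hypothetical reconstruction would be the following. It reuses the `Interner` sketch above; the real definition may differ in fields and visibility.

// Hypothetical shape inferred from call sites; not the commit's actual code.
pub struct SearchContext<'search> {
    pub index: &'search Index,
    pub txn: &'search RoTxn<'search>,
    pub db_cache: DatabaseCache<'search>,
    pub word_interner: Interner<String>,
    pub phrase_interner: Interner<Phrase>,
    pub node_docids_cache: NodeDocIdsCache,
}

impl<'search> SearchContext<'search> {
    pub fn new(index: &'search Index, txn: &'search RoTxn<'search>) -> Self {
        Self {
            index,
            txn,
            db_cache: DatabaseCache::default(),
            word_interner: Interner::new(),
            phrase_interner: Interner::new(),
            node_docids_cache: NodeDocIdsCache::default(),
        }
    }
}

This matches the `SearchContext::new(&index, &txn)` call that appears in the tests further down: the caches and interners start empty and are filled lazily over the lifetime of one search.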
@@ -1,13 +1,10 @@
 use std::marker::PhantomData;

-use fxhash::FxHashMap;
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
 use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
-use crate::new::db_cache::DatabaseCache;
-use crate::new::BitmapOrAllRef;
-use crate::{Index, Result};
+use crate::new::{BitmapOrAllRef, SearchContext};
+use crate::Result;
+use fxhash::FxHashMap;
+use roaring::RoaringBitmap;

 // TODO: the cache should have a G::EdgeDetails as key
 // but then it means that we should have a quick way of
@@ -25,11 +22,9 @@ impl<G: RankingRuleGraphTrait> Default for EdgeDocidsCache<G> {
     }
 }
 impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
-    pub fn get_edge_docids<'s, 'transaction>(
+    pub fn get_edge_docids<'s, 'search>(
         &'s mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         edge_index: u32,
         graph: &RankingRuleGraph<G>,
         // TODO: maybe universe doesn't belong here
@@ -46,7 +41,7 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
             return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
         }
         // TODO: maybe universe doesn't belong here
-        let docids = universe & G::compute_docids(index, txn, db_cache, details)?;
+        let docids = universe & G::compute_docids(ctx, details)?;
         let _ = self.cache.insert(edge_index, docids);
         let docids = &self.cache[&edge_index];
         Ok(BitmapOrAllRef::Bitmap(docids))
@@ -7,20 +7,15 @@ mod proximity;
 mod resolve_paths;
 mod typo;

+use super::logger::SearchLogger;
+use super::{QueryGraph, QueryNode, SearchContext};
+use crate::Result;
 pub use edge_docids_cache::EdgeDocidsCache;
 pub use empty_paths_cache::EmptyPathsCache;
 pub use proximity::ProximityGraph;
-pub use typo::TypoGraph;
-
-use std::ops::ControlFlow;
-
-use heed::RoTxn;
 use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
-use super::logger::SearchLogger;
-use super::{QueryGraph, QueryNode};
-use crate::{Index, Result};
+use std::ops::ControlFlow;
+pub use typo::TypoGraph;

 #[derive(Debug, Clone)]
 pub enum EdgeDetails<E> {
@@ -42,6 +37,48 @@ pub struct EdgePointer<'graph, E> {
     pub edge: &'graph Edge<E>,
 }

+// pub struct SubWordDerivations {
+//     words: FxHashSet<Interned<String>>,
+//     synonyms: FxHashSet<Interned<Phrase>>, // NO! they're phrases, not strings
+//     split_words: bool,
+//     use_prefix_db: bool,
+// }
+
+// pub struct EdgeWordDerivations {
+//     // TODO: not Option, instead: Any | All | Subset(SubWordDerivations)
+//     from_words: Option<SubWordDerivations>, // ???
+//     to_words: Option<SubWordDerivations>, // + use prefix db?
+// }
+
+// fn aggregate_edge_word_derivations(
+//     graph: (),
+//     edges: Vec<usize>,
+// ) -> BTreeMap<usize, SubWordDerivations> {
+//     todo!()
+// }
+
+// fn reduce_word_term_to_sub_word_derivations(
+//     term: &mut WordDerivations,
+//     derivations: &SubWordDerivations,
+// ) {
+//     let mut new_one_typo = vec![];
+//     for w in term.one_typo {
+//         if derivations.words.contains(w) {
+//             new_one_typo.push(w);
+//         }
+//     }
+//     if term.use_prefix_db && !derivations.use_prefix_db {
+//         term.use_prefix_db = false;
+//     }
+//     // etc.
+// }
+
+// fn word_derivations_used_by_edge<G: RankingRuleGraphTrait>(
+//     edge: G::EdgeDetails,
+// ) -> SubWordDerivations {
+//     todo!()
+// }
+
 pub trait RankingRuleGraphTrait: Sized {
     /// The details of an edge connecting two query nodes. These details
     /// should be sufficient to compute the edge's cost and associated document ids
@@ -55,10 +92,8 @@ pub trait RankingRuleGraphTrait: Sized {
     fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String;

     /// Compute the document ids associated with the given edge.
-    fn compute_docids<'transaction>(
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+    fn compute_docids<'search>(
+        ctx: &mut SearchContext<'search>,
         edge_details: &Self::EdgeDetails,
     ) -> Result<RoaringBitmap>;

@@ -66,19 +101,15 @@ pub trait RankingRuleGraphTrait: Sized {
     ///
     /// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node),
     /// which builds the actual edges.
-    fn build_visit_from_node<'transaction>(
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+    fn build_visit_from_node<'search>(
+        ctx: &mut SearchContext<'search>,
         from_node: &QueryNode,
     ) -> Result<Option<Self::BuildVisitedFromNode>>;

     /// Return the cost and details of the edges going from the previously visited node
     /// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`.
-    fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+    fn build_visit_to_node<'from_data, 'search: 'from_data>(
+        ctx: &mut SearchContext<'search>,
         to_node: &QueryNode,
         from_node_data: &'from_data Self::BuildVisitedFromNode,
     ) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>;
@@ -1,30 +1,30 @@
-use std::collections::BTreeMap;
-
-use heed::RoTxn;
-use itertools::Itertools;
-
 use super::ProximityEdge;
-use crate::new::db_cache::DatabaseCache;
 use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
 use crate::new::ranking_rule_graph::proximity::WordPair;
 use crate::new::ranking_rule_graph::EdgeDetails;
-use crate::new::QueryNode;
-use crate::{Index, Result};
+use crate::new::{QueryNode, SearchContext};
+use crate::Result;
+use itertools::Itertools;
+use std::collections::BTreeMap;

-pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations, i8)>> {
+pub fn visit_from_node(
+    ctx: &mut SearchContext,
+    from_node: &QueryNode,
+) -> Result<Option<(WordDerivations, i8)>> {
     Ok(Some(match from_node {
         QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 {
             QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),
             QueryTerm::Phrase { phrase: phrase1 } => {
-                if let Some(original) = phrase1.words.last().unwrap().as_ref() {
+                let phrase1 = ctx.phrase_interner.get(*phrase1);
+                if let Some(original) = *phrase1.words.last().unwrap() {
                     (
                         WordDerivations {
-                            original: original.clone(),
-                            zero_typo: vec![original.to_owned()],
-                            one_typo: vec![],
-                            two_typos: vec![],
+                            original,
+                            zero_typo: Box::new([original]),
+                            one_typo: Box::new([]),
+                            two_typos: Box::new([]),
                             use_prefix_db: false,
-                            synonyms: vec![],
+                            synonyms: Box::new([]),
                             split_words: None,
                         },
                         *pos1.end(),
@@ -37,12 +37,12 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
         },
         QueryNode::Start => (
             WordDerivations {
-                original: String::new(),
-                zero_typo: vec![],
-                one_typo: vec![],
-                two_typos: vec![],
+                original: ctx.word_interner.insert(String::new()),
+                zero_typo: Box::new([]),
+                one_typo: Box::new([]),
+                two_typos: Box::new([]),
                 use_prefix_db: false,
-                synonyms: vec![],
+                synonyms: Box::new([]),
                 split_words: None,
             },
             -100,
@@ -51,10 +51,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
     }))
 }

-pub fn visit_to_node<'transaction, 'from_data>(
-    index: &Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
+pub fn visit_to_node<'search, 'from_data>(
+    ctx: &mut SearchContext<'search>,
     to_node: &QueryNode,
     from_node_data: &'from_data (WordDerivations, i8),
 ) -> Result<Vec<(u8, EdgeDetails<ProximityEdge>)>> {
@@ -69,15 +67,16 @@ pub fn visit_to_node<'transaction, 'from_data>(
     let (derivations2, pos2, ngram_len2) = match value2 {
         QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
         QueryTerm::Phrase { phrase: phrase2 } => {
-            if let Some(original) = phrase2.words.first().unwrap().as_ref() {
+            let phrase2 = ctx.phrase_interner.get(*phrase2);
+            if let Some(original) = *phrase2.words.first().unwrap() {
                 (
                     WordDerivations {
-                        original: original.clone(),
-                        zero_typo: vec![original.to_owned()],
-                        one_typo: vec![],
-                        two_typos: vec![],
+                        original,
+                        zero_typo: Box::new([original]),
+                        one_typo: Box::new([]),
+                        two_typos: Box::new([]),
                         use_prefix_db: false,
-                        synonyms: vec![],
+                        synonyms: Box::new([]),
                         split_words: None,
                     },
                     *pos2.start(),
@@ -106,19 +105,16 @@ pub fn visit_to_node<'transaction, 'from_data>(

     let derivations1 = derivations1.all_derivations_except_prefix_db();
     // TODO: eventually, we want to get rid of the uses from `orginal`
-    let original_word_2 = derivations2.original.clone();
     let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();

     if updb2 {
         for word1 in derivations1.clone() {
             for proximity in 1..=(8 - ngram_len2) {
                 let cost = (proximity + ngram_len2 - 1) as u8;
-                if db_cache
+                if ctx
                     .get_word_prefix_pair_proximity_docids(
-                        index,
-                        txn,
                         word1,
-                        original_word_2.as_str(),
+                        derivations2.original,
                         proximity as u8,
                     )?
                     .is_some()
@@ -129,16 +125,14 @@ pub fn visit_to_node<'transaction, 'from_data>(
                         .entry(proximity as u8)
                         .or_default()
                         .push(WordPair::WordPrefix {
-                            left: word1.to_owned(),
-                            right_prefix: original_word_2.to_owned(),
+                            left: word1,
+                            right_prefix: derivations2.original,
                         });
                 }
-                if db_cache
+                if ctx
                     .get_prefix_word_pair_proximity_docids(
-                        index,
-                        txn,
-                        original_word_2.as_str(),
-                        word1.as_str(),
+                        derivations2.original,
+                        word1,
                         proximity as u8 - 1,
                     )?
                     .is_some()
@@ -149,8 +143,8 @@ pub fn visit_to_node<'transaction, 'from_data>(
                         .entry(proximity as u8)
                         .or_default()
                         .push(WordPair::WordPrefixSwapped {
-                            left_prefix: original_word_2.to_owned(),
-                            right: word1.to_owned(),
+                            left_prefix: derivations2.original,
+                            right: word1,
                         });
                 }
             }
@@ -164,28 +158,23 @@ pub fn visit_to_node<'transaction, 'from_data>(
     for (word1, word2) in product_derivations {
         for proximity in 1..=(8 - ngram_len2) {
             let cost = (proximity + ngram_len2 - 1) as u8;
-            if db_cache
-                .get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)?
-                .is_some()
-            {
+            if ctx.get_word_pair_proximity_docids(word1, word2, proximity as u8)?.is_some() {
                 cost_proximity_word_pairs
                     .entry(cost)
                     .or_default()
                     .entry(proximity as u8)
                     .or_default()
-                    .push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() });
+                    .push(WordPair::Words { left: word1, right: word2 });
             }
             if proximity > 1
-                && db_cache
-                    .get_word_pair_proximity_docids(index, txn, word2, word1, proximity as u8 - 1)?
-                    .is_some()
+                && ctx.get_word_pair_proximity_docids(word2, word1, proximity as u8 - 1)?.is_some()
             {
                 cost_proximity_word_pairs
                     .entry(cost)
                     .or_default()
                     .entry(proximity as u8 - 1)
                     .or_default()
-                    .push(WordPair::Words { left: word2.to_owned(), right: word1.to_owned() });
+                    .push(WordPair::Words { left: word2, right: word1 });
             }
         }
     }
@@ -1,14 +1,10 @@
-use heed::RoTxn;
+use super::{ProximityEdge, WordPair};
+use crate::new::SearchContext;
+use crate::{CboRoaringBitmapCodec, Result};
 use roaring::{MultiOps, RoaringBitmap};

-use super::{ProximityEdge, WordPair};
-use crate::new::db_cache::DatabaseCache;
-use crate::{CboRoaringBitmapCodec, Result};
-
-pub fn compute_docids<'transaction>(
-    index: &crate::Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
+pub fn compute_docids<'search>(
+    ctx: &mut SearchContext<'search>,
     edge: &ProximityEdge,
 ) -> Result<RoaringBitmap> {
     let ProximityEdge { pairs, proximity } = edge;
@@ -16,12 +12,14 @@ pub fn compute_docids<'transaction>(
     for pair in pairs.iter() {
         let bytes = match pair {
             WordPair::Words { left, right } => {
-                db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity)
+                ctx.get_word_pair_proximity_docids(*left, *right, *proximity)
             }
-            WordPair::WordPrefix { left, right_prefix } => db_cache
-                .get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity),
-            WordPair::WordPrefixSwapped { left_prefix, right } => db_cache
-                .get_prefix_word_pair_proximity_docids(index, txn, left_prefix, right, *proximity),
+            WordPair::WordPrefix { left, right_prefix } => {
+                ctx.get_word_prefix_pair_proximity_docids(*left, *right_prefix, *proximity)
+            }
+            WordPair::WordPrefixSwapped { left_prefix, right } => {
+                ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity)
+            }
         }?;
         let bitmap =
             bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();
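
All three pair shapes now funnel through `ctx` accessors that return the raw LMDB value (`Option<&[u8]>`), so decoding stays in one place. A sketch of that decode step in isolation, where `decode_pair_docids` is an invented helper name and `Result` is the crate's result type, mirroring the `.map(...).transpose()?.unwrap_or_default()` line above:

// A missing key decodes to the empty bitmap; a present key may still fail
// to deserialize, which the `?` propagates.
fn decode_pair_docids(bytes: Option<&[u8]>) -> Result<RoaringBitmap> {
    let bitmap = bytes
        .map(CboRoaringBitmapCodec::deserialize_from)
        .transpose()?
        .unwrap_or_default();
    Ok(bitmap)
}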
@@ -1,25 +1,22 @@
 pub mod build;
 pub mod compute_docids;

-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
 use super::empty_paths_cache::EmptyPathsCache;

 use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
-use crate::new::db_cache::DatabaseCache;
+use crate::new::interner::Interned;
 use crate::new::logger::SearchLogger;
 use crate::new::query_term::WordDerivations;
-use crate::new::{QueryGraph, QueryNode};
-use crate::{Index, Result};
+use crate::new::{QueryGraph, QueryNode, SearchContext};
+use crate::Result;
+use roaring::RoaringBitmap;

 // TODO: intern the strings, refer to them by their pointer?

-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub enum WordPair {
-    Words { left: String, right: String },
-    WordPrefix { left: String, right_prefix: String },
-    WordPrefixSwapped { left_prefix: String, right: String },
+    Words { left: Interned<String>, right: Interned<String> },
+    WordPrefix { left: Interned<String>, right_prefix: Interned<String> },
+    WordPrefixSwapped { left_prefix: Interned<String>, right: Interned<String> },
 }

 #[derive(Clone)]
@@ -40,32 +37,26 @@ impl RankingRuleGraphTrait for ProximityGraph {
         format!(", prox {proximity}, {} pairs", pairs.len())
     }

-    fn compute_docids<'db_cache, 'transaction>(
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+    fn compute_docids<'search>(
+        ctx: &mut SearchContext<'search>,
         edge: &Self::EdgeDetails,
     ) -> Result<roaring::RoaringBitmap> {
-        compute_docids::compute_docids(index, txn, db_cache, edge)
+        compute_docids::compute_docids(ctx, edge)
     }

-    fn build_visit_from_node<'transaction>(
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+    fn build_visit_from_node<'search>(
+        ctx: &mut SearchContext<'search>,
         from_node: &QueryNode,
     ) -> Result<Option<Self::BuildVisitedFromNode>> {
-        build::visit_from_node(from_node)
+        build::visit_from_node(ctx, from_node)
     }

-    fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+    fn build_visit_to_node<'from_data, 'search: 'from_data>(
+        ctx: &mut SearchContext<'search>,
         to_node: &QueryNode,
         from_node_data: &'from_data Self::BuildVisitedFromNode,
     ) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
-        build::visit_to_node(index, txn, db_cache, to_node, from_node_data)
+        build::visit_to_node(ctx, to_node, from_node_data)
     }

     fn log_state(
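
A side effect worth noting in this file: because the (assumed) `Interned<String>` handle is `Copy`, every `to_owned()` disappears from the `WordPair` construction sites, and `Clone` on `WordPair` no longer allocates. A small sketch under those assumptions, where `example_pair` is an invented helper, not part of the commit:

// Building a pair allocates at most once, inside the interner, and only for
// words never seen before; the pair itself holds two plain ids.
fn example_pair(ctx: &mut SearchContext) -> WordPair {
    let left = ctx.word_interner.insert("quick".to_string());
    let right = ctx.word_interner.insert("brown".to_string());
    WordPair::Words { left, right } // ids are Copy: no clones here
}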
@@ -1,23 +1,18 @@
 #![allow(clippy::too_many_arguments)]

-use heed::RoTxn;
-use roaring::{MultiOps, RoaringBitmap};
-
 use super::edge_docids_cache::EdgeDocidsCache;
 use super::empty_paths_cache::EmptyPathsCache;

 use super::{RankingRuleGraph, RankingRuleGraphTrait};
-use crate::new::db_cache::DatabaseCache;
-use crate::new::BitmapOrAllRef;
-use crate::{Index, Result};
+use crate::new::{BitmapOrAllRef, SearchContext};
+use crate::Result;
+use roaring::{MultiOps, RoaringBitmap};

 impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
-    pub fn resolve_paths<'transaction>(
+    // TODO: reduce the universe after computing each path
+    // TODO: deserialize roaring bitmap within a universe
+    pub fn resolve_paths<'search>(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         edge_docids_cache: &mut EdgeDocidsCache<G>,
         empty_paths_cache: &mut EmptyPathsCache,
         universe: &RoaringBitmap,
@@ -52,8 +47,8 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
         let mut cached_edge_docids = vec![];
         'edge_loop: for edge_index in edge_indexes {
             visited_edges.push(edge_index);
-            let edge_docids = edge_docids_cache
-                .get_edge_docids(index, txn, db_cache, edge_index, self, universe)?;
+            let edge_docids =
+                edge_docids_cache.get_edge_docids(ctx, edge_index, self, universe)?;
             match edge_docids {
                 BitmapOrAllRef::Bitmap(edge_docids) => {
                     cached_edge_docids.push((edge_index, edge_docids.clone()));
@@ -1,19 +1,17 @@
-use heed::{BytesDecode, RoTxn};
-use roaring::RoaringBitmap;
-
 use super::empty_paths_cache::EmptyPathsCache;

 use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
-use crate::new::db_cache::DatabaseCache;
+use crate::new::interner::Interned;
 use crate::new::logger::SearchLogger;
 use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
 use crate::new::resolve_query_graph::resolve_phrase;
-use crate::new::{QueryGraph, QueryNode};
-use crate::{Index, Result, RoaringBitmapCodec};
+use crate::new::{QueryGraph, QueryNode, SearchContext};
+use crate::{Result, RoaringBitmapCodec};
+use heed::BytesDecode;
+use roaring::RoaringBitmap;

 #[derive(Clone)]
 pub enum TypoEdge {
-    Phrase { phrase: Phrase },
+    Phrase { phrase: Interned<Phrase> },
     Word { derivations: WordDerivations, nbr_typos: u8 },
 }

@@ -30,14 +28,12 @@ impl RankingRuleGraphTrait for TypoGraph {
         }
     }

-    fn compute_docids<'db_cache, 'transaction>(
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+    fn compute_docids<'db_cache, 'search>(
+        ctx: &mut SearchContext<'search>,
         edge: &Self::EdgeDetails,
     ) -> Result<RoaringBitmap> {
         match edge {
-            TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase),
+            TypoEdge::Phrase { phrase } => resolve_phrase(ctx, *phrase),
             TypoEdge::Word { derivations, nbr_typos } => {
                 let words = match nbr_typos {
                     0 => &derivations.zero_typo,
@@ -46,16 +42,14 @@ impl RankingRuleGraphTrait for TypoGraph {
                     _ => panic!(),
                 };
                 let mut docids = RoaringBitmap::new();
-                for word in words.iter() {
-                    let Some(bytes) = db_cache.get_word_docids(index, txn, word)? else { continue };
+                for word in words.iter().copied() {
+                    let Some(bytes) = ctx.get_word_docids(word)? else { continue };
                     let bitmap =
                         RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
                     docids |= bitmap;
                 }
                 if *nbr_typos == 0 {
-                    if let Some(bytes) =
-                        db_cache.get_prefix_docids(index, txn, &derivations.original)?
-                    {
+                    if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? {
                         let bitmap =
                             RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
                         docids |= bitmap;
@@ -66,26 +60,22 @@ impl RankingRuleGraphTrait for TypoGraph {
         }
     }

-    fn build_visit_from_node<'transaction>(
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+    fn build_visit_from_node<'search>(
+        _ctx: &mut SearchContext<'search>,
         _from_node: &QueryNode,
     ) -> Result<Option<Self::BuildVisitedFromNode>> {
         Ok(Some(()))
     }

-    fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+    fn build_visit_to_node<'from_data, 'search: 'from_data>(
+        _ctx: &mut SearchContext<'search>,
         to_node: &QueryNode,
         _from_node_data: &'from_data Self::BuildVisitedFromNode,
     ) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
         match to_node {
             QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
-                QueryTerm::Phrase { phrase } => {
-                    Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))])
+                &QueryTerm::Phrase { phrase } => {
+                    Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase }))])
                 }
                 QueryTerm::Word { derivations } => {
                     let mut edges = vec![];
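
The `&QueryTerm::Phrase { phrase }` pattern above is the idiom that makes `phrase.clone()` unnecessary: matching through the reference with an explicit `&` binds the `Interned<Phrase>` field by value, which is allowed because the handle is `Copy`. The same idiom in isolation, as a sketch (`phrase_id` is an invented name):

// Extracting the interned phrase id from a borrowed term copies only the id,
// never the phrase's word list.
fn phrase_id(value: &QueryTerm) -> Option<Interned<Phrase>> {
    match value {
        &QueryTerm::Phrase { phrase } => Some(phrase),
        QueryTerm::Word { .. } => None,
    }
}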
@@ -1,33 +1,28 @@
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;

 use super::QueryGraph;
+use super::SearchContext;
 use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
 use crate::new::ranking_rule_graph::ProximityGraph;
 use crate::new::ranking_rule_graph::TypoGraph;
 use crate::new::words::Words;
+use roaring::RoaringBitmap;
 // use crate::search::new::sort::Sort;
-use crate::{Index, Result, TermsMatchingStrategy};
+use crate::{Result, TermsMatchingStrategy};

-pub trait RankingRuleOutputIter<'transaction, Query> {
+pub trait RankingRuleOutputIter<'search, Query> {
     fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
 }

-pub struct RankingRuleOutputIterWrapper<'transaction, Query> {
-    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
+pub struct RankingRuleOutputIterWrapper<'search, Query> {
+    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'search>,
 }
-impl<'transaction, Query> RankingRuleOutputIterWrapper<'transaction, Query> {
-    pub fn new(
-        iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
-    ) -> Self {
+impl<'search, Query> RankingRuleOutputIterWrapper<'search, Query> {
+    pub fn new(iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'search>) -> Self {
         Self { iter }
     }
 }
-impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query>
-    for RankingRuleOutputIterWrapper<'transaction, Query>
+impl<'search, Query> RankingRuleOutputIter<'search, Query>
+    for RankingRuleOutputIterWrapper<'search, Query>
 {
     fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
         match self.iter.next() {
@@ -44,7 +39,7 @@ pub struct PlaceholderQuery;
 impl RankingRuleQueryTrait for PlaceholderQuery {}
 impl RankingRuleQueryTrait for QueryGraph {}

-pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
+pub trait RankingRule<'search, Query: RankingRuleQueryTrait> {
     fn id(&self) -> String;

     /// Prepare the ranking rule such that it can start iterating over its
@@ -53,9 +48,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
     /// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket).
     fn start_iteration(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<Query>,
         universe: &RoaringBitmap,
         query: &Query,
@@ -70,9 +63,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
     /// - the universe given to [`start_iteration`](RankingRule::start_iteration)
     fn next_bucket(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<Query>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<Query>>>;
@@ -81,9 +72,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
     /// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration).
     fn end_iteration(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<Query>,
     );
 }
@@ -98,11 +87,9 @@ pub struct RankingRuleOutput<Q> {

 // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily
 #[allow(clippy::too_many_arguments)]
-pub fn apply_ranking_rules<'transaction>(
-    index: &Index,
-    txn: &'transaction heed::RoTxn,
+pub fn apply_ranking_rules<'search>(
+    ctx: &mut SearchContext<'search>,
     // TODO: ranking rules parameter
-    db_cache: &mut DatabaseCache<'transaction>,
     query_graph: &QueryGraph,
     universe: &RoaringBitmap,
     from: usize,
@@ -115,7 +102,7 @@ pub fn apply_ranking_rules<'transaction>(
     let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
     let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());
     // TODO: ranking rules given as argument
-    let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
+    let mut ranking_rules: Vec<&mut dyn RankingRule<'search, QueryGraph>> =
         vec![words, typo, proximity /*sort*/];

     logger.ranking_rules(&ranking_rules);
@@ -126,7 +113,7 @@ pub fn apply_ranking_rules<'transaction>(

     let ranking_rules_len = ranking_rules.len();
     logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
-    ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?;
+    ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;

     let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
     candidates[0] = universe.clone();
@@ -142,7 +129,7 @@ pub fn apply_ranking_rules<'transaction>(
                 &candidates[cur_ranking_rule_index],
             );
             candidates[cur_ranking_rule_index].clear();
-            ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger);
+            ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
             if cur_ranking_rule_index == 0 {
                 break;
             } else {
@@ -206,7 +193,7 @@ pub fn apply_ranking_rules<'transaction>(
             continue;
         }

-        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else {
+        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else {
             // TODO: add remaining candidates automatically here?
             back!();
             continue;
@@ -239,9 +226,7 @@ pub fn apply_ranking_rules<'transaction>(
             &candidates[cur_ranking_rule_index],
         );
         ranking_rules[cur_ranking_rule_index].start_iteration(
-            index,
-            txn,
-            db_cache,
+            ctx,
             logger,
             &next_bucket.candidates,
             &next_bucket.query,
@@ -255,9 +240,7 @@ pub fn apply_ranking_rules<'transaction>(
 mod tests {
     // use crate::allocator::ALLOC;
     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-    use crate::index::tests::TempIndex;
-    use crate::new::db_cache::DatabaseCache;
-    use crate::new::execute_search;
+    use crate::new::{execute_search, SearchContext};
     use big_s::S;
     use heed::EnvOpenOptions;
     use maplit::hashset;
@@ -269,55 +252,6 @@ mod tests {
     use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
     use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};

-    #[test]
-    fn execute_new_search() {
-        let index = TempIndex::new();
-        index
-            .add_documents(documents!([
-                {
-                    "id": 7,
-                    "text": "the super quick super brown fox jumps over",
-                },
-                {
-                    "id": 8,
-                    "text": "the super quick brown fox jumps over",
-                },
-                {
-                    "id": 9,
-                    "text": "the quick super brown fox jumps over",
-                },
-                {
-                    "id": 10,
-                    "text": "the quick brown fox jumps over",
-                },
-                {
-                    "id": 11,
-                    "text": "the quick brown fox jumps over the lazy dog",
-                },
-                {
-                    "id": 12,
-                    "text": "the quick brown cat jumps over the lazy dog",
-                },
-            ]))
-            .unwrap();
-        let txn = index.read_txn().unwrap();
-        let mut db_cache = DatabaseCache::default();
-
-        let results = execute_search(
-            &index,
-            &txn,
-            &mut db_cache,
-            "releases from poison by the government",
-            None,
-            0,
-            50,
-            &mut DefaultSearchLogger,
-        )
-        .unwrap();
-
-        println!("{results:?}")
-    }
-
     #[test]
     fn search_wiki_new() {
         let mut options = EnvOpenOptions::new();
@@ -331,24 +265,20 @@ mod tests {
         // loop {
         let start = Instant::now();

-        let mut db_cache = DatabaseCache::default();
-
-        let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
+        // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");

         let results = execute_search(
-            &index,
-            &txn,
-            &mut db_cache,
+            &mut SearchContext::new(&index, &txn),
             "releases from poison by the government",
             None,
             0,
             20,
-            // &mut DefaultSearchLogger,
-            &mut logger,
+            &mut DefaultSearchLogger,
+            // &mut logger,
         )
         .unwrap();

-        logger.write_d2_description();
+        // logger.write_d2_description();

         let elapsed = start.elapsed();

@@ -425,19 +355,15 @@ mod tests {
         let index = Index::new(options, "data_movies").unwrap();
         let txn = index.read_txn().unwrap();

-        let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
+        // let primary_key = index.primary_key(&txn).unwrap().unwrap();
+        // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
         // loop {
         let start = Instant::now();

-        let mut db_cache = DatabaseCache::default();
-
         let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
+        let mut ctx = SearchContext::new(&index, &txn);
         let results = execute_search(
-            &index,
-            &txn,
-            &mut db_cache,
+            &mut ctx,
             "releases from poison by the government",
             None,
             0,
@@ -447,24 +373,24 @@ mod tests {
         )
         .unwrap();

-        logger.write_d2_description();
+        logger.write_d2_description(&mut ctx);

         let elapsed = start.elapsed();

-        let ids = index
-            .documents(&txn, results.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|x| {
-                let obkv = &x.1;
-                let id = obkv.get(primary_key).unwrap();
-                let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-                id.as_str().unwrap().to_owned()
-            })
-            .collect::<Vec<_>>();
+        // let ids = index
+        //     .documents(&txn, results.iter().copied())
+        //     .unwrap()
+        //     .into_iter()
+        //     .map(|x| {
+        //         let obkv = &x.1;
+        //         let id = obkv.get(primary_key).unwrap();
+        //         let id: serde_json::Value = serde_json::from_slice(id).unwrap();
+        //         id.as_str().unwrap().to_owned()
+        //     })
+        //     .collect::<Vec<_>>();

         println!("{}us: {results:?}", elapsed.as_micros());
-        println!("external ids: {ids:?}");
+        // println!("external ids: {ids:?}");
         // }
     }

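
With the trait reduced to `ctx`-based signatures, driving a rule needs only the context and a logger. The real driver is `apply_ranking_rules` above, with its back-tracking across nested rules; the sketch below (`drain_one_rule` is an invented helper) only illustrates the calling convention, and a real caller would narrow the universe between `next_bucket` calls:

fn drain_one_rule<'s>(
    rule: &mut dyn RankingRule<'s, QueryGraph>,
    ctx: &mut SearchContext<'s>,
    logger: &mut dyn SearchLogger<QueryGraph>,
    universe: &RoaringBitmap,
    query: &QueryGraph,
) -> Result<Vec<RoaringBitmap>> {
    let mut buckets = vec![];
    rule.start_iteration(ctx, logger, universe, query)?;
    // Each bucket is a subset of `universe`, yielded in ranking order.
    while let Some(output) = rule.next_bucket(ctx, logger, universe)? {
        buckets.push(output.candidates);
    }
    rule.end_iteration(ctx, logger);
    Ok(buckets)
}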
@ -1,34 +1,28 @@
|
|||||||
use std::collections::VecDeque;
|
use super::interner::Interned;
|
||||||
|
|
||||||
use fxhash::FxHashMap;
|
|
||||||
use heed::{BytesDecode, RoTxn};
|
|
||||||
use roaring::{MultiOps, RoaringBitmap};
|
|
||||||
|
|
||||||
use super::db_cache::DatabaseCache;
|
|
||||||
use super::query_term::{Phrase, QueryTerm, WordDerivations};
|
use super::query_term::{Phrase, QueryTerm, WordDerivations};
|
||||||
use super::{QueryGraph, QueryNode};
|
use super::{QueryGraph, QueryNode, SearchContext};
|
||||||
|
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
|
||||||
use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
|
use fxhash::FxHashMap;
|
||||||
|
use heed::BytesDecode;
|
||||||
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
|
use std::collections::VecDeque;
|
||||||
|
|
||||||
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
|
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct NodeDocIdsCache {
|
pub struct NodeDocIdsCache {
|
||||||
pub cache: FxHashMap<u32, RoaringBitmap>,
|
pub cache: FxHashMap<u32, RoaringBitmap>,
|
||||||
}
|
}
|
||||||
impl NodeDocIdsCache {
|
impl<'search> SearchContext<'search> {
|
||||||
fn get_docids<'cache, 'transaction>(
|
fn get_node_docids<'cache>(
|
||||||
&'cache mut self,
|
&'cache mut self,
|
||||||
index: &Index,
|
|
||||||
txn: &'transaction RoTxn,
|
|
||||||
db_cache: &mut DatabaseCache<'transaction>,
|
|
||||||
term: &QueryTerm,
|
term: &QueryTerm,
|
||||||
node_idx: u32,
|
node_idx: u32,
|
||||||
) -> Result<&'cache RoaringBitmap> {
|
) -> Result<&'cache RoaringBitmap> {
|
||||||
if self.cache.contains_key(&node_idx) {
|
if self.node_docids_cache.cache.contains_key(&node_idx) {
|
||||||
return Ok(&self.cache[&node_idx]);
|
return Ok(&self.node_docids_cache.cache[&node_idx]);
|
||||||
};
|
};
|
||||||
let docids = match term {
|
let docids = match term {
|
||||||
QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?,
|
QueryTerm::Phrase { phrase } => resolve_phrase(self, *phrase)?,
|
||||||
QueryTerm::Word {
|
QueryTerm::Word {
|
||||||
derivations:
|
derivations:
|
||||||
WordDerivations {
|
WordDerivations {
|
||||||
@@ -42,15 +36,14 @@ impl NodeDocIdsCache {
                     },
             } => {
                 let mut or_docids = vec![];
-                for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {
-                    if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {
+                for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()).copied()
+                {
+                    if let Some(word_docids) = self.get_word_docids(word)? {
                         or_docids.push(word_docids);
                     }
                 }
                 if *use_prefix_db {
-                    if let Some(prefix_docids) =
-                        db_cache.get_prefix_docids(index, txn, original.as_str())?
-                    {
+                    if let Some(prefix_docids) = self.get_prefix_docids(*original)? {
                         or_docids.push(prefix_docids);
                     }
                 }
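The destructuring in these hunks pins down most of `WordDerivations` after interning. A reconstruction from the field uses alone: `.copied()` on the typo buckets only works if they hold `Copy` handles, and `synonyms`/`split_words` are fed to `resolve_phrase`, so they must now carry `Interned<Phrase>`:

// Inferred shape; the exact container types (Vec vs. boxed slices) are
// assumptions, only the field names and element types follow from the diff.
pub struct WordDerivations {
    pub original: Interned<String>,
    pub zero_typo: Vec<Interned<String>>,
    pub one_typo: Vec<Interned<String>>,
    pub two_typos: Vec<Interned<String>>,
    pub use_prefix_db: bool,
    pub synonyms: Vec<Interned<Phrase>>,
    pub split_words: Option<Interned<Phrase>>,
}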
@@ -58,32 +51,25 @@ impl NodeDocIdsCache {
                     .into_iter()
                     .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap())
                     .collect::<Vec<_>>();
-                for synonym in synonyms {
+                for synonym in synonyms.iter().copied() {
                     // TODO: cache resolve_phrase?
-                    docids.push(resolve_phrase(index, txn, db_cache, synonym)?);
+                    docids.push(resolve_phrase(self, synonym)?);
                 }
-                if let Some((left, right)) = split_words {
-                    if let Some(split_word_docids) =
-                        db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)?
-                    {
-                        docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?);
-                    }
+                if let Some(split_words) = split_words {
+                    docids.push(resolve_phrase(self, *split_words)?);
                 }

                 MultiOps::union(docids)
             }
         };
-        let _ = self.cache.insert(node_idx, docids);
-        let docids = &self.cache[&node_idx];
+        let _ = self.node_docids_cache.cache.insert(node_idx, docids);
+        let docids = &self.node_docids_cache.cache[&node_idx];
         Ok(docids)
     }
 }

-pub fn resolve_query_graph<'transaction>(
-    index: &Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
-    node_docids_cache: &mut NodeDocIdsCache,
+pub fn resolve_query_graph<'search>(
+    ctx: &mut SearchContext<'search>,
     q: &QueryGraph,
     universe: &RoaringBitmap,
 ) -> Result<RoaringBitmap> {
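One detail worth calling out in the cache fill above: it uses `contains_key` plus two map lookups rather than the `Entry` API, because computing the docids re-borrows `self` mutably (for `get_word_docids`, `resolve_phrase`, and so on) while an `Entry` would keep the map borrowed. A standalone rendering of that idiom, with a toy value type so it runs on its own:

use std::collections::HashMap;

// Stand-in for the real RoaringBitmap, just to keep the sketch self-contained.
type DocIds = Vec<u32>;

struct Cache {
    cache: HashMap<u32, DocIds>,
}

impl Cache {
    fn get_or_compute(&mut self, node_idx: u32) -> &DocIds {
        // Deliberately not the Entry API: in the real code the compute step
        // needs `&mut self` again, which a live Entry would rule out.
        if !self.cache.contains_key(&node_idx) {
            let docids = self.compute(node_idx);
            self.cache.insert(node_idx, docids);
        }
        &self.cache[&node_idx]
    }
    fn compute(&mut self, node_idx: u32) -> DocIds {
        vec![node_idx, node_idx + 1]
    }
}

fn main() {
    let mut c = Cache { cache: HashMap::new() };
    println!("{:?}", c.get_or_compute(7));
}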
@@ -111,8 +97,7 @@ pub fn resolve_query_graph<'transaction>(
         let node_docids = match n {
             QueryNode::Term(located_term) => {
                 let term = &located_term.value;
-                let derivations_docids =
-                    node_docids_cache.get_docids(index, txn, db_cache, term, node)?;
+                let derivations_docids = ctx.get_node_docids(term, node)?;
                 predecessors_docids & derivations_docids
             }
             QueryNode::Deleted => {
@@ -143,13 +128,8 @@ pub fn resolve_query_graph<'transaction>(
     panic!()
 }

-pub fn resolve_phrase<'transaction>(
-    index: &Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
-    phrase: &Phrase,
-) -> Result<RoaringBitmap> {
-    let Phrase { words } = phrase;
+pub fn resolve_phrase(ctx: &mut SearchContext, phrase: Interned<Phrase>) -> Result<RoaringBitmap> {
+    let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
     let mut candidates = RoaringBitmap::new();
     let mut first_iter = true;
     let winsize = words.len().min(3);
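Note the `.clone()` when reading the phrase back: `ctx.phrase_interner.get(phrase)` borrows the interner immutably, and the loops below need `ctx` mutably for every proximity lookup, so the word list is cloned out first. Callers now hand over a copied handle rather than a reference, along these lines (a hypothetical call site; the `insert` name follows the interner sketch earlier and is an assumption):

// Intern once, then pass the Copy handle around freely.
let phrase = ctx.phrase_interner.insert(Phrase { words });
let candidates = resolve_phrase(ctx, phrase)?;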
@@ -161,19 +141,19 @@ pub fn resolve_phrase<'transaction>(
     for win in words.windows(winsize) {
         // Get all the documents with the matching distance for each word pairs.
         let mut bitmaps = Vec::with_capacity(winsize.pow(2));
-        for (offset, s1) in win
+        for (offset, &s1) in win
             .iter()
             .enumerate()
             .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
         {
-            for (dist, s2) in win
+            for (dist, &s2) in win
                 .iter()
                 .skip(offset + 1)
                 .enumerate()
                 .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
             {
                 if dist == 0 {
-                    match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? {
+                    match ctx.get_word_pair_proximity_docids(s1, s2, 1)? {
                         Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
                         // If there are no documents for this pair, there will be no
                         // results for the phrase query.
@@ -182,13 +162,9 @@ pub fn resolve_phrase<'transaction>(
                 } else {
                     let mut bitmap = RoaringBitmap::new();
                     for dist in 0..=dist {
-                        if let Some(m) = db_cache.get_word_pair_proximity_docids(
-                            index,
-                            txn,
-                            s1,
-                            s2,
-                            dist as u8 + 1,
-                        )? {
+                        if let Some(m) =
+                            ctx.get_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
+                        {
                             bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
                         }
                     }
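Stepping back, `resolve_phrase` works window by window: the phrase is sliced into overlapping windows of at most three words, every surviving word pair in a window contributes a bitmap (exact proximity 1 for adjacent words, otherwise a union over proximities up to `dist + 1`), and the per-window bitmaps feed the running `candidates` set (the tail of the function falls outside this excerpt). The windowing itself is easy to see in isolation:

// Toy run of the windowing logic above: a 4-word phrase is checked in
// overlapping windows of winsize = min(len, 3) words.
fn main() {
    let words = ["the", "quick", "brown", "fox"];
    let winsize = words.len().min(3);
    for win in words.windows(winsize) {
        println!("{win:?}");
    }
    // Prints: ["the", "quick", "brown"] then ["quick", "brown", "fox"]
}

The remaining hunks apply the same `SearchContext` treatment to the `Sort` and `Words` ranking rules.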
@@ -1,11 +1,7 @@
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
 use super::{
     RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper,
-    RankingRuleQueryTrait,
+    RankingRuleQueryTrait, SearchContext,
 };
 use crate::{
     // facet::FacetType,
@@ -15,18 +11,19 @@ use crate::{
     Index,
     Result,
 };
+use roaring::RoaringBitmap;

-pub struct Sort<'transaction, Query> {
+pub struct Sort<'search, Query> {
     field_name: String,
     field_id: Option<FieldId>,
     is_ascending: bool,
     original_query: Option<Query>,
-    iter: Option<RankingRuleOutputIterWrapper<'transaction, Query>>,
+    iter: Option<RankingRuleOutputIterWrapper<'search, Query>>,
 }
-impl<'transaction, Query> Sort<'transaction, Query> {
-    pub fn new(
+impl<'search, Query> Sort<'search, Query> {
+    pub fn _new(
         index: &Index,
-        rtxn: &'transaction heed::RoTxn,
+        rtxn: &'search heed::RoTxn,
         field_name: String,
         is_ascending: bool,
     ) -> Result<Self> {
@@ -37,18 +34,14 @@ impl<'transaction, Query> Sort<'transaction, Query> {
         }
     }
 }

-impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query>
-    for Sort<'transaction, Query>
-{
+impl<'search, Query: RankingRuleQueryTrait> RankingRule<'search, Query> for Sort<'search, Query> {
     fn id(&self) -> String {
         let Self { field_name, is_ascending, .. } = self;
         format!("{field_name}:{}", if *is_ascending { "asc" } else { "desc " })
     }
     fn start_iteration(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<Query>,
         parent_candidates: &RoaringBitmap,
         parent_query_graph: &Query,
@@ -59,8 +52,8 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
             if self.is_ascending { ascending_facet_sort } else { descending_facet_sort };

         let number_iter = make_iter(
-            txn,
-            index
+            ctx.txn,
+            ctx.index
                 .facet_id_f64_docids
                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
             field_id,
@@ -68,8 +61,8 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
         )?;

         let string_iter = make_iter(
-            txn,
-            index
+            ctx.txn,
+            ctx.index
                 .facet_id_string_docids
                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
             field_id,
@@ -91,9 +84,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query

     fn next_bucket(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<Query>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<Query>>> {
@@ -110,9 +101,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query

     fn end_iteration(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<Query>,
     ) {
         self.original_query = None;
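Taken together, the `Sort` hunks rewrite every hook of the ranking-rule trait to take the context instead of three loose parameters. The trait definition is not part of this excerpt, but the impl signatures visible here imply something like the following (the `start_iteration` return type and the shape of `RankingRuleOutput` are assumptions):

// Reconstructed from the impl signatures above, not quoted from the commit.
pub trait RankingRule<'search, Query: RankingRuleQueryTrait> {
    fn id(&self) -> String;
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'search>,
        logger: &mut dyn SearchLogger<Query>,
        parent_candidates: &RoaringBitmap,
        parent_query: &Query,
    ) -> Result<()>;
    fn next_bucket(
        &mut self,
        ctx: &mut SearchContext<'search>,
        logger: &mut dyn SearchLogger<Query>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<Query>>>;
    fn end_iteration(
        &mut self,
        ctx: &mut SearchContext<'search>,
        logger: &mut dyn SearchLogger<Query>,
    );
}

The final hunks give the `Words` rule the same treatment.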
@@ -1,13 +1,9 @@
-use std::collections::BTreeSet;
-
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
-use super::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache};
-use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput};
-use crate::{Index, Result, TermsMatchingStrategy};
+use super::resolve_query_graph::resolve_query_graph;
+use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput, SearchContext};
+use crate::{Result, TermsMatchingStrategy};
+use roaring::RoaringBitmap;
+use std::collections::BTreeSet;

 pub struct Words {
     exhausted: bool,
@@ -15,7 +11,6 @@ pub struct Words {
     iterating: bool,
     positions_to_remove: Vec<i8>,
     terms_matching_strategy: TermsMatchingStrategy,
-    node_docids_cache: NodeDocIdsCache,
 }
 impl Words {
     pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
@@ -25,20 +20,17 @@ impl Words {
             iterating: false,
             positions_to_remove: vec![],
             terms_matching_strategy,
-            node_docids_cache: <_>::default(),
         }
     }
 }

-impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
+impl<'search> RankingRule<'search, QueryGraph> for Words {
     fn id(&self) -> String {
         "words".to_owned()
     }
     fn start_iteration(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
         _parent_candidates: &RoaringBitmap,
         parent_query_graph: &QueryGraph,
@@ -71,9 +63,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {

     fn next_bucket(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<QueryGraph>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
@@ -87,14 +77,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {

         logger.log_words_state(query_graph);

-        let this_bucket = resolve_query_graph(
-            index,
-            txn,
-            db_cache,
-            &mut self.node_docids_cache,
-            query_graph,
-            universe,
-        )?;
+        let this_bucket = resolve_query_graph(ctx, query_graph, universe)?;

         let child_query_graph = query_graph.clone();
         loop {
@@ -115,9 +98,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {

     fn end_iteration(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
     ) {
         self.iterating = false;
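With `NodeDocIdsCache` moved onto the context, `Words` no longer carries per-query caching state of its own; a caller drives it purely through the shared `SearchContext`. A hedged sketch of that driving loop (the `candidates` field on `RankingRuleOutput` is an assumption, and a real driver would also shrink `universe` between buckets):

fn drive_words<'search>(
    ctx: &mut SearchContext<'search>,
    rule: &mut Words,
    logger: &mut dyn SearchLogger<QueryGraph>,
    graph: &QueryGraph,
    universe: &RoaringBitmap,
) -> Result<Vec<RoaringBitmap>> {
    let mut buckets = vec![];
    rule.start_iteration(ctx, logger, universe, graph)?;
    // Each bucket is a set of docids that tie on this rule's criterion.
    while let Some(output) = rule.next_bucket(ctx, logger, universe)? {
        buckets.push(output.candidates);
    }
    rule.end_iteration(ctx, logger);
    Ok(buckets)
}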