mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Add documentation
This commit is contained in:
parent
4e266211bf
commit
c232cdabf5
@ -1,15 +1,21 @@
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::hash::Hash;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::{BytesEncode, Database, RoTxn};
|
||||
|
||||
use super::interner::Interned;
|
||||
use super::SearchContext;
|
||||
use crate::Result;
|
||||
|
||||
/// A cache storing pointers to values in the LMDB databases.
|
||||
///
|
||||
/// Used for performance reasons only. By using this cache, we avoid performing a
|
||||
/// database lookup and instead get a direct reference to the value using a fast
|
||||
/// local HashMap lookup.
|
||||
#[derive(Default)]
|
||||
pub struct DatabaseCache<'search> {
|
||||
// TODO: interner for all database cache keys?
|
||||
pub word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
|
||||
pub word_prefix_pair_proximity_docids:
|
||||
@ -21,36 +27,50 @@ pub struct DatabaseCache<'search> {
|
||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
|
||||
}
|
||||
impl<'search> SearchContext<'search> {
|
||||
pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
|
||||
let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
|
||||
fn get_value<'v, K1, KC>(
|
||||
txn: &'search RoTxn,
|
||||
cache_key: K1,
|
||||
db_key: &'v KC::EItem,
|
||||
cache: &mut FxHashMap<K1, Option<&'search [u8]>>,
|
||||
db: Database<KC, ByteSlice>,
|
||||
) -> Result<Option<&'search [u8]>>
|
||||
where
|
||||
K1: Copy + Eq + Hash,
|
||||
KC: BytesEncode<'v>,
|
||||
{
|
||||
let bitmap_ptr = match cache.entry(cache_key) {
|
||||
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
|
||||
Entry::Vacant(entry) => {
|
||||
let bitmap_ptr = self
|
||||
.index
|
||||
.word_docids
|
||||
.remap_data_type::<ByteSlice>()
|
||||
.get(self.txn, self.word_interner.get(word))?;
|
||||
let bitmap_ptr = db.get(txn, db_key)?;
|
||||
entry.insert(bitmap_ptr);
|
||||
bitmap_ptr
|
||||
}
|
||||
};
|
||||
Ok(bitmap_ptr)
|
||||
}
|
||||
pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
|
||||
// In the future, this will be a frozen roaring bitmap
|
||||
let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
|
||||
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
|
||||
Entry::Vacant(entry) => {
|
||||
let bitmap_ptr = self
|
||||
.index
|
||||
.word_prefix_docids
|
||||
.remap_data_type::<ByteSlice>()
|
||||
.get(self.txn, self.word_interner.get(prefix))?;
|
||||
entry.insert(bitmap_ptr);
|
||||
bitmap_ptr
|
||||
}
|
||||
};
|
||||
Ok(bitmap_ptr)
|
||||
|
||||
/// Retrieve or insert the given value in the `word_docids` database.
|
||||
pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
|
||||
Self::get_value(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
&mut self.db_cache.word_docids,
|
||||
self.index.word_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
}
|
||||
/// Retrieve or insert the given value in the `word_prefix_docids` database.
|
||||
pub fn get_word_prefix_docids(
|
||||
&mut self,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<&'search [u8]>> {
|
||||
Self::get_value(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
&mut self.db_cache.word_prefix_docids,
|
||||
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn get_word_pair_proximity_docids(
|
||||
@ -59,40 +79,17 @@ impl<'search> SearchContext<'search> {
|
||||
word2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<&'search [u8]>> {
|
||||
let key = (proximity, word1, word2);
|
||||
match self.db_cache.word_pair_proximity_docids.entry(key) {
|
||||
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
// We shouldn't greedily access this DB at all
|
||||
// a DB (w1, w2) -> [proximities] would be much better
|
||||
// We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
|
||||
// And if we worked with words encoded as integers, the set of words could be a roaring bitmap
|
||||
// Then, to find all the proximities between two list of words, we'd do:
|
||||
|
||||
// inputs:
|
||||
// - words1 (roaring bitmap)
|
||||
// - words2 (roaring bitmap)
|
||||
// output:
|
||||
// - [(word1, word2, [proximities])]
|
||||
// algo:
|
||||
// let mut output = vec![];
|
||||
// for word1 in words1 {
|
||||
// let all_words_in_proximity_of_w1 = pair_words_db.get(word1);
|
||||
// let words_in_proximity_of_w1 = all_words_in_proximity_of_w1 & words2;
|
||||
// for word2 in words_in_proximity_of_w1 {
|
||||
// let proximities = prox_db.get(word1, word2);
|
||||
// output.push(word1, word2, proximities);
|
||||
// }
|
||||
// }
|
||||
let bitmap_ptr =
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
|
||||
self.txn,
|
||||
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
|
||||
)?;
|
||||
entry.insert(bitmap_ptr);
|
||||
Ok(bitmap_ptr)
|
||||
}
|
||||
}
|
||||
Self::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, word2),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(word2).as_str(),
|
||||
),
|
||||
&mut self.db_cache.word_pair_proximity_docids,
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn get_word_prefix_pair_proximity_docids(
|
||||
@ -101,22 +98,17 @@ impl<'search> SearchContext<'search> {
|
||||
prefix2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<&'search [u8]>> {
|
||||
let key = (proximity, word1, prefix2);
|
||||
match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
|
||||
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
let bitmap_ptr = self
|
||||
.index
|
||||
.word_prefix_pair_proximity_docids
|
||||
.remap_data_type::<ByteSlice>()
|
||||
.get(
|
||||
self.txn,
|
||||
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
|
||||
)?;
|
||||
entry.insert(bitmap_ptr);
|
||||
Ok(bitmap_ptr)
|
||||
}
|
||||
}
|
||||
Self::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, prefix2),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(prefix2).as_str(),
|
||||
),
|
||||
&mut self.db_cache.word_prefix_pair_proximity_docids,
|
||||
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
}
|
||||
pub fn get_prefix_word_pair_proximity_docids(
|
||||
&mut self,
|
||||
@ -124,25 +116,16 @@ impl<'search> SearchContext<'search> {
|
||||
right: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<&'search [u8]>> {
|
||||
let key = (proximity, left_prefix, right);
|
||||
match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
|
||||
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
let bitmap_ptr = self
|
||||
.index
|
||||
.prefix_word_pair_proximity_docids
|
||||
.remap_data_type::<ByteSlice>()
|
||||
.get(
|
||||
self.txn,
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(left_prefix),
|
||||
self.word_interner.get(right),
|
||||
),
|
||||
)?;
|
||||
entry.insert(bitmap_ptr);
|
||||
Ok(bitmap_ptr)
|
||||
}
|
||||
}
|
||||
Self::get_value(
|
||||
self.txn,
|
||||
(proximity, left_prefix, right),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(left_prefix).as_str(),
|
||||
self.word_interner.get(right).as_str(),
|
||||
),
|
||||
&mut self.db_cache.prefix_word_pair_proximity_docids,
|
||||
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,41 @@
|
||||
/*! Implementation of a generic graph-based ranking rule.
|
||||
|
||||
A graph-based ranking rule is a ranking rule that works by representing
|
||||
its possible operations and their relevancy cost as a directed acyclic multi-graph
|
||||
built on top of the query graph. It then computes its buckets by finding the
|
||||
cheapest paths from the start node to the end node and computing the document ids
|
||||
that satisfy those paths.
|
||||
|
||||
For example, the proximity ranking rule builds a graph where the edges between two
|
||||
nodes represent a condition that the term of the source node is in a certain proximity
|
||||
to the term of the destination node. With the query "pretty house by" where the term
|
||||
"pretty" has three possible proximities to the term "house" and "house" has two
|
||||
proximities to "by", the graph will look like this:
|
||||
|
||||
```txt
|
||||
┌───────┐ ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐ ┌───────┐
|
||||
│ START │──0─▶│pretty │─────2────▶│ house │ │ by │─0─▶│ END │
|
||||
└───────┘ └───────┘─────3────▶└───────┘──2──▶└─────┘ └───────┘
|
||||
```
|
||||
The proximity ranking rule's first bucket will be determined by the union of all
|
||||
the shortest paths from START to END, which in this case is:
|
||||
```txt
|
||||
START --0--> pretty --1--> house --1--> by --0--> END
|
||||
```
|
||||
The path's corresponding document ids are found by taking the intersection of the
|
||||
document ids of each edge. That is, we find the documents where both `pretty` is
|
||||
1-close to `house` AND `house` is 1-close to `by`.
|
||||
|
||||
For the second bucket, we get the union of the second-cheapest paths, which are:
|
||||
```txt
|
||||
START --0--> pretty --1--> house --2--> by --0--> END
|
||||
START --0--> pretty --2--> house --1--> by --0--> END
|
||||
```
|
||||
That is, we find the documents where either:
|
||||
- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
|
||||
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
|
||||
*/
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::logger::SearchLogger;
|
||||
@ -8,24 +46,38 @@ use super::small_bitmap::SmallBitmap;
|
||||
use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
||||
use crate::Result;
|
||||
|
||||
/// A generic graph-based ranking rule
|
||||
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
||||
id: String,
|
||||
// When the ranking rule is not iterating over its buckets,
|
||||
// its state is `None`.
|
||||
state: Option<GraphBasedRankingRuleState<G>>,
|
||||
}
|
||||
impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
|
||||
/// Creates the ranking rule with the given identifier
|
||||
pub fn new(id: String) -> Self {
|
||||
Self { id, state: None }
|
||||
}
|
||||
}
|
||||
|
||||
/// The internal state of a graph-based ranking rule during iteration
|
||||
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
|
||||
/// The current graph
|
||||
graph: RankingRuleGraph<G>,
|
||||
/// Cache to retrieve the docids associated with each edge
|
||||
edge_docids_cache: EdgeDocidsCache<G>,
|
||||
/// Cache used to optimistically discard paths that resolve to no documents.
|
||||
empty_paths_cache: EmptyPathsCache,
|
||||
/// A structure giving the list of possible costs from each node to the end node,
|
||||
/// along with a set of unavoidable edges that must be traversed to achieve that distance.
|
||||
all_distances: Vec<Vec<(u16, SmallBitmap)>>,
|
||||
/// An index in the first element of `all_distances`, giving the cost of the next bucket
|
||||
cur_distance_idx: usize,
|
||||
}
|
||||
|
||||
/// Traverse each edge of the graph, compute its associated document ids,
|
||||
/// and remove this edge from the graph if its docids are disjoint with the
|
||||
/// given universe.
|
||||
fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
|
||||
ctx: &mut SearchContext<'search>,
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
@ -70,6 +122,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
|
||||
let mut edge_docids_cache = EdgeDocidsCache::default();
|
||||
let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16);
|
||||
|
||||
// First simplify the graph as much as possible, by computing the docids of the edges
|
||||
// within the rule's universe and removing the edges that have no associated docids.
|
||||
remove_empty_edges(
|
||||
ctx,
|
||||
&mut graph,
|
||||
@ -77,6 +131,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
|
||||
universe,
|
||||
&mut empty_paths_cache,
|
||||
)?;
|
||||
|
||||
// Then pre-compute the cost of all paths from each node to the end node
|
||||
let all_distances = graph.initialize_distances_with_necessary_edges();
|
||||
|
||||
let state = GraphBasedRankingRuleState {
|
||||
@ -98,9 +154,14 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||
// If universe.len() <= 1, the bucket sort algorithm
|
||||
// should not have called this function.
|
||||
assert!(universe.len() > 1);
|
||||
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
|
||||
// should never happen
|
||||
let mut state = self.state.take().unwrap();
|
||||
|
||||
// TODO: does this have a real positive performance impact?
|
||||
remove_empty_edges(
|
||||
ctx,
|
||||
&mut state.graph,
|
||||
@ -109,12 +170,16 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
|
||||
&mut state.empty_paths_cache,
|
||||
)?;
|
||||
|
||||
// If the cur_distance_idx does not point to a valid cost in the `all_distances`
|
||||
// structure, then we have computed all the buckets and can return.
|
||||
if state.cur_distance_idx
|
||||
>= state.all_distances[state.graph.query_graph.root_node as usize].len()
|
||||
{
|
||||
self.state = None;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Retrieve the cost of the paths to compute
|
||||
let (cost, _) =
|
||||
state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];
|
||||
state.cur_distance_idx += 1;
|
||||
@ -129,22 +194,38 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
|
||||
cur_distance_idx: _,
|
||||
} = &mut state;
|
||||
|
||||
let mut paths = vec![];
|
||||
let original_universe = universe;
|
||||
let mut universe = universe.clone();
|
||||
|
||||
// TODO: remove this unnecessary clone
|
||||
let original_graph = graph.clone();
|
||||
// and this vector as well
|
||||
let mut paths = vec![];
|
||||
|
||||
// For each path of the given cost, we will compute its associated
|
||||
// document ids.
|
||||
// In case the path does not resolve to any document id, we try to figure out why
|
||||
// and update the `empty_paths_cache` accordingly.
|
||||
// For example, it may be that the path is empty because one of its edges is disjoint
|
||||
// with the universe, or because a prefix of the path is disjoint with the universe, or because
|
||||
// the path contains two edges that are disjoint from each other within the universe.
|
||||
// Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces
|
||||
// the number of future candidate paths given by that same function.
|
||||
graph.visit_paths_of_cost(
|
||||
graph.query_graph.root_node as usize,
|
||||
cost,
|
||||
all_distances,
|
||||
empty_paths_cache,
|
||||
|path, graph, empty_paths_cache| {
|
||||
// Accumulate the path for logging purposes only
|
||||
paths.push(path.to_vec());
|
||||
let mut path_docids = universe.clone();
|
||||
|
||||
// We store the edges and their docids in vectors in case the path turns out to be
|
||||
// empty and we need to figure out why it was empty.
|
||||
let mut visited_edges = vec![];
|
||||
let mut cached_edge_docids = vec![];
|
||||
|
||||
for &edge_index in path {
|
||||
visited_edges.push(edge_index);
|
||||
let edge_docids =
|
||||
@ -154,21 +235,29 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
|
||||
BitmapOrAllRef::All => continue,
|
||||
};
|
||||
cached_edge_docids.push((edge_index, edge_docids.clone()));
|
||||
|
||||
// If the edge is empty, then the path will be empty as well, we update the graph
|
||||
// and caches accordingly and skip to the next candidate path.
|
||||
if edge_docids.is_disjoint(&universe) {
|
||||
// 1. Store in the cache that this edge is empty for this universe
|
||||
empty_paths_cache.forbid_edge(edge_index);
|
||||
// 2. remove this edge from the ranking rule graph
|
||||
graph.remove_edge(edge_index);
|
||||
// 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
|
||||
edge_docids_cache.cache.remove(&edge_index);
|
||||
return Ok(());
|
||||
}
|
||||
path_docids &= edge_docids;
|
||||
|
||||
// If the (sub)path is empty, we try to figure out why and update the caches accordingly.
|
||||
if path_docids.is_disjoint(&universe) {
|
||||
// empty_paths_cache.forbid_prefix(&visited_edges);
|
||||
// if the intersection between this edge and any
|
||||
// First, we know that this path is empty, and thus any path
|
||||
// that is a superset of it will also be empty.
|
||||
empty_paths_cache.forbid_prefix(&visited_edges);
|
||||
// Second, if the intersection between this edge and any
|
||||
// previous one is disjoint with the universe,
|
||||
// then we add these two edges to the empty_path_cache
|
||||
// then we also know that any path containing the same couple of
|
||||
// edges will also be empty.
|
||||
for (edge_index2, edge_docids2) in
|
||||
cached_edge_docids[..cached_edge_docids.len() - 1].iter()
|
||||
{
|
||||
@ -181,6 +270,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
|
||||
}
|
||||
}
|
||||
bucket |= &path_docids;
|
||||
// Reduce the size of the universe so that we can more optimistically discard candidate paths
|
||||
universe -= path_docids;
|
||||
Ok(())
|
||||
},
|
||||
@ -196,6 +286,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
|
||||
logger,
|
||||
);
|
||||
|
||||
// TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however,
|
||||
// remove nodes and/or terms within nodes that weren't present in any of the paths.
|
||||
let next_query_graph = state.graph.query_graph.clone();
|
||||
|
||||
self.state = Some(state);
|
||||
|
@ -3,6 +3,7 @@ use std::marker::PhantomData;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
|
||||
/// An index within an [`Interner<T>`] structure.
|
||||
pub struct Interned<T> {
|
||||
idx: u32,
|
||||
_phantom: PhantomData<T>,
|
||||
@ -13,7 +14,10 @@ impl<T> Interned<T> {
|
||||
Self { idx, _phantom: PhantomData }
|
||||
}
|
||||
}
|
||||
|
||||
/// An [`Interner`] is used to store a unique copy of a value of type `T`. This value
|
||||
/// is then identified by a lightweight index of type [`Interned<T>`], which can
|
||||
/// be copied, compared, and hashed efficiently. An immutable reference to the original value
|
||||
/// can be retrieved using `self.get(interned)`.
|
||||
pub struct Interner<T> {
|
||||
stable_store: Vec<T>,
|
||||
lookup: FxHashMap<T, Interned<T>>,
|
||||
|
@ -7,7 +7,82 @@ use super::ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGrap
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
use super::{RankingRule, RankingRuleQueryTrait};
|
||||
|
||||
/// Trait for structures logging the execution of a search query.
|
||||
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
/// Logs the initial query
|
||||
fn initial_query(&mut self, query: &Q);
|
||||
|
||||
/// Logs the query that was used to compute the set of all candidates
|
||||
fn query_for_universe(&mut self, query: &Q);
|
||||
|
||||
/// Logs the value of the initial set of all candidates
|
||||
fn initial_universe(&mut self, universe: &RoaringBitmap);
|
||||
|
||||
/// Logs the ranking rules used to perform the search query
|
||||
fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
|
||||
|
||||
/// Logs the start of a ranking rule's iteration.
|
||||
fn start_iteration_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
query: &Q,
|
||||
universe: &RoaringBitmap,
|
||||
);
|
||||
/// Logs the end of the computation of a ranking rule bucket
|
||||
fn next_bucket_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
universe: &RoaringBitmap,
|
||||
candidates: &RoaringBitmap,
|
||||
);
|
||||
/// Logs the skipping of a ranking rule bucket
|
||||
fn skip_bucket_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
candidates: &RoaringBitmap,
|
||||
);
|
||||
/// Logs the end of a ranking rule's iteration.
|
||||
fn end_iteration_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
universe: &RoaringBitmap,
|
||||
);
|
||||
/// Logs the addition of document ids to the final results
|
||||
fn add_to_results(&mut self, docids: &[u32]);
|
||||
|
||||
/// Logs the internal state of the words ranking rule
|
||||
fn log_words_state(&mut self, query_graph: &Q);
|
||||
|
||||
/// Logs the internal state of the proximity ranking rule
|
||||
fn log_proximity_state(
|
||||
&mut self,
|
||||
query_graph: &RankingRuleGraph<ProximityGraph>,
|
||||
paths: &[Vec<u16>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
distances: Vec<Vec<(u16, SmallBitmap)>>,
|
||||
cost: u16,
|
||||
);
|
||||
|
||||
/// Logs the internal state of the typo ranking rule
|
||||
fn log_typo_state(
|
||||
&mut self,
|
||||
query_graph: &RankingRuleGraph<TypoGraph>,
|
||||
paths: &[Vec<u16>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
distances: Vec<Vec<(u16, SmallBitmap)>>,
|
||||
cost: u16,
|
||||
);
|
||||
}
|
||||
|
||||
/// A dummy [`SearchLogger`] which does nothing.
|
||||
pub struct DefaultSearchLogger;
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
fn initial_query(&mut self, _query: &Q) {}
|
||||
|
||||
@ -76,63 +151,3 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
) {
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
fn initial_query(&mut self, query: &Q);
|
||||
|
||||
fn query_for_universe(&mut self, query: &Q);
|
||||
|
||||
fn initial_universe(&mut self, universe: &RoaringBitmap);
|
||||
|
||||
fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
|
||||
|
||||
fn start_iteration_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
query: &Q,
|
||||
universe: &RoaringBitmap,
|
||||
);
|
||||
fn next_bucket_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
universe: &RoaringBitmap,
|
||||
candidates: &RoaringBitmap,
|
||||
);
|
||||
fn skip_bucket_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
candidates: &RoaringBitmap,
|
||||
);
|
||||
fn end_iteration_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
universe: &RoaringBitmap,
|
||||
);
|
||||
fn add_to_results(&mut self, docids: &[u32]);
|
||||
|
||||
fn log_words_state(&mut self, query_graph: &Q);
|
||||
|
||||
fn log_proximity_state(
|
||||
&mut self,
|
||||
query_graph: &RankingRuleGraph<ProximityGraph>,
|
||||
paths: &[Vec<u16>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
distances: Vec<Vec<(u16, SmallBitmap)>>,
|
||||
cost: u16,
|
||||
);
|
||||
|
||||
fn log_typo_state(
|
||||
&mut self,
|
||||
query_graph: &RankingRuleGraph<TypoGraph>,
|
||||
paths: &[Vec<u16>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
distances: Vec<Vec<(u16, SmallBitmap)>>,
|
||||
cost: u16,
|
||||
);
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ fn resolve_maximally_reduced_query_graph<'search>(
|
||||
break;
|
||||
} else {
|
||||
let position_to_remove = positions_to_remove.pop().unwrap();
|
||||
let _ = graph.remove_words_at_position(position_to_remove);
|
||||
let _ = graph.remove_words_starting_at_position(position_to_remove);
|
||||
}
|
||||
}
|
||||
logger.query_for_universe(&graph);
|
||||
|
@ -3,6 +3,17 @@ use super::small_bitmap::SmallBitmap;
|
||||
use super::SearchContext;
|
||||
use crate::Result;
|
||||
|
||||
const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64;
|
||||
|
||||
/// A node of the [`QueryGraph`].
|
||||
///
|
||||
/// There are four types of nodes:
|
||||
/// 1. `Start` : unique, represents the start of the query
|
||||
/// 2. `End` : unique, represents the end of a query
|
||||
/// 3. `Deleted` : represents a node that was deleted.
|
||||
/// All deleted nodes are unreachable from the start node.
|
||||
/// 4. `Term` is a regular node representing a word or combination of words
|
||||
/// from the user query.
|
||||
#[derive(Clone)]
|
||||
pub enum QueryNode {
|
||||
Term(LocatedQueryTerm),
|
||||
@ -11,34 +22,84 @@ pub enum QueryNode {
|
||||
End,
|
||||
}
|
||||
|
||||
/// The edges associated with a node in the query graph.
|
||||
#[derive(Clone)]
|
||||
pub struct Edges {
|
||||
// TODO: use a tiny bitset instead, something like a simple Vec<u8> where most queries will see a vector of one element
|
||||
/// Set of nodes which have an edge going to the current node
|
||||
pub predecessors: SmallBitmap,
|
||||
/// Set of nodes which are reached by an edge from the current node
|
||||
pub successors: SmallBitmap,
|
||||
}
|
||||
|
||||
/**
|
||||
A graph representing all the ways to interpret the user's search query.
|
||||
|
||||
## Important
|
||||
At the moment, a query graph has a hardcoded limit of [`QUERY_GRAPH_NODE_LENGTH_LIMIT`] nodes.
|
||||
|
||||
## Example 1
|
||||
For the search query `sunflower`, we need to register the following things:
|
||||
- we need to look for the exact word `sunflower`
|
||||
- but also any word which is 1 or 2 typos apart from `sunflower`
|
||||
- and every word that contains the prefix `sunflower`
|
||||
- and also the couple of adjacent words `sun flower`
|
||||
- as well as all the user-defined synonyms of `sunflower`
|
||||
|
||||
All these derivations of a word will be stored in [`WordDerivations`].
|
||||
|
||||
## Example 2
|
||||
Consider the search query `summer house by`.
|
||||
|
||||
We also look for all word derivations of each term. And we also need to consider
|
||||
the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`.
|
||||
Furthermore, we need to know which words these n-grams replace. This is done by creating the
|
||||
following graph, where each node also contains a list of derivations:
|
||||
```txt
|
||||
┌───────┐
|
||||
┌─│houseby│─────────┐
|
||||
│ └───────┘ │
|
||||
┌───────┐ ┌───────┐ │ ┌───────┐ ┌────┐ │ ┌───────┐
|
||||
│ START │─┬─│summer │─┴─│ house │┌─│ by │─┼─│ END │
|
||||
└───────┘ │ └───────┘ └───────┘│ └────┘ │ └───────┘
|
||||
│ ┌────────────┐ │ │
|
||||
├─│summerhouse │───────┘ │
|
||||
│ └────────────┘ │
|
||||
│ ┌─────────────┐ │
|
||||
└─────────│summerhouseby│───────┘
|
||||
└─────────────┘
|
||||
```
|
||||
Note also that each node has a range of positions associated with it,
|
||||
such that `summer` is known to be a word at the positions `0..=0` and `houseby`
|
||||
is registered with the positions `1..=2`. When two nodes are connected by an edge,
|
||||
it means that they are potentially next to each other in the user's search query
|
||||
(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
|
||||
and the transformations that were done on the query graph).
|
||||
*/
|
||||
#[derive(Clone)]
|
||||
pub struct QueryGraph {
|
||||
/// The index of the start node within `self.nodes`
|
||||
pub root_node: u16,
|
||||
/// The index of the end node within `self.nodes`
|
||||
pub end_node: u16,
|
||||
/// The list of all query nodes
|
||||
pub nodes: Vec<QueryNode>,
|
||||
/// The list of all node edges
|
||||
pub edges: Vec<Edges>,
|
||||
}
|
||||
|
||||
fn _assert_sizes() {
|
||||
// TODO: QueryNodes are too big now, 88B is a bit too big
|
||||
let _: [u8; 88] = [0; std::mem::size_of::<QueryNode>()];
|
||||
let _: [u8; 32] = [0; std::mem::size_of::<Edges>()];
|
||||
}
|
||||
|
||||
impl Default for QueryGraph {
|
||||
/// Create a new QueryGraph with two disconnected nodes: the root and end nodes.
|
||||
fn default() -> Self {
|
||||
let nodes = vec![QueryNode::Start, QueryNode::End];
|
||||
let edges = vec![
|
||||
Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) },
|
||||
Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) },
|
||||
Edges {
|
||||
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
},
|
||||
Edges {
|
||||
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
},
|
||||
];
|
||||
|
||||
Self { root_node: 0, end_node: 1, nodes, edges }
|
||||
@ -46,33 +107,31 @@ impl Default for QueryGraph {
|
||||
}
|
||||
|
||||
impl QueryGraph {
|
||||
/// Connect all the given predecessor nodes to the given successor node
|
||||
fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) {
|
||||
for &from_node in from_nodes {
|
||||
self.edges[from_node as usize].successors.insert(to_node);
|
||||
self.edges[to_node as usize].predecessors.insert(from_node);
|
||||
}
|
||||
}
|
||||
/// Add the given node to the graph and connect it to all the given predecessor nodes
|
||||
fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 {
|
||||
let new_node_idx = self.nodes.len() as u16;
|
||||
assert!(new_node_idx <= QUERY_GRAPH_NODE_LENGTH_LIMIT);
|
||||
self.nodes.push(node);
|
||||
self.edges.push(Edges {
|
||||
predecessors: SmallBitmap::from_array(from_nodes, 64),
|
||||
successors: SmallBitmap::new(64),
|
||||
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
});
|
||||
for from_node in from_nodes {
|
||||
self.edges[*from_node as usize].successors.insert(new_node_idx);
|
||||
}
|
||||
self.connect_to_node(from_nodes, new_node_idx);
|
||||
|
||||
new_node_idx
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryGraph {
|
||||
// TODO: return the list of all matching words here as well
|
||||
/// Build the query graph from the parsed user search query.
|
||||
pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
|
||||
// TODO: maybe empty nodes should not be removed here, to compute
|
||||
// the score of the `words` ranking rule correctly
|
||||
// it is very easy to traverse the graph and remove afterwards anyway
|
||||
// Still, I'm keeping this here as a demo
|
||||
let mut empty_nodes = vec![];
|
||||
|
||||
let word_set = ctx.index.words_fst(ctx.txn)?;
|
||||
@ -81,7 +140,6 @@ impl QueryGraph {
|
||||
let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
|
||||
(vec![], vec![], vec![graph.root_node]);
|
||||
|
||||
// TODO: split words / synonyms
|
||||
for length in 1..=terms.len() {
|
||||
let query = &terms[..length];
|
||||
|
||||
@ -156,6 +214,8 @@ impl QueryGraph {
|
||||
|
||||
Ok(graph)
|
||||
}
|
||||
|
||||
/// Remove the given nodes and all their edges from the query graph.
|
||||
pub fn remove_nodes(&mut self, nodes: &[u16]) {
|
||||
for &node in nodes {
|
||||
self.nodes[node as usize] = QueryNode::Deleted;
|
||||
@ -166,10 +226,13 @@ impl QueryGraph {
|
||||
for succ in edges.successors.iter() {
|
||||
self.edges[succ as usize].predecessors.remove(node);
|
||||
}
|
||||
self.edges[node as usize] =
|
||||
Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) };
|
||||
self.edges[node as usize] = Edges {
|
||||
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
};
|
||||
}
|
||||
}
|
||||
/// Remove the given nodes, connecting all their predecessors to all their successors.
|
||||
pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) {
|
||||
for &node in nodes {
|
||||
self.nodes[node as usize] = QueryNode::Deleted;
|
||||
@ -182,11 +245,17 @@ impl QueryGraph {
|
||||
self.edges[succ as usize].predecessors.remove(node);
|
||||
self.edges[succ as usize].predecessors.union(&edges.predecessors);
|
||||
}
|
||||
self.edges[node as usize] =
|
||||
Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) };
|
||||
self.edges[node as usize] = Edges {
|
||||
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
|
||||
};
|
||||
}
|
||||
}
|
||||
pub fn remove_words_at_position(&mut self, position: i8) -> bool {
|
||||
|
||||
/// Remove all the nodes that correspond to a word starting at the given position, and connect
|
||||
/// the predecessors of these nodes to their successors.
|
||||
/// Return `true` if any node was removed.
|
||||
pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool {
|
||||
let mut nodes_to_remove_keeping_edges = vec![];
|
||||
for (node_idx, node) in self.nodes.iter().enumerate() {
|
||||
let node_idx = node_idx as u16;
|
||||
@ -202,14 +271,15 @@ impl QueryGraph {
|
||||
!nodes_to_remove_keeping_edges.is_empty()
|
||||
}
|
||||
|
||||
/// Simplify the query graph by removing all nodes that are disconnected from
|
||||
/// the start or end nodes.
|
||||
fn simplify(&mut self) {
|
||||
loop {
|
||||
let mut nodes_to_remove = vec![];
|
||||
for (node_idx, node) in self.nodes.iter().enumerate() {
|
||||
if (!matches!(node, QueryNode::End | QueryNode::Deleted)
|
||||
&& self.edges[node_idx].successors.is_empty())
|
||||
|| (!matches!(node, QueryNode::Start | QueryNode::Deleted)
|
||||
&& self.edges[node_idx].predecessors.is_empty())
|
||||
if !matches!(node, QueryNode::End | QueryNode::Deleted)
|
||||
&& (self.edges[node_idx].successors.is_empty()
|
||||
|| self.edges[node_idx].predecessors.is_empty())
|
||||
{
|
||||
nodes_to_remove.push(node_idx as u16);
|
||||
}
|
||||
|
@ -53,7 +53,7 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
docids |= bitmap;
|
||||
}
|
||||
if *nbr_typos == 0 {
|
||||
if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? {
|
||||
if let Some(bytes) = ctx.get_word_prefix_docids(derivations.original)? {
|
||||
// TODO: deserialize bitmap within a universe
|
||||
let bitmap = universe
|
||||
& RoaringBitmapCodec::bytes_decode(bytes)
|
||||
|
@ -114,7 +114,7 @@ pub fn apply_ranking_rules<'search>(
|
||||
logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
|
||||
ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;
|
||||
|
||||
let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
|
||||
let mut candidates: Vec<RoaringBitmap> = vec![RoaringBitmap::default(); ranking_rules_len];
|
||||
candidates[0] = universe.clone();
|
||||
|
||||
let mut cur_ranking_rule_index = 0;
|
||||
@ -174,7 +174,7 @@ pub fn apply_ranking_rules<'search>(
|
||||
}
|
||||
} else {
|
||||
let candidates =
|
||||
candidates.iter().take(length - results.len()).collect::<Vec<_>>();
|
||||
candidates.iter().take(length - results.len()).collect::<Vec<u32>>();
|
||||
logger.add_to_results(&candidates);
|
||||
results.extend(&candidates);
|
||||
}
|
||||
@ -234,358 +234,3 @@ pub fn apply_ranking_rules<'search>(
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
// use crate::allocator::ALLOC;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Cursor, Seek};
|
||||
use std::time::Instant;
|
||||
|
||||
use big_s::S;
|
||||
use heed::EnvOpenOptions;
|
||||
use maplit::hashset;
|
||||
|
||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
// use crate::search::new::logger::detailed::DetailedSearchLogger;
|
||||
use crate::search::new::logger::DefaultSearchLogger;
|
||||
use crate::search::new::{execute_search, SearchContext};
|
||||
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||
use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
|
||||
|
||||
#[test]
|
||||
fn search_wiki_new() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_wiki").unwrap();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
|
||||
|
||||
// loop {
|
||||
let start = Instant::now();
|
||||
|
||||
// let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
|
||||
let mut ctx = SearchContext::new(&index, &txn);
|
||||
let results = execute_search(
|
||||
&mut ctx,
|
||||
"which a the releases from poison by the government",
|
||||
None,
|
||||
0,
|
||||
20,
|
||||
&mut DefaultSearchLogger,
|
||||
// &mut logger,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// logger.write_d2_description(&mut ctx);
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
println!("{}us", elapsed.as_micros());
|
||||
|
||||
let _documents = index
|
||||
.documents(&txn, results.iter().copied())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(id, obkv)| {
|
||||
let mut object = serde_json::Map::default();
|
||||
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
||||
let value = obkv.get(fid).unwrap();
|
||||
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
||||
object.insert(fid_name.to_owned(), value);
|
||||
}
|
||||
(id, serde_json::to_string_pretty(&object).unwrap())
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
println!("{}us: {:?}", elapsed.as_micros(), results);
|
||||
// }
|
||||
// for (id, _document) in documents {
|
||||
// println!("{id}:");
|
||||
// // println!("{document}");
|
||||
// }
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_wiki_old() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_wiki").unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let rr = index.criteria(&txn).unwrap();
|
||||
println!("{rr:?}");
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.query("which a the releases from poison by the government");
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
|
||||
let docs = s.execute().unwrap();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
|
||||
let documents = index
|
||||
.documents(&txn, docs.documents_ids.iter().copied())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(id, obkv)| {
|
||||
let mut object = serde_json::Map::default();
|
||||
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
||||
let value = obkv.get(fid).unwrap();
|
||||
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
||||
object.insert(fid_name.to_owned(), value);
|
||||
}
|
||||
(id, serde_json::to_string_pretty(&object).unwrap())
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||
for (id, _document) in documents {
|
||||
println!("{id}:");
|
||||
// println!("{document}");
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn search_movies_new() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_movies").unwrap();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
// let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
||||
// let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
||||
// loop {
|
||||
let start = Instant::now();
|
||||
|
||||
let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
|
||||
let mut ctx = SearchContext::new(&index, &txn);
|
||||
let results = execute_search(
|
||||
&mut ctx,
|
||||
"releases from poison by the government",
|
||||
None,
|
||||
0,
|
||||
20,
|
||||
// &mut DefaultSearchLogger,
|
||||
&mut logger,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
logger.write_d2_description(&mut ctx);
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
|
||||
// let ids = index
|
||||
// .documents(&txn, results.iter().copied())
|
||||
// .unwrap()
|
||||
// .into_iter()
|
||||
// .map(|x| {
|
||||
// let obkv = &x.1;
|
||||
// let id = obkv.get(primary_key).unwrap();
|
||||
// let id: serde_json::Value = serde_json::from_slice(id).unwrap();
|
||||
// id.as_str().unwrap().to_owned()
|
||||
// })
|
||||
// .collect::<Vec<_>>();
|
||||
|
||||
println!("{}us: {results:?}", elapsed.as_micros());
|
||||
// println!("external ids: {ids:?}");
|
||||
// }
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_movies_old() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_movies").unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let rr = index.criteria(&txn).unwrap();
|
||||
println!("{rr:?}");
|
||||
|
||||
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
||||
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.query("which a the releases from poison by the government");
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
|
||||
let docs = s.execute().unwrap();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
|
||||
let ids = index
|
||||
.documents(&txn, docs.documents_ids.iter().copied())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|x| {
|
||||
let obkv = &x.1;
|
||||
let id = obkv.get(primary_key).unwrap();
|
||||
let id: serde_json::Value = serde_json::from_slice(id).unwrap();
|
||||
id.as_str().unwrap().to_owned()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||
println!("external ids: {ids:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _settings_movies() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_movies").unwrap();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||
|
||||
builder.set_min_word_len_one_typo(5);
|
||||
builder.set_min_word_len_two_typos(100);
|
||||
builder.set_sortable_fields(hashset! { S("release_date") });
|
||||
builder.set_criteria(vec![
|
||||
Criterion::Words,
|
||||
Criterion::Typo,
|
||||
Criterion::Proximity,
|
||||
Criterion::Asc("release_date".to_owned()),
|
||||
]);
|
||||
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _index_movies() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_movies").unwrap();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let primary_key = "id";
|
||||
let searchable_fields = vec!["title", "overview"];
|
||||
let filterable_fields = vec!["release_date", "genres"];
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||
builder.set_primary_key(primary_key.to_owned());
|
||||
let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
||||
builder.set_filterable_fields(filterable_fields);
|
||||
|
||||
builder.set_min_word_len_one_typo(5);
|
||||
builder.set_min_word_len_two_typos(100);
|
||||
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let indexing_config = IndexDocumentsConfig::default();
|
||||
let builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
|
||||
.unwrap();
|
||||
|
||||
let documents = documents_from(
|
||||
"/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
|
||||
"json",
|
||||
);
|
||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||
user_error.unwrap();
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
}
|
||||
#[test]
|
||||
fn _index_wiki() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_wiki").unwrap();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
// let primary_key = "id";
|
||||
let searchable_fields = vec!["body", "title", "url"];
|
||||
// let filterable_fields = vec![];
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||
// builder.set_primary_key(primary_key.to_owned());
|
||||
let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
// let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
||||
// builder.set_filterable_fields(filterable_fields);
|
||||
|
||||
// builder.set_min_word_len_one_typo(5);
|
||||
// builder.set_min_word_len_two_typos(100);
|
||||
builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let indexing_config =
|
||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||
let builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
|
||||
.unwrap();
|
||||
|
||||
let documents = documents_from(
|
||||
"/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
|
||||
"csv",
|
||||
);
|
||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||
user_error.unwrap();
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
}
|
||||
|
||||
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
|
||||
let reader = File::open(filename)
|
||||
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
|
||||
let reader = BufReader::new(reader);
|
||||
let documents = match filetype {
|
||||
"csv" => documents_from_csv(reader).unwrap(),
|
||||
"json" => documents_from_json(reader).unwrap(),
|
||||
"jsonl" => documents_from_jsonl(reader).unwrap(),
|
||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
||||
};
|
||||
DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
||||
}
|
||||
|
||||
fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
|
||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||
let object = result.unwrap();
|
||||
documents.append_json_object(&object)?;
|
||||
}
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
|
||||
fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
|
||||
documents.append_json_array(reader)?;
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
|
||||
fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
|
||||
let csv = csv::Reader::from_reader(reader);
|
||||
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
documents.append_csv(csv)?;
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
@ -46,7 +46,7 @@ impl<'search> SearchContext<'search> {
|
||||
}
|
||||
}
|
||||
if *use_prefix_db {
|
||||
if let Some(prefix_docids) = self.get_prefix_docids(*original)? {
|
||||
if let Some(prefix_docids) = self.get_word_prefix_docids(*original)? {
|
||||
or_docids.push(prefix_docids);
|
||||
}
|
||||
}
|
||||
|
@ -88,7 +88,8 @@ impl<'search> RankingRule<'search, QueryGraph> for Words {
|
||||
break;
|
||||
} else {
|
||||
let position_to_remove = self.positions_to_remove.pop().unwrap();
|
||||
let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove);
|
||||
let did_delete_any_node =
|
||||
query_graph.remove_words_starting_at_position(position_to_remove);
|
||||
if did_delete_any_node {
|
||||
break;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user