mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
Introduce a customized A* algorithm.
This custom algorithm lazily computes the intersections between words, to avoid too many set operations and database reads.
This commit is contained in:
parent
69285b22d3
commit
a8cda248b4
46
Cargo.lock
generated
46
Cargo.lock
generated
@ -292,12 +292,6 @@ version = "0.1.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
|
checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "fixedbitset"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flate2"
|
name = "flate2"
|
||||||
version = "1.0.14"
|
version = "1.0.14"
|
||||||
@ -642,9 +636,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "indexmap"
|
name = "indexmap"
|
||||||
version = "1.3.2"
|
version = "1.4.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "076f042c5b7b98f31d205f1249267e12a6518c1481e9dae9764af19b707d2292"
|
checksum = "c398b2b113b55809ceb9ee3e753fcbac793f1956663f3c36549c1346015c2afe"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg 1.0.0",
|
"autocfg 1.0.0",
|
||||||
]
|
]
|
||||||
@ -667,15 +661,6 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "itertools"
|
|
||||||
version = "0.8.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
|
|
||||||
dependencies = [
|
|
||||||
"either",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itertools"
|
name = "itertools"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
@ -805,13 +790,13 @@ dependencies = [
|
|||||||
"fst",
|
"fst",
|
||||||
"fxhash",
|
"fxhash",
|
||||||
"heed",
|
"heed",
|
||||||
"itertools 0.9.0",
|
"indexmap",
|
||||||
|
"itertools",
|
||||||
"jemallocator",
|
"jemallocator",
|
||||||
"levenshtein_automata",
|
"levenshtein_automata",
|
||||||
"memmap",
|
"memmap",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"oxidized-mtbl",
|
"oxidized-mtbl",
|
||||||
"pathfinding",
|
|
||||||
"rayon",
|
"rayon",
|
||||||
"roaring",
|
"roaring",
|
||||||
"serde",
|
"serde",
|
||||||
@ -988,15 +973,6 @@ dependencies = [
|
|||||||
"winapi 0.3.8",
|
"winapi 0.3.8",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "num-traits"
|
|
||||||
version = "0.2.11"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "c62be47e61d1842b9170f0fdeec8eba98e60e90e5446449a0545e5152acd7096"
|
|
||||||
dependencies = [
|
|
||||||
"autocfg 1.0.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num_cpus"
|
name = "num_cpus"
|
||||||
version = "1.13.0"
|
version = "1.13.0"
|
||||||
@ -1041,18 +1017,6 @@ dependencies = [
|
|||||||
"winapi 0.3.8",
|
"winapi 0.3.8",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "pathfinding"
|
|
||||||
version = "2.0.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "86f4d8cc85ca67860ef4324faf86973a39e4e1c78338987eda29a8e6b6ec0c0e"
|
|
||||||
dependencies = [
|
|
||||||
"fixedbitset",
|
|
||||||
"indexmap",
|
|
||||||
"itertools 0.8.2",
|
|
||||||
"num-traits",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "percent-encoding"
|
name = "percent-encoding"
|
||||||
version = "2.1.0"
|
version = "2.1.0"
|
||||||
@ -1979,6 +1943,6 @@ checksum = "c442965efc45353be5a9b9969c9b0872fff6828c7e06d118dda2cb2d0bb11d5a"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"cc",
|
"cc",
|
||||||
"glob",
|
"glob",
|
||||||
"itertools 0.9.0",
|
"itertools",
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
@ -28,7 +28,7 @@ structopt = { version = "0.3.14", default-features = false }
|
|||||||
tempfile = "3.1.0"
|
tempfile = "3.1.0"
|
||||||
|
|
||||||
# best proximity
|
# best proximity
|
||||||
pathfinding = "2.0.4"
|
indexmap = "1.4.0"
|
||||||
|
|
||||||
# to implement internally
|
# to implement internally
|
||||||
itertools = "0.9.0"
|
itertools = "0.9.0"
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp;
|
use std::cmp;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use pathfinding::directed::astar::astar_bag;
|
use crate::iter_shortest_paths::astar_bag;
|
||||||
|
|
||||||
const ONE_ATTRIBUTE: u32 = 1000;
|
const ONE_ATTRIBUTE: u32 = 1000;
|
||||||
const MAX_DISTANCE: u32 = 8;
|
const MAX_DISTANCE: u32 = 8;
|
||||||
@ -37,6 +37,8 @@ enum Node {
|
|||||||
position: u32,
|
position: u32,
|
||||||
// The total accumulated proximity until this node, used for skipping nodes.
|
// The total accumulated proximity until this node, used for skipping nodes.
|
||||||
acc_proximity: u32,
|
acc_proximity: u32,
|
||||||
|
// The parent position from the above layer.
|
||||||
|
parent_position: u32,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -44,35 +46,29 @@ impl Node {
|
|||||||
// TODO we must skip the successors that have already been seen
|
// TODO we must skip the successors that have already been seen
|
||||||
// TODO we must skip the successors that don't return any documents
|
// TODO we must skip the successors that don't return any documents
|
||||||
// this way we are able to skip entire paths
|
// this way we are able to skip entire paths
|
||||||
fn successors<F>(
|
fn successors(&self, positions: &[Vec<u32>], best_proximity: u32) -> Vec<(Node, u32)> {
|
||||||
&self,
|
|
||||||
positions: &[Vec<u32>],
|
|
||||||
best_proximity: u32,
|
|
||||||
mut contains_documents: F,
|
|
||||||
) -> Vec<(Node, u32)>
|
|
||||||
where F: FnMut((usize, u32), (usize, u32)) -> bool,
|
|
||||||
{
|
|
||||||
match self {
|
match self {
|
||||||
Node::Uninit => {
|
Node::Uninit => {
|
||||||
positions[0].iter().map(|p| {
|
positions[0].iter().map(|p| {
|
||||||
(Node::Init { layer: 0, position: *p, acc_proximity: 0 }, 0)
|
(Node::Init { layer: 0, position: *p, acc_proximity: 0, parent_position: 0 }, 0)
|
||||||
}).collect()
|
}).collect()
|
||||||
},
|
},
|
||||||
// We reached the highest layer
|
// We reached the highest layer
|
||||||
n @ Node::Init { .. } if n.is_complete(positions) => vec![],
|
n @ Node::Init { .. } if n.is_complete(positions) => vec![],
|
||||||
Node::Init { layer, position, acc_proximity } => {
|
Node::Init { layer, position, acc_proximity, .. } => {
|
||||||
positions[layer + 1].iter().filter_map(|p| {
|
positions[layer + 1].iter().filter_map(|p| {
|
||||||
let proximity = positions_proximity(*position, *p);
|
let proximity = positions_proximity(*position, *p);
|
||||||
let node = Node::Init { layer: layer + 1, position: *p, acc_proximity: acc_proximity + proximity };
|
let node = Node::Init {
|
||||||
if (contains_documents)((*layer, *position), (layer + 1, *p)) {
|
layer: layer + 1,
|
||||||
// We do not produce the nodes we have already seen in previous iterations loops.
|
position: *p,
|
||||||
if node.is_complete(positions) && acc_proximity + proximity < best_proximity {
|
acc_proximity: acc_proximity + proximity,
|
||||||
None
|
parent_position: *position,
|
||||||
} else {
|
};
|
||||||
Some((node, proximity))
|
// We do not produce the nodes we have already seen in previous iterations loops.
|
||||||
}
|
if node.is_complete(positions) && acc_proximity + proximity < best_proximity {
|
||||||
} else {
|
|
||||||
None
|
None
|
||||||
|
} else {
|
||||||
|
Some((node, proximity))
|
||||||
}
|
}
|
||||||
}).collect()
|
}).collect()
|
||||||
}
|
}
|
||||||
@ -92,6 +88,35 @@ impl Node {
|
|||||||
Node::Init { position, .. } => Some(*position),
|
Node::Init { position, .. } => Some(*position),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn proximity(&self) -> u32 {
|
||||||
|
match self {
|
||||||
|
Node::Uninit => 0,
|
||||||
|
Node::Init { layer, position, acc_proximity, parent_position } => {
|
||||||
|
if layer.checked_sub(1).is_some() {
|
||||||
|
acc_proximity + positions_proximity(*position, *parent_position)
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_reachable<F>(&self, mut contains_documents: F) -> bool
|
||||||
|
where F: FnMut((usize, u32), (usize, u32)) -> bool,
|
||||||
|
{
|
||||||
|
match self {
|
||||||
|
Node::Uninit => true,
|
||||||
|
Node::Init { layer, position, parent_position, .. } => {
|
||||||
|
match layer.checked_sub(1) {
|
||||||
|
Some(parent_layer) => {
|
||||||
|
(contains_documents)((parent_layer, *parent_position), (*layer, *position))
|
||||||
|
},
|
||||||
|
None => true,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct BestProximity<F> {
|
pub struct BestProximity<F> {
|
||||||
@ -102,7 +127,7 @@ pub struct BestProximity<F> {
|
|||||||
|
|
||||||
impl<F> BestProximity<F> {
|
impl<F> BestProximity<F> {
|
||||||
pub fn new(positions: Vec<Vec<u32>>, contains_documents: F) -> BestProximity<F> {
|
pub fn new(positions: Vec<Vec<u32>>, contains_documents: F) -> BestProximity<F> {
|
||||||
let best_proximity = positions.len() as u32 - 1;
|
let best_proximity = (positions.len() as u32).saturating_sub(1);
|
||||||
BestProximity { positions, best_proximity, contains_documents }
|
BestProximity { positions, best_proximity, contains_documents }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -121,9 +146,12 @@ where F: FnMut((usize, u32), (usize, u32)) -> bool + Copy,
|
|||||||
|
|
||||||
let result = astar_bag(
|
let result = astar_bag(
|
||||||
&Node::Uninit, // start
|
&Node::Uninit, // start
|
||||||
|n| n.successors(&self.positions, self.best_proximity, self.contains_documents),
|
|n| n.successors(&self.positions, self.best_proximity),
|
||||||
|_| 0, // heuristic
|
|_| 0, // heuristic
|
||||||
|n| n.is_complete(&self.positions), // success
|
|n| { // success
|
||||||
|
let c = n.is_complete(&self.positions) && n.proximity() >= self.best_proximity;
|
||||||
|
if n.is_reachable(self.contains_documents) { Some(c) } else { None }
|
||||||
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
eprintln!("BestProximity::next() took {:.02?}", before.elapsed());
|
eprintln!("BestProximity::next() took {:.02?}", before.elapsed());
|
||||||
|
204
src/iter_shortest_paths.rs
Normal file
204
src/iter_shortest_paths.rs
Normal file
@ -0,0 +1,204 @@
|
|||||||
|
use std::cmp::Ordering;
|
||||||
|
use std::collections::{BinaryHeap, HashSet};
|
||||||
|
use std::hash::Hash;
|
||||||
|
use std::usize;
|
||||||
|
|
||||||
|
use indexmap::map::Entry::{Occupied, Vacant};
|
||||||
|
use indexmap::IndexMap;
|
||||||
|
|
||||||
|
pub fn astar_bag<N, FN, IN, FH, FS>(
|
||||||
|
start: &N,
|
||||||
|
mut successors: FN,
|
||||||
|
mut heuristic: FH,
|
||||||
|
mut success: FS,
|
||||||
|
) -> Option<(AstarSolution<N>, u32)>
|
||||||
|
where
|
||||||
|
N: Eq + Hash + Clone,
|
||||||
|
FN: FnMut(&N) -> IN,
|
||||||
|
IN: IntoIterator<Item = (N, u32)>,
|
||||||
|
FH: FnMut(&N) -> u32,
|
||||||
|
FS: FnMut(&N) -> Option<bool>,
|
||||||
|
{
|
||||||
|
let mut to_see = BinaryHeap::new();
|
||||||
|
let mut min_cost = None;
|
||||||
|
let mut sinks = HashSet::new();
|
||||||
|
to_see.push(SmallestCostHolder {
|
||||||
|
estimated_cost: heuristic(start),
|
||||||
|
cost: 0,
|
||||||
|
index: 0,
|
||||||
|
});
|
||||||
|
let mut parents: IndexMap<N, (HashSet<usize>, u32)> = IndexMap::new();
|
||||||
|
parents.insert(start.clone(), (HashSet::new(), 0));
|
||||||
|
while let Some(SmallestCostHolder { cost, index, estimated_cost, .. }) = to_see.pop() {
|
||||||
|
if let Some(min_cost) = min_cost {
|
||||||
|
if estimated_cost > min_cost {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let successors = {
|
||||||
|
let (node, &(_, c)) = parents.get_index(index).unwrap();
|
||||||
|
// We check that the node is even reachable and, if so, whether it is an answer.
|
||||||
|
// If this node is unreachable we skip it.
|
||||||
|
match success(node) {
|
||||||
|
Some(success) => if success {
|
||||||
|
min_cost = Some(cost);
|
||||||
|
sinks.insert(index);
|
||||||
|
},
|
||||||
|
None => continue,
|
||||||
|
}
|
||||||
|
|
||||||
|
// We may have inserted a node several times into the binary heap if we found
|
||||||
|
// a better way to access it. Ensure that we are currently dealing with the
|
||||||
|
// best path and discard the others.
|
||||||
|
if cost > c {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
successors(node)
|
||||||
|
};
|
||||||
|
for (successor, move_cost) in successors {
|
||||||
|
let new_cost = cost + move_cost;
|
||||||
|
let h; // heuristic(&successor)
|
||||||
|
let n; // index for successor
|
||||||
|
match parents.entry(successor) {
|
||||||
|
Vacant(e) => {
|
||||||
|
h = heuristic(e.key());
|
||||||
|
n = e.index();
|
||||||
|
let mut p = HashSet::new();
|
||||||
|
p.insert(index);
|
||||||
|
e.insert((p, new_cost));
|
||||||
|
}
|
||||||
|
Occupied(mut e) => {
|
||||||
|
if e.get().1 > new_cost {
|
||||||
|
h = heuristic(e.key());
|
||||||
|
n = e.index();
|
||||||
|
let s = e.get_mut();
|
||||||
|
s.0.clear();
|
||||||
|
s.0.insert(index);
|
||||||
|
s.1 = new_cost;
|
||||||
|
} else {
|
||||||
|
if e.get().1 == new_cost {
|
||||||
|
// New parent with an identical cost, this is not
|
||||||
|
// considered as an insertion.
|
||||||
|
e.get_mut().0.insert(index);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
to_see.push(SmallestCostHolder {
|
||||||
|
estimated_cost: new_cost + h,
|
||||||
|
cost: new_cost,
|
||||||
|
index: n,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
min_cost.map(|cost| {
|
||||||
|
let parents = parents
|
||||||
|
.into_iter()
|
||||||
|
.map(|(k, (ps, _))| (k, ps.into_iter().collect()))
|
||||||
|
.collect();
|
||||||
|
(
|
||||||
|
AstarSolution {
|
||||||
|
sinks: sinks.into_iter().collect(),
|
||||||
|
parents,
|
||||||
|
current: vec![],
|
||||||
|
terminated: false,
|
||||||
|
},
|
||||||
|
cost,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
struct SmallestCostHolder<K> {
|
||||||
|
estimated_cost: K,
|
||||||
|
cost: K,
|
||||||
|
index: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<K: PartialEq> PartialEq for SmallestCostHolder<K> {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
self.estimated_cost.eq(&other.estimated_cost) && self.cost.eq(&other.cost)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<K: PartialEq> Eq for SmallestCostHolder<K> {}
|
||||||
|
|
||||||
|
impl<K: Ord> PartialOrd for SmallestCostHolder<K> {
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
|
Some(self.cmp(other))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<K: Ord> Ord for SmallestCostHolder<K> {
|
||||||
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
|
match other.estimated_cost.cmp(&self.estimated_cost) {
|
||||||
|
Ordering::Equal => self.cost.cmp(&other.cost),
|
||||||
|
s => s,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterator structure created by the `astar_bag` function.
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct AstarSolution<N> {
|
||||||
|
sinks: Vec<usize>,
|
||||||
|
parents: Vec<(N, Vec<usize>)>,
|
||||||
|
current: Vec<Vec<usize>>,
|
||||||
|
terminated: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<N: Clone + Eq + Hash> AstarSolution<N> {
|
||||||
|
fn complete(&mut self) {
|
||||||
|
loop {
|
||||||
|
let ps = match self.current.last() {
|
||||||
|
None => self.sinks.clone(),
|
||||||
|
Some(last) => {
|
||||||
|
let &top = last.last().unwrap();
|
||||||
|
self.parents(top).clone()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if ps.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
self.current.push(ps);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next_vec(&mut self) {
|
||||||
|
while self.current.last().map(Vec::len) == Some(1) {
|
||||||
|
self.current.pop();
|
||||||
|
}
|
||||||
|
self.current.last_mut().map(Vec::pop);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn node(&self, i: usize) -> &N {
|
||||||
|
&self.parents[i].0
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parents(&self, i: usize) -> &Vec<usize> {
|
||||||
|
&self.parents[i].1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<N: Clone + Eq + Hash> Iterator for AstarSolution<N> {
|
||||||
|
type Item = Vec<N>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
if self.terminated {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
self.complete();
|
||||||
|
let path = self
|
||||||
|
.current
|
||||||
|
.iter()
|
||||||
|
.rev()
|
||||||
|
.map(|v| v.last().cloned().unwrap())
|
||||||
|
.map(|i| self.node(i).clone())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
self.next_vec();
|
||||||
|
self.terminated = self.current.is_empty();
|
||||||
|
Some(path)
|
||||||
|
}
|
||||||
|
}
|
@ -1,4 +1,5 @@
|
|||||||
mod best_proximity;
|
mod best_proximity;
|
||||||
|
mod iter_shortest_paths;
|
||||||
mod query_tokens;
|
mod query_tokens;
|
||||||
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
Loading…
Reference in New Issue
Block a user