mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 03:37:28 +01:00
squash-me
This commit is contained in:
parent
5d5b827f1a
commit
13977d9338
41
Cargo.lock
generated
41
Cargo.lock
generated
@ -292,6 +292,12 @@ version = "0.1.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
|
checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fixedbitset"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flate2"
|
name = "flate2"
|
||||||
version = "1.0.14"
|
version = "1.0.14"
|
||||||
@ -661,6 +667,15 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itertools"
|
||||||
|
version = "0.8.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
|
||||||
|
dependencies = [
|
||||||
|
"either",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itertools"
|
name = "itertools"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
@ -790,12 +805,13 @@ dependencies = [
|
|||||||
"fst",
|
"fst",
|
||||||
"fxhash",
|
"fxhash",
|
||||||
"heed",
|
"heed",
|
||||||
"itertools",
|
"itertools 0.9.0",
|
||||||
"jemallocator",
|
"jemallocator",
|
||||||
"levenshtein_automata",
|
"levenshtein_automata",
|
||||||
"memmap",
|
"memmap",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"oxidized-mtbl",
|
"oxidized-mtbl",
|
||||||
|
"pathfinding",
|
||||||
"rayon",
|
"rayon",
|
||||||
"roaring",
|
"roaring",
|
||||||
"serde",
|
"serde",
|
||||||
@ -972,6 +988,15 @@ dependencies = [
|
|||||||
"winapi 0.3.8",
|
"winapi 0.3.8",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "num-traits"
|
||||||
|
version = "0.2.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c62be47e61d1842b9170f0fdeec8eba98e60e90e5446449a0545e5152acd7096"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg 1.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num_cpus"
|
name = "num_cpus"
|
||||||
version = "1.13.0"
|
version = "1.13.0"
|
||||||
@ -1016,6 +1041,18 @@ dependencies = [
|
|||||||
"winapi 0.3.8",
|
"winapi 0.3.8",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pathfinding"
|
||||||
|
version = "2.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "86f4d8cc85ca67860ef4324faf86973a39e4e1c78338987eda29a8e6b6ec0c0e"
|
||||||
|
dependencies = [
|
||||||
|
"fixedbitset",
|
||||||
|
"indexmap",
|
||||||
|
"itertools 0.8.2",
|
||||||
|
"num-traits",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "percent-encoding"
|
name = "percent-encoding"
|
||||||
version = "2.1.0"
|
version = "2.1.0"
|
||||||
@ -1942,6 +1979,6 @@ checksum = "c442965efc45353be5a9b9969c9b0872fff6828c7e06d118dda2cb2d0bb11d5a"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"cc",
|
"cc",
|
||||||
"glob",
|
"glob",
|
||||||
"itertools",
|
"itertools 0.9.0",
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
@ -27,6 +27,9 @@ smallvec = "1.4.0"
|
|||||||
structopt = { version = "0.3.14", default-features = false }
|
structopt = { version = "0.3.14", default-features = false }
|
||||||
tempfile = "3.1.0"
|
tempfile = "3.1.0"
|
||||||
|
|
||||||
|
# best proximity
|
||||||
|
pathfinding = "2.0.4"
|
||||||
|
|
||||||
# to implement internally
|
# to implement internally
|
||||||
itertools = "0.9.0"
|
itertools = "0.9.0"
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use std::cmp;
|
use std::cmp;
|
||||||
|
use pathfinding::directed::dijkstra::dijkstra;
|
||||||
|
|
||||||
const ONE_ATTRIBUTE: u32 = 1000;
|
const ONE_ATTRIBUTE: u32 = 1000;
|
||||||
const MAX_INDEX: u32 = ONE_ATTRIBUTE - 1;
|
const MAX_INDEX: u32 = ONE_ATTRIBUTE - 1;
|
||||||
@ -29,107 +30,40 @@ fn construct_position(attr: u32, index: u32) -> u32 {
|
|||||||
attr * ONE_ATTRIBUTE + index
|
attr * ONE_ATTRIBUTE + index
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO we should use an sdset::Set for `next_positions`.
|
#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||||
// TODO We must not recursively search for the best proximity but return None if proximity is not found.
|
struct Path(Vec<u32>);
|
||||||
// Returns the positions to focus that will give the best possible proximity.
|
|
||||||
fn best_proximity_for(current_position: u32, proximity: u32, next_positions: &[u32]) -> Option<(u32, Vec<u32>)> {
|
|
||||||
let (current_attr, _) = extract_position(current_position);
|
|
||||||
|
|
||||||
match proximity {
|
impl Path {
|
||||||
// look at i+0
|
fn new(positions: &[Vec<u32>]) -> Option<Path> {
|
||||||
0 => {
|
let position = positions.first()?.first()?;
|
||||||
match next_positions.binary_search(¤t_position) {
|
Some(Path(vec![*position]))
|
||||||
Ok(_) => Some((0, vec![current_position])),
|
|
||||||
Err(_) => best_proximity_for(current_position, proximity + 1, next_positions),
|
|
||||||
}
|
|
||||||
},
|
|
||||||
// look at i+1
|
|
||||||
1 => {
|
|
||||||
let position = current_position + 1;
|
|
||||||
let (attr, _) = extract_position(position);
|
|
||||||
|
|
||||||
// We must check that we did not overflow the current attribute. If so,
|
|
||||||
// we must look for a bigger proximity, which we may be able to find behind.
|
|
||||||
if current_attr == attr {
|
|
||||||
match next_positions.binary_search(&position) {
|
|
||||||
Ok(_) => Some((1, vec![position])),
|
|
||||||
Err(_) => best_proximity_for(current_position, proximity + 1, next_positions),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
best_proximity_for(current_position, proximity + 1, next_positions)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
// look at i-(p-1), i+p
|
|
||||||
2..=7 => {
|
|
||||||
let mut output = Vec::new();
|
|
||||||
|
|
||||||
// Behind the current_position
|
|
||||||
if let Some(position) = current_position.checked_sub(proximity - 1) {
|
|
||||||
let (attr, _) = extract_position(position);
|
|
||||||
// We must make sure we are not looking at a word at the end of another attribute.
|
|
||||||
if current_attr == attr && next_positions.binary_search(&position).is_ok() {
|
|
||||||
output.push(position);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// In front of the current_position
|
fn successors(&self, _positions: &[Vec<u32>]) -> Vec<(Path, u32)> {
|
||||||
let position = current_position + proximity;
|
vec![]
|
||||||
let (attr, _) = extract_position(position);
|
|
||||||
// We must make sure we are not looking at a word at the start of the next attribute.
|
|
||||||
if current_attr == attr && next_positions.binary_search(&position).is_ok() {
|
|
||||||
output.push(position);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if output.is_empty() {
|
fn proximity(&self) -> u32 {
|
||||||
best_proximity_for(current_position, proximity + 1, next_positions)
|
self.0.windows(2).map(|ps| positions_proximity(ps[0], ps[1])).sum::<u32>()
|
||||||
} else {
|
|
||||||
Some((proximity, output))
|
|
||||||
}
|
|
||||||
},
|
|
||||||
// look at i+8 and all above and i-(8-1) and all below
|
|
||||||
8 => {
|
|
||||||
let mut output = Vec::new();
|
|
||||||
|
|
||||||
// Make sure we look at the latest index of the previous attr.
|
|
||||||
if let Some(previous_position) = construct_position(current_attr, 0).checked_sub(1) {
|
|
||||||
let position = current_position.saturating_sub(7).max(previous_position);
|
|
||||||
match dbg!(next_positions.binary_search(&position)) {
|
|
||||||
Ok(i) => output.extend_from_slice(&next_positions[..=i]),
|
|
||||||
Err(i) => if let Some(i) = i.checked_sub(1) {
|
|
||||||
if let Some(positions) = next_positions.get(..=i) {
|
|
||||||
output.extend_from_slice(positions)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Make sure the position doesn't overflow to the next attribute.
|
fn is_complete(&self, positions: &[Vec<u32>]) -> bool {
|
||||||
let position = (current_position + 8).min(construct_position(current_attr + 1, 0));
|
positions.len() == self.0.len()
|
||||||
match next_positions.binary_search(&position) {
|
|
||||||
Ok(i) => output.extend_from_slice(&next_positions[i..]),
|
|
||||||
Err(i) => if let Some(positions) = next_positions.get(i..) {
|
|
||||||
output.extend_from_slice(positions);
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
if output.is_empty() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some((8, output))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => None,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct BestProximity {
|
pub struct BestProximity {
|
||||||
positions: Vec<Vec<u32>>,
|
positions: Vec<Vec<u32>>,
|
||||||
best_proximities: Option<Vec<u32>>,
|
best_proximity: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BestProximity {
|
impl BestProximity {
|
||||||
pub fn new(positions: Vec<Vec<u32>>) -> BestProximity {
|
pub fn new(positions: Vec<Vec<u32>>) -> BestProximity {
|
||||||
BestProximity { positions, best_proximities: None }
|
BestProximity { positions, best_proximity: 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_path_successful(&self, path: &Path) -> bool {
|
||||||
|
path.is_complete(&self.positions) && path.proximity() >= self.best_proximity
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,59 +71,44 @@ impl Iterator for BestProximity {
|
|||||||
type Item = (u32, Vec<Vec<u32>>);
|
type Item = (u32, Vec<Vec<u32>>);
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
match &mut self.best_proximities {
|
let mut output: Option<(u32, Vec<Vec<u32>>)> = None;
|
||||||
Some(best_proximities) => {
|
|
||||||
let expected_proximity = best_proximities.iter().sum::<u32>() + 1;
|
|
||||||
dbg!(expected_proximity);
|
|
||||||
|
|
||||||
for (i, (win, proximity)) in self.positions.windows(2).zip(best_proximities.iter()).enumerate() {
|
unimplemented!("we must use and update self.best_proximity");
|
||||||
let (posa, posb) = (&win[0], &win[1]);
|
|
||||||
dbg!(proximity, posa, posb);
|
loop {
|
||||||
let expected_proximity = proximity + 1;
|
let start = Path::new(&self.positions)?;
|
||||||
let best_proximity = posa.iter().filter_map(|pa| {
|
let result = dijkstra(
|
||||||
best_proximity_for(*pa, expected_proximity, posb).map(|res| (*pa, res))
|
&start,
|
||||||
}).min();
|
|p| p.successors(&self.positions),
|
||||||
dbg!(best_proximity);
|
|p| self.is_path_successful(p) && output.as_ref().map_or(true, |paths| !paths.1.contains(&p.0)),
|
||||||
|
);
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Some((mut paths, proximity)) => {
|
||||||
|
let positions = paths.pop().unwrap();
|
||||||
|
|
||||||
|
// Merge this path into the current output, or start the output with it.
|
||||||
|
match &mut output {
|
||||||
|
Some((best_proximity, paths)) => {
|
||||||
|
// If the shortest path we found is bigger than the one requested
|
||||||
|
// it means that we found all the paths with the same proximity and can
|
||||||
|
// return those to the user.
|
||||||
|
if proximity > *best_proximity {
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
None
|
// We add the new path to the output list as this path is known
|
||||||
|
// to have the requested proximity.
|
||||||
|
paths.push(positions.0);
|
||||||
},
|
},
|
||||||
None => {
|
None => output = Some((positions.proximity(), vec![positions.0])),
|
||||||
let expected_proximity = 0;
|
}
|
||||||
let mut best_results = Vec::new();
|
|
||||||
|
|
||||||
for win in self.positions.windows(2) {
|
|
||||||
let (posa, posb) = (&win[0], &win[1]);
|
|
||||||
match best_results.last() {
|
|
||||||
Some((start, _)) => {
|
|
||||||
// We know from where we must continue searching for the best path.
|
|
||||||
let (best_proximity, positions) = dbg!(best_proximity_for(*start, expected_proximity, posb).unwrap());
|
|
||||||
best_results.push((positions[0], best_proximity));
|
|
||||||
},
|
},
|
||||||
None => {
|
None => break,
|
||||||
// This is the first loop, we need to find the best start of the path.
|
|
||||||
let best_proximity = posa.iter().filter_map(|pa| {
|
|
||||||
best_proximity_for(*pa, expected_proximity, posb).map(|res| (*pa, res))
|
|
||||||
}).min();
|
|
||||||
let (pa, (best_proximity, positions)) = best_proximity.unwrap();
|
|
||||||
// We must save the best start of path we found.
|
|
||||||
best_results.push((pa, 0));
|
|
||||||
// And the next associated position along with the proximity between those.
|
|
||||||
best_results.push((positions[0], best_proximity));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if best_results.is_empty() {
|
output
|
||||||
None
|
|
||||||
} else {
|
|
||||||
let proximity = best_results.windows(2).map(|ps| positions_proximity(ps[0].0, ps[1].0)).sum::<u32>();
|
|
||||||
self.best_proximities = Some(best_results.iter().skip(1).map(|(_, p)| *p).collect());
|
|
||||||
let best_positions = best_results.into_iter().map(|(x, _)| vec![x]).collect();
|
|
||||||
Some((proximity, best_positions))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -217,26 +136,4 @@ mod tests {
|
|||||||
// assert_eq!(iter.next(), Some((4+5, vec![4, 1, 6]))); // 9
|
// assert_eq!(iter.next(), Some((4+5, vec![4, 1, 6]))); // 9
|
||||||
// assert_eq!(iter.next(), None);
|
// assert_eq!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn easy_best_proximity_for() {
|
|
||||||
// classic
|
|
||||||
assert_eq!(best_proximity_for(0, 0, &[0]), Some((0, vec![0])));
|
|
||||||
assert_eq!(best_proximity_for(0, 1, &[0]), None);
|
|
||||||
assert_eq!(best_proximity_for(1, 1, &[0]), Some((2, vec![0])));
|
|
||||||
assert_eq!(best_proximity_for(0, 1, &[0, 1]), Some((1, vec![1])));
|
|
||||||
assert_eq!(best_proximity_for(1, 1, &[0, 2]), Some((1, vec![2])));
|
|
||||||
assert_eq!(best_proximity_for(1, 2, &[0, 2]), Some((2, vec![0])));
|
|
||||||
assert_eq!(best_proximity_for(1, 2, &[0, 3]), Some((2, vec![0, 3])));
|
|
||||||
|
|
||||||
// limits
|
|
||||||
assert_eq!(best_proximity_for(2, 7, &[0, 9]), Some((7, vec![9])));
|
|
||||||
assert_eq!(best_proximity_for(12, 7, &[6, 19]), Some((7, vec![6, 19])));
|
|
||||||
|
|
||||||
// another attribute
|
|
||||||
assert_eq!(best_proximity_for(1000, 7, &[994, 1007]), Some((7, vec![1007])));
|
|
||||||
assert_eq!(best_proximity_for(1004, 7, &[994, 1011]), Some((7, vec![1011])));
|
|
||||||
assert_eq!(best_proximity_for(1004, 8, &[900, 913, 1000, 1012, 2012]), Some((8, vec![900, 913, 1012, 2012])));
|
|
||||||
assert_eq!(best_proximity_for(1009, 8, &[900, 913, 1002, 1012, 2012]), Some((8, vec![900, 913, 1002, 2012])));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user