Introduce the plane-sweep algorithm

This commit is contained in:
Clément Renault 2020-09-05 18:25:27 +02:00
parent dc88a86259
commit 1c504471d3
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 73 additions and 3 deletions

24
Cargo.lock generated
View File

@ -997,6 +997,7 @@ dependencies = [
"levenshtein_automata",
"log 0.4.11",
"memmap",
"near-proximity",
"once_cell",
"oxidized-mtbl",
"rayon",
@ -1141,6 +1142,14 @@ dependencies = [
"twoway",
]
[[package]]
name = "near-proximity"
version = "0.1.0"
source = "git+https://github.com/Kerollmops/plane-sweep-proximity#f6c9e7e5fc1f7b456d080981e877d0e7943f82bd"
dependencies = [
"tinyvec",
]
[[package]]
name = "net2"
version = "0.2.34"
@ -1964,6 +1973,21 @@ dependencies = [
"serde_json",
]
[[package]]
name = "tinyvec"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f331a553cacb14e99d183e5573c86044dd177b5a5277b21e562fd1bd5e1076e1"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "0.2.21"

View File

@ -22,6 +22,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
memmap = "0.7.0"
once_cell = "1.4.0"
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" }
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity" }
rayon = "1.3.1"
ringtail = "0.3.0"
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }

View File

@ -5,7 +5,9 @@ use levenshtein_automata::DFA;
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use log::debug;
use once_cell::sync::Lazy;
use roaring::RoaringBitmap;
use roaring::bitmap::{IntoIter, RoaringBitmap};
use near_proximity::near_proximity;
use crate::query_tokens::{QueryTokens, QueryToken};
use crate::{Index, DocumentId, Position, Attribute};
@ -136,6 +138,31 @@ impl<'a> Search<'a> {
Ok(candidates)
}
/// Gathers, for a single candidate document, the positions of each query keyword.
///
/// `derived_words` holds one entry per query keyword: a map from every word
/// derived from that keyword to a `(distance, docids)` pair, alongside a second
/// bitmap that this function does not use. For each entry, the positions of all
/// derived words whose `docids` contain `candidate` are unioned together.
///
/// Returns one positions iterator per entry of `derived_words`, in the same
/// order. An entry for which no derived word appears in `candidate` yields an
/// empty iterator.
///
/// # Errors
///
/// Returns an error if the `word_docid_positions` lookup fails, or if a word
/// whose `docids` contain `candidate` has no positions entry for that document
/// (previously this case panicked through `unwrap`).
///
/// NOTE(review): the name is a typo of `fetch_keywords`; it is kept as is
/// because callers reference it under this spelling.
fn fecth_keywords(
    rtxn: &heed::RoTxn,
    index: &Index,
    derived_words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
    candidate: DocumentId,
) -> anyhow::Result<Vec<IntoIter>>
{
    let mut keywords = Vec::with_capacity(derived_words.len());
    for (words, _) in derived_words {
        let mut union_positions = RoaringBitmap::new();
        for (word, (_distance, docids)) in words {
            // Only look up positions for words that actually occur in the candidate.
            if docids.contains(candidate) {
                let positions = index
                    .word_docid_positions
                    .get(rtxn, &(word, candidate))?
                    // The docids bitmap says the word is in this document, so a
                    // missing entry means the two stores disagree: report it
                    // instead of aborting the whole process.
                    .ok_or_else(|| anyhow::anyhow!(
                        "missing positions for word {:?} in document {:?}",
                        word,
                        candidate,
                    ))?;
                union_positions.union_with(&positions);
            }
        }
        keywords.push(union_positions.into_iter());
    }
    Ok(keywords)
}
pub fn execute(&self) -> anyhow::Result<SearchResult> {
let rtxn = self.rtxn;
let index = self.index;
@ -162,10 +189,28 @@ impl<'a> Search<'a> {
debug!("candidates: {:?}", candidates);
let documents = vec![candidates];
let mut documents = Vec::new();
let min_proximity = derived_words.len() as u32;
let mut number_min_proximity = 0;
let mut paths = Vec::new();
for candidate in candidates {
let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?;
near_proximity(keywords, &mut paths);
if let Some((prox, _path)) = paths.first() {
documents.push((*prox, candidate));
if *prox == min_proximity {
number_min_proximity += 1;
if number_min_proximity >= limit { break }
}
}
}
documents.sort_unstable_by_key(|(prox, _)| *prox);
let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
let documents_ids = documents.iter().flatten().take(limit).collect();
let documents_ids = documents.into_iter().map(|(_, id)| id).take(limit).collect();
Ok(SearchResult { found_words, documents_ids })
}