Use another function to define the proximity

This commit is contained in:
Clément Renault 2020-09-06 17:33:26 +02:00
parent f928b91e9d
commit bb1ab428db
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
4 changed files with 36 additions and 7 deletions

2
Cargo.lock generated
View File

@ -1145,7 +1145,7 @@ dependencies = [
[[package]] [[package]]
name = "near-proximity" name = "near-proximity"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=2f5ad5c#2f5ad5cdafde54731cd75d17ec6228ea3ca1f9b4" source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=6608205#66082058537f6fe7709adc4690048d62f3c0e9b7"
dependencies = [ dependencies = [
"tinyvec", "tinyvec",
] ]

View File

@ -22,7 +22,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
memmap = "0.7.0" memmap = "0.7.0"
once_cell = "1.4.0" once_cell = "1.4.0"
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" } oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" }
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "2f5ad5c" } near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
rayon = "1.3.1" rayon = "1.3.1"
ringtail = "0.3.0" ringtail = "0.3.0"
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" } roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }

View File

@ -199,13 +199,14 @@ impl Store {
let mut key = vec![WORD_DOCID_POSITIONS_BYTE]; let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
let mut buffer = Vec::new(); let mut buffer = Vec::new();
// We serialize the document ids into a buffer
// We prefix the words by the document id. // We prefix the words by the document id.
key.extend_from_slice(&id.to_be_bytes()); key.extend_from_slice(&id.to_be_bytes());
let base_size = key.len();
for (word, positions) in iter { for (word, positions) in iter {
key.truncate(1 + 4); key.truncate(base_size);
key.extend_from_slice(word.as_bytes()); key.extend_from_slice(word.as_bytes());
// We serialize the positions into a buffer.
buffer.clear(); buffer.clear();
buffer.reserve(positions.serialized_size()); buffer.reserve(positions.serialized_size());
positions.serialize_into(&mut buffer)?; positions.serialize_into(&mut buffer)?;

View File

@ -1,4 +1,5 @@
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::cmp;
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA; use levenshtein_automata::DFA;
@ -10,7 +11,7 @@ use roaring::bitmap::{IntoIter, RoaringBitmap};
use near_proximity::near_proximity; use near_proximity::near_proximity;
use crate::query_tokens::{QueryTokens, QueryToken}; use crate::query_tokens::{QueryTokens, QueryToken};
use crate::{Index, DocumentId}; use crate::{Index, DocumentId, Position};
// Building these factories is not free. // Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@ -153,7 +154,7 @@ impl<'a> Search<'a> {
if docids.contains(candidate) { if docids.contains(candidate) {
match index.docid_word_positions.get(rtxn, &(candidate, word))? { match index.docid_word_positions.get(rtxn, &(candidate, word))? {
Some(positions) => union_positions.union_with(&positions), Some(positions) => union_positions.union_with(&positions),
None => error!("position missing for candidate {} and word {}", candidate, word), None => error!("position missing for candidate {} and word {:?}", candidate, word),
} }
} }
} }
@ -194,10 +195,37 @@ impl<'a> Search<'a> {
let min_proximity = derived_words.len() as u32 - 1; let min_proximity = derived_words.len() as u32 - 1;
let mut number_min_proximity = 0; let mut number_min_proximity = 0;
// TODO move this function elsewhere
/// Computes the total proximity of a path of positions.
///
/// The result is the sum of the proximities of every pair of consecutive
/// positions in `path`; a path with fewer than two positions yields `0`.
fn compute_proximity(path: &[Position]) -> u32 {
    // A position is split into an attribute part (`position / ONE_ATTRIBUTE`)
    // and an index part (`position % ONE_ATTRIBUTE`).
    const ONE_ATTRIBUTE: u32 = 1000;
    // Upper bound of the proximity between two consecutive positions.
    const MAX_DISTANCE: u32 = 8;

    // Splits a position into its `(attribute, index)` parts.
    fn extract_position(position: u32) -> (u32, u32) {
        let attribute = position / ONE_ATTRIBUTE;
        let index = position % ONE_ATTRIBUTE;
        (attribute, index)
    }

    // Distance between two indexes, capped to `MAX_DISTANCE`.
    // When the left index comes after the right one, the distance
    // is increased by one before being capped.
    fn index_proximity(lhs: u32, rhs: u32) -> u32 {
        let distance = if lhs > rhs { (lhs - rhs) + 1 } else { rhs - lhs };
        cmp::min(distance, MAX_DISTANCE)
    }

    // Proximity between two positions: `MAX_DISTANCE` when the attribute
    // parts differ, the index proximity otherwise.
    fn positions_proximity(lhs: u32, rhs: u32) -> u32 {
        match (extract_position(lhs), extract_position(rhs)) {
            ((lhs_attr, lhs_index), (rhs_attr, rhs_index)) if lhs_attr == rhs_attr => {
                index_proximity(lhs_index, rhs_index)
            }
            _ => MAX_DISTANCE,
        }
    }

    // Pair every position with its successor and accumulate the proximities.
    path.iter()
        .zip(path.iter().skip(1))
        .fold(0, |total, (&lhs, &rhs)| total + positions_proximity(lhs, rhs))
}
let mut paths = Vec::new(); let mut paths = Vec::new();
for candidate in candidates { for candidate in candidates {
let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?; let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?;
near_proximity(keywords, &mut paths); near_proximity(keywords, &mut paths, compute_proximity);
if let Some((prox, _path)) = paths.first() { if let Some((prox, _path)) = paths.first() {
documents.push((*prox, candidate)); documents.push((*prox, candidate));
if *prox == min_proximity { if *prox == min_proximity {