diff --git a/Cargo.lock b/Cargo.lock index b68215ac9..390983638 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1145,7 +1145,7 @@ dependencies = [ [[package]] name = "near-proximity" version = "0.1.0" -source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=2f5ad5c#2f5ad5cdafde54731cd75d17ec6228ea3ca1f9b4" +source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=6608205#66082058537f6fe7709adc4690048d62f3c0e9b7" dependencies = [ "tinyvec", ] diff --git a/Cargo.toml b/Cargo.toml index 4e130c583..39e155d87 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } memmap = "0.7.0" once_cell = "1.4.0" oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" } -near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "2f5ad5c" } +near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } rayon = "1.3.1" ringtail = "0.3.0" roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" } diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 1a37874c0..e66d63d7d 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -199,13 +199,14 @@ impl Store { let mut key = vec![WORD_DOCID_POSITIONS_BYTE]; let mut buffer = Vec::new(); - // We serialize the document ids into a buffer // We prefix the words by the document id. key.extend_from_slice(&id.to_be_bytes()); + let base_size = key.len(); for (word, positions) in iter { - key.truncate(1 + 4); + key.truncate(base_size); key.extend_from_slice(word.as_bytes()); + // We serialize the positions into a buffer. 
buffer.clear(); buffer.reserve(positions.serialized_size()); positions.serialize_into(&mut buffer)?; diff --git a/src/search.rs b/src/search.rs index 9758862e7..a545ab77d 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::cmp; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; @@ -10,7 +11,7 @@ use roaring::bitmap::{IntoIter, RoaringBitmap}; use near_proximity::near_proximity; use crate::query_tokens::{QueryTokens, QueryToken}; -use crate::{Index, DocumentId}; +use crate::{Index, DocumentId, Position}; // Building these factories is not free. static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); @@ -153,7 +154,7 @@ impl<'a> Search<'a> { if docids.contains(candidate) { match index.docid_word_positions.get(rtxn, &(candidate, word))? { Some(positions) => union_positions.union_with(&positions), - None => error!("position missing for candidate {} and word {}", candidate, word), + None => error!("position missing for candidate {} and word {:?}", candidate, word), } } } @@ -194,10 +195,37 @@ impl<'a> Search<'a> { let min_proximity = derived_words.len() as u32 - 1; let mut number_min_proximity = 0; + // TODO move this function elsewhere + fn compute_proximity(path: &[Position]) -> u32 { + const ONE_ATTRIBUTE: u32 = 1000; + const MAX_DISTANCE: u32 = 8; + + fn index_proximity(lhs: u32, rhs: u32) -> u32 { + if lhs <= rhs { + cmp::min(rhs - lhs, MAX_DISTANCE) + } else { + cmp::min((lhs - rhs) + 1, MAX_DISTANCE) + } + } + + fn positions_proximity(lhs: u32, rhs: u32) -> u32 { + let (lhs_attr, lhs_index) = extract_position(lhs); + let (rhs_attr, rhs_index) = extract_position(rhs); + if lhs_attr != rhs_attr { MAX_DISTANCE } + else { index_proximity(lhs_index, rhs_index) } + } + + fn extract_position(position: u32) -> (u32, u32) { + (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE) + } + + path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>() + } + let mut paths = Vec::new(); for 
candidate in candidates { let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?; - near_proximity(keywords, &mut paths); + near_proximity(keywords, &mut paths, compute_proximity); if let Some((prox, _path)) = paths.first() { documents.push((*prox, candidate)); if *prox == min_proximity {