From 0a83a86e655df04710e3ef4291c65cb5269e5ad7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Jun 2020 11:55:03 +0200 Subject: [PATCH] Fix multiple bugs --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/best_proximity.rs | 22 ++++++++----- src/bin/indexer.rs | 8 ++--- src/bin/serve.rs | 6 ++-- src/lib.rs | 73 +++++++++++++++++++++++-------------------- 6 files changed, 63 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7899d6558..b960a47c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1022,7 +1022,7 @@ checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" [[package]] name = "oxidized-mtbl" version = "0.1.0" -source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=6acef3d#6acef3d0fc7fec6a3701038860e51f8bbcee1ee6" +source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=9451be8#9451be8829562f7d1f8d34aa3ecb81c5106a0623" dependencies = [ "byteorder 1.3.4", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index 3d166be15..e19a3fd6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } memmap = "0.7.0" once_cell = "1.4.0" -oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "6acef3d" } +oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "9451be8" } rayon = "1.3.0" roaring = "0.5.2" slice-group-by = "0.2.6" diff --git a/src/best_proximity.rs b/src/best_proximity.rs index 572b5a06e..c1b5f9395 100644 --- a/src/best_proximity.rs +++ b/src/best_proximity.rs @@ -3,6 +3,9 @@ use std::time::Instant; use pathfinding::directed::dijkstra::dijkstra; +use smallvec::smallvec; // the macro +use crate::SmallVec16; + const ONE_ATTRIBUTE: u32 = 1000; const MAX_DISTANCE: u32 = 8; @@ -27,17 +30,17 @@ fn extract_position(position: u32) -> (u32, u32) { } #[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] -struct Path(Vec); +struct Path(SmallVec16); impl Path { fn new(positions: &[Vec]) -> Option { let position = positions.first()?.first()?; - Some(Path(vec![*position])) + Some(Path(smallvec![*position])) } // TODO we must skip the successors that have already been sent - fn successors(&self, positions: &[Vec]) -> Vec<(Path, u32)> { - let mut successors = Vec::new(); + fn successors(&self, positions: &[Vec]) -> SmallVec16<(Path, u32)> { + let mut successors = SmallVec16::new(); // If we can grow or shift the path if self.0.len() < positions.len() { @@ -103,7 +106,12 @@ impl Iterator for BestProximity { let result = dijkstra( &Path::new(&self.positions)?, |p| p.successors(&self.positions), - |p| self.is_path_successful(p) && output.as_ref().map_or(true, |(_, paths)| !paths.contains(&p.0)), + |p| { + self.is_path_successful(p) && + output.as_ref().map_or(true, |(_, paths)| { + !paths.iter().position(|q| q.as_slice() == p.0.as_slice()).is_some() + }) + }, ); match result { @@ -123,9 +131,9 @@ impl Iterator for BestProximity { // We add the new path to the output list as this path is known // to be the requested distance. - paths.push(positions.0); + paths.push(positions.0.to_vec()); }, - None => output = Some((positions.proximity(), vec![positions.0])), + None => output = Some((positions.proximity(), vec![positions.0.to_vec()])), } }, None => break, diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index a92273b03..5b44b3734 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -45,10 +45,10 @@ struct Opt { struct Indexed { fst: fst::Set>, - postings_attrs: FastMap4, - prefix_postings_attrs: FastMap4, - postings_ids: FastMap4>, - prefix_postings_ids: FastMap4>, + postings_attrs: FastMap4, RoaringBitmap>, + prefix_postings_attrs: FastMap4, RoaringBitmap>, + postings_ids: FastMap4, FastMap4>, + prefix_postings_ids: FastMap4, FastMap4>, headers: Vec, documents: Vec<(DocumentId, Vec)>, } diff --git a/src/bin/serve.rs b/src/bin/serve.rs index 5c80b622a..28e5dff71 100644 --- a/src/bin/serve.rs +++ b/src/bin/serve.rs @@ -93,9 +93,9 @@ async fn main() -> anyhow::Result<()> { body.extend_from_slice(headers); for id in documents_ids { - if let Some(content) = index.documents.get(&rtxn, &BEU32::new(id)).unwrap() { - body.extend_from_slice(&content); - } + let content = index.documents.get(&rtxn, &BEU32::new(id)).unwrap(); + let content = content.expect(&format!("could not find document {}", id)); + body.extend_from_slice(&content); } } diff --git a/src/lib.rs b/src/lib.rs index 42ff2a95e..c2888334a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,7 +25,8 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); pub type FastMap4 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; -pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>; +pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; +pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type BEU32 = heed::zerocopy::U32; pub type DocumentId = u32; pub type AttributeId = u32; @@ -89,52 +90,41 @@ impl Index { (word, is_prefix, dfa) }); - let mut words_positions = Vec::new(); + let mut words = Vec::new(); let mut positions = Vec::new(); let before = Instant::now(); - for (word, is_prefix, dfa) in dfas { + for (word, _is_prefix, dfa) in dfas { let mut count = 0; let mut union_positions = RoaringBitmap::default(); - if false && word.len() <= 4 && is_prefix { - if let Some(ids) = self.prefix_postings_attrs.get(rtxn, word.as_bytes())? { - let right = RoaringBitmap::deserialize_from(ids)?; + let mut derived_words = Vec::new(); + // TODO re-enable the prefixes system + let mut stream = fst.search(&dfa).into_stream(); + while let Some(word) = stream.next() { + derived_words.push(word.to_vec()); + let word = std::str::from_utf8(word)?; + if let Some(attrs) = self.postings_attrs.get(rtxn, word)? { + let right = RoaringBitmap::deserialize_from(attrs)?; union_positions.union_with(&right); - count = 1; - } - } else { - let mut stream = fst.search(&dfa).into_stream(); - while let Some(word) = stream.next() { - let word = std::str::from_utf8(word)?; - if let Some(attrs) = self.postings_attrs.get(rtxn, word)? { - let right = RoaringBitmap::deserialize_from(attrs)?; - union_positions.union_with(&right); - count += 1; - } + count += 1; } } eprintln!("{} words for {:?} we have found positions {:?}", count, word, union_positions); - words_positions.push((word, is_prefix, dfa)); + words.push(derived_words); positions.push(union_positions.iter().collect()); } eprintln!("Retrieving words positions took {:.02?}", before.elapsed()); - // TODO re-enable the prefixes system - let mut words = Vec::new(); - for (_word, _is_prefix, dfa) in words_positions { - let mut stream = fst.search(dfa).into_stream(); - let mut derived_words = Vec::new(); - while let Some(word) = stream.next() { - derived_words.push(word.to_vec()); - } - words.push(derived_words); - } - let mut documents = Vec::new(); - 'outer: for (proximity, positions) in BestProximity::new(positions) { + for (proximity, mut positions) in BestProximity::new(positions) { + // TODO we must ignore positions paths that gives nothing + if words.len() > 1 && proximity == 0 { continue } + + positions.sort_unstable(); + let same_prox_before = Instant::now(); let mut same_proximity_union = RoaringBitmap::default(); @@ -177,15 +167,30 @@ impl Index { if let Some(intersect_docids) = intersect_docids { same_proximity_union.union_with(&intersect_docids); } - } - eprintln!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); + // We found enough documents we can stop here + if documents.iter().map(RoaringBitmap::len).sum::() + same_proximity_union.len() >= 20 { + eprintln!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); + break; + } + } documents.push(same_proximity_union); - // We found enough documents we can stop here + // We remove the double occurences of documents. + for i in 0..documents.len() { + if let Some((docs, others)) = documents[..=i].split_last_mut() { + others.iter().for_each(|other| docs.difference_with(other)); + } + } + documents.retain(|rb| !rb.is_empty()); + + eprintln!("documents: {:?}", documents); + eprintln!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); + + // We found enough documents we can stop here. if documents.iter().map(RoaringBitmap::len).sum::() >= 20 { - break 'outer; + break; } }