From f6a40ed7e4afb395c28911cb48607c3b7cb1069b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 10 Sep 2018 19:47:40 +0200 Subject: [PATCH] feat: Replace the HashMap by a simple Vec in stream ops --- raptor/src/metadata/doc_indexes.rs | 5 +-- raptor/src/metadata/mod.rs | 6 +--- raptor/src/metadata/ops.rs | 41 +++++++++++------------- raptor/src/metadata/ops_indexed_value.rs | 8 ++++- raptor/src/rank/mod.rs | 1 - 5 files changed, 28 insertions(+), 33 deletions(-) diff --git a/raptor/src/metadata/doc_indexes.rs b/raptor/src/metadata/doc_indexes.rs index 8cc81ae88..5aef15baa 100644 --- a/raptor/src/metadata/doc_indexes.rs +++ b/raptor/src/metadata/doc_indexes.rs @@ -5,8 +5,8 @@ use std::path::Path; use std::ops::Deref; use std::sync::Arc; use std::mem; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use fst::raw::MmapReadOnly; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use crate::DocIndex; #[repr(C)] @@ -180,9 +180,6 @@ fn into_sliced_ranges(vecs: Vec>, number_docs: usize) -> (Vec, let mut ranges = Vec::with_capacity(cap); let mut values = Vec::with_capacity(number_docs); - // @Improvement: remove bounds duplications: the left bound of a range - // is already the right bound of the previous range, - // we could use a slice window of size 2. for v in &vecs { let len = v.len() as u64; let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0); diff --git a/raptor/src/metadata/mod.rs b/raptor/src/metadata/mod.rs index eba764b3d..7c63d2d53 100644 --- a/raptor/src/metadata/mod.rs +++ b/raptor/src/metadata/mod.rs @@ -1,15 +1,13 @@ -// pub mod difference; -// pub mod stream_ops; mod ops_indexed_value; pub mod ops; pub mod doc_indexes; use fst::{Map, MapBuilder}; -use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; use std::error::Error; use std::path::Path; use std::io::Write; use crate::DocIndex; +use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; pub struct Metadata { map: Map, @@ -87,8 +85,6 @@ impl MetadataBuilder { #[cfg(test)] mod tests { use super::*; - use crate::vec_read_only::VecReadOnly; - use crate::metadata::ops::IndexedDocIndexes; #[test] fn empty_serialize_deserialize() { diff --git a/raptor/src/metadata/ops.rs b/raptor/src/metadata/ops.rs index 448279e94..666a533b3 100644 --- a/raptor/src/metadata/ops.rs +++ b/raptor/src/metadata/ops.rs @@ -1,5 +1,4 @@ -use std::hash::{Hash, Hasher}; -use std::collections::{HashMap, BTreeMap}; +use std::collections::BTreeMap; use fst::{map, Streamer, Automaton}; use fst::automaton::AlwaysMatch; use sdset::multi::OpBuilder as SdOpBuilder; @@ -9,7 +8,6 @@ use crate::metadata::ops_indexed_value::{ }; use crate::metadata::doc_indexes::DocIndexes; use crate::metadata::Metadata; -use crate::automaton::AutomatonExt; use crate::vec_read_only::VecReadOnly; use crate::DocIndex; @@ -60,19 +58,19 @@ impl<'m, A: 'm + Automaton> OpBuilder<'m, A> { } pub fn union(self) -> Union<'m> { - Union::new(self.maps, self.indexes) + Union::new(self.maps, self.indexes, self.automatons.len()) } pub fn intersection(self) -> Intersection<'m> { - Intersection::new(self.maps, self.indexes) + Intersection::new(self.maps, self.indexes, self.automatons.len()) } pub fn difference(self) -> Difference<'m> { - Difference::new(self.maps, self.indexes) + Difference::new(self.maps, self.indexes, self.automatons.len()) } pub fn symmetric_difference(self) -> SymmetricDifference<'m> { - SymmetricDifference::new(self.maps, self.indexes) + SymmetricDifference::new(self.maps, self.indexes, self.automatons.len()) } } @@ -94,15 +92,16 @@ macro_rules! logical_operation { pub struct $name<'m> { maps: UnionIndexedValue<'m>, indexes: Vec<&'m DocIndexes>, + number_automatons: usize, outs: Vec, } impl<'m> $name<'m> { - fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>) -> Self - { + fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self { $name { maps: maps.union(), indexes: indexes, + number_automatons: number_automatons, outs: Vec::new(), } } @@ -116,17 +115,15 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> { Some((input, ivalues)) => { self.outs.clear(); - // @Improvement: better use a `Vec` instead, - // `aut indexes` follow them selfs - let mut builders = HashMap::new(); + let mut builders = vec![BTreeMap::new(); self.number_automatons]; for iv in ivalues { - let builder = builders.entry(iv.aut_index).or_insert_with(BTreeMap::new); + let builder = &mut builders[iv.aut_index]; builder.insert(iv.rdr_index, iv.value); } let mut doc_indexes = Vec::new(); let mut doc_indexes_slots = Vec::with_capacity(builders.len()); - for (aut_index, values) in builders.into_iter() { + for (aut_index, values) in builders.into_iter().enumerate() { let mut builder = SdOpBuilder::with_capacity(values.len()); for (rdr_index, value) in values { let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes"); @@ -137,14 +134,14 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> { let start = doc_indexes.len(); builder.$operation().extend_vec(&mut doc_indexes); let len = doc_indexes.len() - start; - if len == 0 { continue } - - let slot = SlotIndexedDocIndexes { - index: aut_index, - start: start, - len: len, - }; - doc_indexes_slots.push(slot); + if len != 0 { + let slot = SlotIndexedDocIndexes { + index: aut_index, + start: start, + len: len, + }; + doc_indexes_slots.push(slot); + } } let read_only = VecReadOnly::new(doc_indexes); diff --git a/raptor/src/metadata/ops_indexed_value.rs b/raptor/src/metadata/ops_indexed_value.rs index 558b57447..2c557f61c 100644 --- a/raptor/src/metadata/ops_indexed_value.rs +++ b/raptor/src/metadata/ops_indexed_value.rs @@ -38,6 +38,12 @@ pub struct UnionIndexedValue<'f> { cur_slot: Option, } +impl<'f> UnionIndexedValue<'f> { + pub fn len(&self) -> usize { + self.heap.num_slots() + } +} + impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> { type Item = (&'a [u8], &'a [IndexedValue]); @@ -54,7 +60,7 @@ impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> { }; self.outs.clear(); self.outs.push(slot.indexed_value()); - while let Some(mut slot2) = self.heap.pop_if_equal(slot.input()) { + while let Some(slot2) = self.heap.pop_if_equal(slot.input()) { self.outs.push(slot2.indexed_value()); self.heap.refill(slot2); } diff --git a/raptor/src/rank/mod.rs b/raptor/src/rank/mod.rs index 8b3ef1302..e7a50003d 100644 --- a/raptor/src/rank/mod.rs +++ b/raptor/src/rank/mod.rs @@ -14,7 +14,6 @@ use group_by::GroupByMut; use crate::automaton::{DfaExt, AutomatonExt}; use crate::metadata::Metadata; use crate::metadata::ops::{OpBuilder, Union}; -use crate::metadata::doc_indexes::DocIndexes; use crate::{Match, DocumentId}; use self::{