feat: Replace the HashMap by a simple Vec in stream ops

This commit is contained in:
Clément Renault 2018-09-10 19:47:40 +02:00
parent 31a83eae4d
commit f6a40ed7e4
5 changed files with 28 additions and 33 deletions

View File

@ -5,8 +5,8 @@ use std::path::Path;
use std::ops::Deref;
use std::sync::Arc;
use std::mem;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::raw::MmapReadOnly;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use crate::DocIndex;
#[repr(C)]
@ -180,9 +180,6 @@ fn into_sliced_ranges<T>(vecs: Vec<Vec<T>>, number_docs: usize) -> (Vec<Range>,
let mut ranges = Vec::with_capacity(cap);
let mut values = Vec::with_capacity(number_docs);
// @Improvement: remove bounds duplications: the left bound of a range
// is already the right bound of the previous range,
// we could use a slice window of size 2.
for v in &vecs {
let len = v.len() as u64;
let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);

View File

@ -1,15 +1,13 @@
// pub mod difference;
// pub mod stream_ops;
mod ops_indexed_value;
pub mod ops;
pub mod doc_indexes;
use fst::{Map, MapBuilder};
use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
use std::error::Error;
use std::path::Path;
use std::io::Write;
use crate::DocIndex;
use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
pub struct Metadata {
map: Map,
@ -87,8 +85,6 @@ impl<W: Write, X: Write> MetadataBuilder<W, X> {
#[cfg(test)]
mod tests {
use super::*;
use crate::vec_read_only::VecReadOnly;
use crate::metadata::ops::IndexedDocIndexes;
#[test]
fn empty_serialize_deserialize() {

View File

@ -1,5 +1,4 @@
use std::hash::{Hash, Hasher};
use std::collections::{HashMap, BTreeMap};
use std::collections::BTreeMap;
use fst::{map, Streamer, Automaton};
use fst::automaton::AlwaysMatch;
use sdset::multi::OpBuilder as SdOpBuilder;
@ -9,7 +8,6 @@ use crate::metadata::ops_indexed_value::{
};
use crate::metadata::doc_indexes::DocIndexes;
use crate::metadata::Metadata;
use crate::automaton::AutomatonExt;
use crate::vec_read_only::VecReadOnly;
use crate::DocIndex;
@ -60,19 +58,19 @@ impl<'m, A: 'm + Automaton> OpBuilder<'m, A> {
}
pub fn union(self) -> Union<'m> {
Union::new(self.maps, self.indexes)
Union::new(self.maps, self.indexes, self.automatons.len())
}
pub fn intersection(self) -> Intersection<'m> {
Intersection::new(self.maps, self.indexes)
Intersection::new(self.maps, self.indexes, self.automatons.len())
}
pub fn difference(self) -> Difference<'m> {
Difference::new(self.maps, self.indexes)
Difference::new(self.maps, self.indexes, self.automatons.len())
}
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
SymmetricDifference::new(self.maps, self.indexes)
SymmetricDifference::new(self.maps, self.indexes, self.automatons.len())
}
}
@ -94,15 +92,16 @@ macro_rules! logical_operation {
pub struct $name<'m> {
maps: UnionIndexedValue<'m>,
indexes: Vec<&'m DocIndexes>,
number_automatons: usize,
outs: Vec<IndexedDocIndexes>,
}
impl<'m> $name<'m> {
fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>) -> Self
{
fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self {
$name {
maps: maps.union(),
indexes: indexes,
number_automatons: number_automatons,
outs: Vec::new(),
}
}
@ -116,17 +115,15 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
Some((input, ivalues)) => {
self.outs.clear();
// @Improvement: better to use a `Vec` instead,
// since the `aut indexes` are consecutive
let mut builders = HashMap::new();
let mut builders = vec![BTreeMap::new(); self.number_automatons];
for iv in ivalues {
let builder = builders.entry(iv.aut_index).or_insert_with(BTreeMap::new);
let builder = &mut builders[iv.aut_index];
builder.insert(iv.rdr_index, iv.value);
}
let mut doc_indexes = Vec::new();
let mut doc_indexes_slots = Vec::with_capacity(builders.len());
for (aut_index, values) in builders.into_iter() {
for (aut_index, values) in builders.into_iter().enumerate() {
let mut builder = SdOpBuilder::with_capacity(values.len());
for (rdr_index, value) in values {
let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes");
@ -137,8 +134,7 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
let start = doc_indexes.len();
builder.$operation().extend_vec(&mut doc_indexes);
let len = doc_indexes.len() - start;
if len == 0 { continue }
if len != 0 {
let slot = SlotIndexedDocIndexes {
index: aut_index,
start: start,
@ -146,6 +142,7 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
};
doc_indexes_slots.push(slot);
}
}
let read_only = VecReadOnly::new(doc_indexes);
self.outs.reserve(doc_indexes_slots.len());

View File

@ -38,6 +38,12 @@ pub struct UnionIndexedValue<'f> {
cur_slot: Option<SlotIndexedValue>,
}
impl<'f> UnionIndexedValue<'f> {
/// Returns the slot count reported by the underlying heap (`heap.num_slots()`).
///
/// NOTE(review): presumably this is the number of input streams feeding the
/// union — confirm against the heap implementation.
pub fn len(&self) -> usize {
self.heap.num_slots()
}
}
impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
type Item = (&'a [u8], &'a [IndexedValue]);
@ -54,7 +60,7 @@ impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
};
self.outs.clear();
self.outs.push(slot.indexed_value());
while let Some(mut slot2) = self.heap.pop_if_equal(slot.input()) {
while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
self.outs.push(slot2.indexed_value());
self.heap.refill(slot2);
}

View File

@ -14,7 +14,6 @@ use group_by::GroupByMut;
use crate::automaton::{DfaExt, AutomatonExt};
use crate::metadata::Metadata;
use crate::metadata::ops::{OpBuilder, Union};
use crate::metadata::doc_indexes::DocIndexes;
use crate::{Match, DocumentId};
use self::{