mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
feat: Replace the HashMap by a simple Vec in stream ops
This commit is contained in:
parent
31a83eae4d
commit
f6a40ed7e4
@ -5,8 +5,8 @@ use std::path::Path;
|
|||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
|
||||||
use fst::raw::MmapReadOnly;
|
use fst::raw::MmapReadOnly;
|
||||||
|
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||||
use crate::DocIndex;
|
use crate::DocIndex;
|
||||||
|
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
@ -180,9 +180,6 @@ fn into_sliced_ranges<T>(vecs: Vec<Vec<T>>, number_docs: usize) -> (Vec<Range>,
|
|||||||
let mut ranges = Vec::with_capacity(cap);
|
let mut ranges = Vec::with_capacity(cap);
|
||||||
let mut values = Vec::with_capacity(number_docs);
|
let mut values = Vec::with_capacity(number_docs);
|
||||||
|
|
||||||
// @Improvement: remove bounds duplications: the left bound of a range
|
|
||||||
// is already the right bound of the previous range,
|
|
||||||
// we could use a slice window of size 2.
|
|
||||||
for v in &vecs {
|
for v in &vecs {
|
||||||
let len = v.len() as u64;
|
let len = v.len() as u64;
|
||||||
let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);
|
let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);
|
||||||
|
@ -1,15 +1,13 @@
|
|||||||
// pub mod difference;
|
|
||||||
// pub mod stream_ops;
|
|
||||||
mod ops_indexed_value;
|
mod ops_indexed_value;
|
||||||
pub mod ops;
|
pub mod ops;
|
||||||
pub mod doc_indexes;
|
pub mod doc_indexes;
|
||||||
|
|
||||||
use fst::{Map, MapBuilder};
|
use fst::{Map, MapBuilder};
|
||||||
use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use crate::DocIndex;
|
use crate::DocIndex;
|
||||||
|
use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||||
|
|
||||||
pub struct Metadata {
|
pub struct Metadata {
|
||||||
map: Map,
|
map: Map,
|
||||||
@ -87,8 +85,6 @@ impl<W: Write, X: Write> MetadataBuilder<W, X> {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::vec_read_only::VecReadOnly;
|
|
||||||
use crate::metadata::ops::IndexedDocIndexes;
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn empty_serialize_deserialize() {
|
fn empty_serialize_deserialize() {
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
use std::hash::{Hash, Hasher};
|
use std::collections::BTreeMap;
|
||||||
use std::collections::{HashMap, BTreeMap};
|
|
||||||
use fst::{map, Streamer, Automaton};
|
use fst::{map, Streamer, Automaton};
|
||||||
use fst::automaton::AlwaysMatch;
|
use fst::automaton::AlwaysMatch;
|
||||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||||
@ -9,7 +8,6 @@ use crate::metadata::ops_indexed_value::{
|
|||||||
};
|
};
|
||||||
use crate::metadata::doc_indexes::DocIndexes;
|
use crate::metadata::doc_indexes::DocIndexes;
|
||||||
use crate::metadata::Metadata;
|
use crate::metadata::Metadata;
|
||||||
use crate::automaton::AutomatonExt;
|
|
||||||
use crate::vec_read_only::VecReadOnly;
|
use crate::vec_read_only::VecReadOnly;
|
||||||
use crate::DocIndex;
|
use crate::DocIndex;
|
||||||
|
|
||||||
@ -60,19 +58,19 @@ impl<'m, A: 'm + Automaton> OpBuilder<'m, A> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn union(self) -> Union<'m> {
|
pub fn union(self) -> Union<'m> {
|
||||||
Union::new(self.maps, self.indexes)
|
Union::new(self.maps, self.indexes, self.automatons.len())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn intersection(self) -> Intersection<'m> {
|
pub fn intersection(self) -> Intersection<'m> {
|
||||||
Intersection::new(self.maps, self.indexes)
|
Intersection::new(self.maps, self.indexes, self.automatons.len())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn difference(self) -> Difference<'m> {
|
pub fn difference(self) -> Difference<'m> {
|
||||||
Difference::new(self.maps, self.indexes)
|
Difference::new(self.maps, self.indexes, self.automatons.len())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
|
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
|
||||||
SymmetricDifference::new(self.maps, self.indexes)
|
SymmetricDifference::new(self.maps, self.indexes, self.automatons.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -94,15 +92,16 @@ macro_rules! logical_operation {
|
|||||||
pub struct $name<'m> {
|
pub struct $name<'m> {
|
||||||
maps: UnionIndexedValue<'m>,
|
maps: UnionIndexedValue<'m>,
|
||||||
indexes: Vec<&'m DocIndexes>,
|
indexes: Vec<&'m DocIndexes>,
|
||||||
|
number_automatons: usize,
|
||||||
outs: Vec<IndexedDocIndexes>,
|
outs: Vec<IndexedDocIndexes>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'m> $name<'m> {
|
impl<'m> $name<'m> {
|
||||||
fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>) -> Self
|
fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self {
|
||||||
{
|
|
||||||
$name {
|
$name {
|
||||||
maps: maps.union(),
|
maps: maps.union(),
|
||||||
indexes: indexes,
|
indexes: indexes,
|
||||||
|
number_automatons: number_automatons,
|
||||||
outs: Vec::new(),
|
outs: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -116,17 +115,15 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
|
|||||||
Some((input, ivalues)) => {
|
Some((input, ivalues)) => {
|
||||||
self.outs.clear();
|
self.outs.clear();
|
||||||
|
|
||||||
// @Improvement: better use a `Vec` instead,
|
let mut builders = vec![BTreeMap::new(); self.number_automatons];
|
||||||
// `aut indexes` follow each other
|
|
||||||
let mut builders = HashMap::new();
|
|
||||||
for iv in ivalues {
|
for iv in ivalues {
|
||||||
let builder = builders.entry(iv.aut_index).or_insert_with(BTreeMap::new);
|
let builder = &mut builders[iv.aut_index];
|
||||||
builder.insert(iv.rdr_index, iv.value);
|
builder.insert(iv.rdr_index, iv.value);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut doc_indexes = Vec::new();
|
let mut doc_indexes = Vec::new();
|
||||||
let mut doc_indexes_slots = Vec::with_capacity(builders.len());
|
let mut doc_indexes_slots = Vec::with_capacity(builders.len());
|
||||||
for (aut_index, values) in builders.into_iter() {
|
for (aut_index, values) in builders.into_iter().enumerate() {
|
||||||
let mut builder = SdOpBuilder::with_capacity(values.len());
|
let mut builder = SdOpBuilder::with_capacity(values.len());
|
||||||
for (rdr_index, value) in values {
|
for (rdr_index, value) in values {
|
||||||
let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes");
|
let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes");
|
||||||
@ -137,8 +134,7 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
|
|||||||
let start = doc_indexes.len();
|
let start = doc_indexes.len();
|
||||||
builder.$operation().extend_vec(&mut doc_indexes);
|
builder.$operation().extend_vec(&mut doc_indexes);
|
||||||
let len = doc_indexes.len() - start;
|
let len = doc_indexes.len() - start;
|
||||||
if len == 0 { continue }
|
if len != 0 {
|
||||||
|
|
||||||
let slot = SlotIndexedDocIndexes {
|
let slot = SlotIndexedDocIndexes {
|
||||||
index: aut_index,
|
index: aut_index,
|
||||||
start: start,
|
start: start,
|
||||||
@ -146,6 +142,7 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
|
|||||||
};
|
};
|
||||||
doc_indexes_slots.push(slot);
|
doc_indexes_slots.push(slot);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let read_only = VecReadOnly::new(doc_indexes);
|
let read_only = VecReadOnly::new(doc_indexes);
|
||||||
self.outs.reserve(doc_indexes_slots.len());
|
self.outs.reserve(doc_indexes_slots.len());
|
||||||
|
@ -38,6 +38,12 @@ pub struct UnionIndexedValue<'f> {
|
|||||||
cur_slot: Option<SlotIndexedValue>,
|
cur_slot: Option<SlotIndexedValue>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'f> UnionIndexedValue<'f> {
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.heap.num_slots()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
|
impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
|
||||||
type Item = (&'a [u8], &'a [IndexedValue]);
|
type Item = (&'a [u8], &'a [IndexedValue]);
|
||||||
|
|
||||||
@ -54,7 +60,7 @@ impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
|
|||||||
};
|
};
|
||||||
self.outs.clear();
|
self.outs.clear();
|
||||||
self.outs.push(slot.indexed_value());
|
self.outs.push(slot.indexed_value());
|
||||||
while let Some(mut slot2) = self.heap.pop_if_equal(slot.input()) {
|
while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
|
||||||
self.outs.push(slot2.indexed_value());
|
self.outs.push(slot2.indexed_value());
|
||||||
self.heap.refill(slot2);
|
self.heap.refill(slot2);
|
||||||
}
|
}
|
||||||
|
@ -14,7 +14,6 @@ use group_by::GroupByMut;
|
|||||||
use crate::automaton::{DfaExt, AutomatonExt};
|
use crate::automaton::{DfaExt, AutomatonExt};
|
||||||
use crate::metadata::Metadata;
|
use crate::metadata::Metadata;
|
||||||
use crate::metadata::ops::{OpBuilder, Union};
|
use crate::metadata::ops::{OpBuilder, Union};
|
||||||
use crate::metadata::doc_indexes::DocIndexes;
|
|
||||||
use crate::{Match, DocumentId};
|
use crate::{Match, DocumentId};
|
||||||
|
|
||||||
use self::{
|
use self::{
|
||||||
|
Loading…
Reference in New Issue
Block a user