From cc52d5dda54ed7feef5af0c27b13eaf0fea3d755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 8 Nov 2018 12:05:59 +0100 Subject: [PATCH] feat: Working on ops for Positive and Negative blobs --- Cargo.toml | 1 + src/blob/merge.rs | 41 ++-- src/blob/mod.rs | 34 +-- src/blob/negative_blob.rs | 73 +++---- src/blob/ops.rs | 37 ++-- src/blob/positive_blob.rs | 2 +- src/data/doc_ids.rs | 72 +++++++ src/{ => data}/doc_indexes.rs | 38 +--- src/data/mod.rs | 33 +++ src/index.rs | 24 ++- src/lib.rs | 6 +- src/metadata/difference.rs | 126 ------------ src/metadata/doc_indexes.rs | 200 ------------------ src/metadata/mod.rs | 136 ------------ src/metadata/ops.rs | 329 ------------------------------ src/metadata/ops_indexed_value.rs | 203 ------------------ src/metadata/stream_ops.rs | 309 ---------------------------- src/pentium.rs | 28 --- 18 files changed, 213 insertions(+), 1479 deletions(-) create mode 100644 src/data/doc_ids.rs rename src/{ => data}/doc_indexes.rs (85%) create mode 100644 src/data/mod.rs delete mode 100644 src/metadata/difference.rs delete mode 100644 src/metadata/doc_indexes.rs delete mode 100644 src/metadata/mod.rs delete mode 100644 src/metadata/ops.rs delete mode 100644 src/metadata/ops_indexed_value.rs delete mode 100644 src/metadata/stream_ops.rs delete mode 100644 src/pentium.rs diff --git a/Cargo.toml b/Cargo.toml index 79e5e3a14..40be43633 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ authors = ["Kerollmops "] byteorder = "1.2" lazy_static = "1.1" sdset = "0.2" +fs2 = "0.4" fnv = "1.0" [dependencies.fst] diff --git a/src/blob/merge.rs b/src/blob/merge.rs index 5a9c88d6f..e21398587 100644 --- a/src/blob/merge.rs +++ b/src/blob/merge.rs @@ -288,18 +288,13 @@ mod tests { #[test] fn single_negative_blob() { - let doc1 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 0 }; - let doc2 = DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 }; - let doc3 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 1 }; - let doc4 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 2 }; - let a = { - let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + let mut builder = NegativeBlobBuilder::new(Vec::new()); - builder.insert("hell", doc1); - builder.insert("hell", doc2); - builder.insert("hello", doc3); - builder.insert("wor", doc4); + builder.insert(1); + builder.insert(2); + builder.insert(3); + builder.insert(4); Blob::Negative(builder.build().unwrap()) }; @@ -371,10 +366,10 @@ mod tests { }; let b = { - let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + let mut builder = NegativeBlobBuilder::new(Vec::new()); - builder.insert("hell", doc2); - builder.insert("hello", doc3); + builder.insert(2); + builder.insert(3); Blob::Negative(builder.build().unwrap()) }; @@ -410,10 +405,10 @@ mod tests { }; let b = { - let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + let mut builder = NegativeBlobBuilder::new(Vec::new()); - builder.insert("hell", doc1); - builder.insert("wor", doc4); + builder.insert(1); + builder.insert(4); Blob::Negative(builder.build().unwrap()) }; @@ -428,9 +423,9 @@ mod tests { }; let d = { - let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + let mut builder = NegativeBlobBuilder::new(Vec::new()); - builder.insert("hell", doc1); + builder.insert(1); Blob::Negative(builder.build().unwrap()) }; @@ -478,18 +473,18 @@ mod tests { }; let c = { - let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + let mut builder = 
NegativeBlobBuilder::new(Vec::new()); - builder.insert("hell", doc1); - builder.insert("wor", doc4); + builder.insert(1); + builder.insert(4); Blob::Negative(builder.build().unwrap()) }; let d = { - let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + let mut builder = NegativeBlobBuilder::new(Vec::new()); - builder.insert("hell", doc1); + builder.insert(1); Blob::Negative(builder.build().unwrap()) }; diff --git a/src/blob/mod.rs b/src/blob/mod.rs index 00a36281f..0139f48d1 100644 --- a/src/blob/mod.rs +++ b/src/blob/mod.rs @@ -10,13 +10,22 @@ pub use self::negative_blob::{NegativeBlob, NegativeBlobBuilder}; use fst::Map; -use crate::doc_indexes::DocIndexes; +use crate::data::DocIndexes; pub enum Blob { Positive(PositiveBlob), Negative(NegativeBlob), } +impl Blob { + pub fn sign(&self) -> Sign { + match self { + Blob::Positive(_) => Sign::Positive, + Blob::Negative(_) => Sign::Negative, + } + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Sign { Positive, @@ -31,26 +40,3 @@ impl Sign { } } } - -impl Blob { - pub fn sign(&self) -> Sign { - match self { - Blob::Positive(_) => Sign::Positive, - Blob::Negative(_) => Sign::Negative, - } - } - - pub fn as_map(&self) -> &Map { - match self { - Blob::Positive(blob) => blob.as_map(), - Blob::Negative(blob) => blob.as_map(), - } - } - - pub fn as_indexes(&self) -> &DocIndexes { - match self { - Blob::Positive(blob) => blob.as_indexes(), - Blob::Negative(blob) => blob.as_indexes(), - } - } -} diff --git a/src/blob/negative_blob.rs b/src/blob/negative_blob.rs index 2ae411984..ca8679c81 100644 --- a/src/blob/negative_blob.rs +++ b/src/blob/negative_blob.rs @@ -2,86 +2,61 @@ use std::error::Error; use std::path::Path; use std::io::Write; -use fst::{Map, MapBuilder}; - -use crate::DocIndex; -use crate::doc_indexes::{DocIndexes, DocIndexesBuilder}; +use crate::DocumentId; +use crate::data::{DocIds, DocIdsBuilder}; pub struct NegativeBlob { - map: Map, - indexes: DocIndexes, + doc_ids: DocIds, } impl NegativeBlob { - pub unsafe fn from_paths(map: P, indexes: Q) -> Result> + pub unsafe fn from_path

(doc_ids: P) -> Result> where P: AsRef, - Q: AsRef, { - let map = Map::from_path(map)?; - let indexes = DocIndexes::from_path(indexes)?; - Ok(NegativeBlob { map, indexes }) + let doc_ids = DocIds::from_path(doc_ids)?; + Ok(NegativeBlob { doc_ids }) } - pub fn from_bytes(map: Vec, indexes: Vec) -> Result> { - let map = Map::from_bytes(map)?; - let indexes = DocIndexes::from_bytes(indexes)?; - Ok(NegativeBlob { map, indexes }) + pub fn from_bytes(doc_ids: Vec) -> Result> { + let doc_ids = DocIds::from_bytes(doc_ids)?; + Ok(NegativeBlob { doc_ids }) } - pub fn get>(&self, key: K) -> Option<&[DocIndex]> { - self.map.get(key).and_then(|index| self.indexes.get(index)) + pub fn as_ids(&self) -> &DocIds { + &self.doc_ids } - pub fn as_map(&self) -> &Map { - &self.map - } - - pub fn as_indexes(&self) -> &DocIndexes { - &self.indexes - } - - pub fn explode(self) -> (Map, DocIndexes) { - (self.map, self.indexes) + pub fn into_doc_ids(self) -> DocIds { + self.doc_ids } } -pub struct NegativeBlobBuilder { - map: W, - indexes: DocIndexesBuilder, +pub struct NegativeBlobBuilder { + doc_ids: DocIdsBuilder, } -impl NegativeBlobBuilder { - pub fn new(map: W, indexes: X) -> Self { - Self { map, indexes: DocIndexesBuilder::new(indexes) } +impl NegativeBlobBuilder { + pub fn new(wrt: W) -> Self { + Self { doc_ids: DocIdsBuilder::new(wrt) } } - pub fn insert>(&mut self, key: S, index: DocIndex) { - self.indexes.insert(key.into(), index) + pub fn insert(&mut self, doc: DocumentId) { + self.doc_ids.insert(doc) } pub fn finish(self) -> Result<(), Box> { self.into_inner().map(|_| ()) } - pub fn into_inner(self) -> Result<(W, X), Box> { + pub fn into_inner(self) -> Result> { // FIXME insert a magic number that indicates if the endianess // of the input is the same as the machine that is reading it. - - let map = { - let mut keys_builder = MapBuilder::new(self.map)?; - let keys = self.indexes.keys().map(|(s, v)| (s, *v)); - keys_builder.extend_iter(keys)?; - keys_builder.into_inner()? - }; - - let indexes = self.indexes.into_inner()?; - - Ok((map, indexes)) + Ok(self.doc_ids.into_inner()?) 
} } -impl NegativeBlobBuilder, Vec> { +impl NegativeBlobBuilder> { pub fn build(self) -> Result> { - self.into_inner().and_then(|(m, i)| NegativeBlob::from_bytes(m, i)) + self.into_inner().and_then(|ids| NegativeBlob::from_bytes(ids)) } } diff --git a/src/blob/ops.rs b/src/blob/ops.rs index dbd143076..f4d4fa1da 100644 --- a/src/blob/ops.rs +++ b/src/blob/ops.rs @@ -9,7 +9,7 @@ use crate::blob::ops_indexed_value::{ OpIndexedValueBuilder, UnionIndexedValue, }; use crate::blob::Blob; -use crate::doc_indexes::DocIndexes; +use crate::data::DocIndexes; use crate::vec_read_only::VecReadOnly; use crate::DocIndex; @@ -40,23 +40,34 @@ impl<'m, A: 'm + Automaton> OpBuilder<'m, A> { } } - pub fn add(mut self, blob: &'m Blob) -> Self where A: Clone { + pub fn add(mut self, blob: &'m Blob) -> Self + where A: Clone + { self.push(blob); self } - pub fn push(&mut self, blob: &'m Blob) where A: Clone { - let mut op = map::OpBuilder::new(); - for automaton in self.automatons.iter().cloned() { - let stream = blob.as_map().search(automaton); - op.push(stream); + pub fn push(&mut self, blob: &'m Blob) + where A: Clone + { + match blob { + Blob::Positive(blob) => { + let mut op = map::OpBuilder::new(); + for automaton in self.automatons.iter().cloned() { + let stream = blob.as_map().search(automaton); + op.push(stream); + } + + let stream = op.union(); + let indexes = blob.as_indexes(); + + self.maps.push(stream); + self.indexes.push(indexes); + }, + Blob::Negative(blob) => { + unimplemented!() + }, } - - let stream = op.union(); - let indexes = blob.as_indexes(); - - self.maps.push(stream); - self.indexes.push(indexes); } pub fn union(self) -> Union<'m> { diff --git a/src/blob/positive_blob.rs b/src/blob/positive_blob.rs index 0d0b74c59..ac1e85d46 100644 --- a/src/blob/positive_blob.rs +++ b/src/blob/positive_blob.rs @@ -5,7 +5,7 @@ use std::io::Write; use fst::{Map, MapBuilder}; use crate::DocIndex; -use crate::doc_indexes::{DocIndexes, DocIndexesBuilder}; +use crate::data::{DocIndexes, DocIndexesBuilder}; pub struct PositiveBlob { map: Map, diff --git a/src/data/doc_ids.rs b/src/data/doc_ids.rs new file mode 100644 index 000000000..c2c3738a4 --- /dev/null +++ b/src/data/doc_ids.rs @@ -0,0 +1,72 @@ +use std::collections::BTreeSet; +use std::slice::from_raw_parts; +use std::error::Error; +use std::path::Path; +use std::sync::Arc; +use std::{io, mem}; + +use byteorder::{NativeEndian, WriteBytesExt}; +use fst::raw::MmapReadOnly; + +use crate::DocumentId; +use crate::data::Data; + +#[derive(Clone)] +pub struct DocIds { + doc_ids: Data, +} + +impl DocIds { + pub unsafe fn from_path>(path: P) -> io::Result { + let mmap = MmapReadOnly::open_path(path)?; + let doc_ids = Data::Mmap(mmap); + Ok(DocIds { doc_ids }) + } + + pub fn from_bytes(vec: Vec) -> io::Result { + let len = vec.len(); + let doc_ids = Data::Shared { + vec: Arc::new(vec), + offset: 0, + len: len + }; + Ok(DocIds { doc_ids }) + } + + pub fn contains(&self, doc: DocumentId) -> bool { + // FIXME prefer using the sdset::exponential_search function + self.doc_ids().binary_search(&doc).is_ok() + } + + pub fn doc_ids(&self) -> &[DocumentId] { + let slice = &self.doc_ids; + let ptr = slice.as_ptr() as *const DocumentId; + let len = slice.len() / mem::size_of::(); + unsafe { from_raw_parts(ptr, len) } + } +} + +pub struct DocIdsBuilder { + doc_ids: BTreeSet, + wrt: W, +} + +impl DocIdsBuilder { + pub fn new(wrt: W) -> Self { + Self { + doc_ids: BTreeSet::new(), + wrt: wrt, + } + } + + pub fn insert(&mut self, doc: DocumentId) { + self.doc_ids.insert(doc); + } + 
+ pub fn into_inner(mut self) -> io::Result { + for id in self.doc_ids { + self.wrt.write_u64::(id)?; + } + Ok(self.wrt) + } +} diff --git a/src/doc_indexes.rs b/src/data/doc_indexes.rs similarity index 85% rename from src/doc_indexes.rs rename to src/data/doc_indexes.rs index 5aef15baa..bb0cea837 100644 --- a/src/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -5,9 +5,12 @@ use std::path::Path; use std::ops::Deref; use std::sync::Arc; use std::mem; + use fst::raw::MmapReadOnly; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; + use crate::DocIndex; +use crate::data::Data; #[repr(C)] struct Range { @@ -15,33 +18,10 @@ struct Range { end: u64, } -#[derive(Clone)] -enum DocIndexesData { - Shared { - vec: Arc>, - offset: usize, - len: usize, - }, - Mmap(MmapReadOnly), -} - -impl Deref for DocIndexesData { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - match self { - DocIndexesData::Shared { vec, offset, len } => { - &vec[*offset..offset + len] - }, - DocIndexesData::Mmap(m) => m.as_slice(), - } - } -} - #[derive(Clone)] pub struct DocIndexes { - ranges: DocIndexesData, - indexes: DocIndexesData, + ranges: Data, + indexes: Data, } impl DocIndexes { @@ -52,11 +32,11 @@ impl DocIndexes { let range_len = range_len as usize * mem::size_of::(); let offset = mem::size_of::() as usize; - let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len)); + let ranges = Data::Mmap(mmap.range(offset, range_len)); let len = mmap.len() - range_len - offset; let offset = offset + range_len; - let indexes = DocIndexesData::Mmap(mmap.range(offset, len)); + let indexes = Data::Mmap(mmap.range(offset, len)); Ok(DocIndexes { ranges, indexes }) } @@ -68,7 +48,7 @@ impl DocIndexes { let range_len = range_len as usize * mem::size_of::(); let offset = mem::size_of::() as usize; - let ranges = DocIndexesData::Shared { + let ranges = Data::Shared { vec: vec.clone(), offset, len: range_len @@ -76,7 +56,7 @@ impl DocIndexes { let len = vec.len() - range_len - offset; let offset = offset + range_len; - let indexes = DocIndexesData::Shared { vec, offset, len }; + let indexes = Data::Shared { vec, offset, len }; Ok(DocIndexes { ranges, indexes }) } diff --git a/src/data/mod.rs b/src/data/mod.rs new file mode 100644 index 000000000..0f16621fb --- /dev/null +++ b/src/data/mod.rs @@ -0,0 +1,33 @@ +mod doc_ids; +mod doc_indexes; + +use std::ops::Deref; +use std::sync::Arc; + +use fst::raw::MmapReadOnly; + +pub use self::doc_ids::{DocIds, DocIdsBuilder}; +pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; + +#[derive(Clone)] +enum Data { + Shared { + vec: Arc>, + offset: usize, + len: usize, + }, + Mmap(MmapReadOnly), +} + +impl Deref for Data { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + match self { + Data::Shared { vec, offset, len } => { + &vec[*offset..offset + len] + }, + Data::Mmap(m) => m.as_slice(), + } + } +} diff --git a/src/index.rs b/src/index.rs index e3431e5fe..41e0ada03 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,21 +1,37 @@ use std::path::{Path, PathBuf}; use std::error::Error; +use std::fs::{self, File}; + +use fs2::FileExt; use crate::rank::Document; use crate::blob::Blob; pub struct Index { path: PathBuf, + lock_file: File, blobs: Vec, } impl Index { - pub fn open(path: &Path) -> Result> { - unimplemented!() + pub fn open>(path: P) -> Result> { + let path = path.into(); + + let lock_file = File::create(path.join(".lock"))?; + lock_file.try_lock_exclusive()?; + + let blobs = Vec::new(); + + Ok(Self { path, lock_file, blobs }) } - pub fn 
create(path: &Path) -> Result> { - unimplemented!() + pub fn create>(path: P) -> Result> { + let path = path.into(); + + fs::create_dir_all(&path)?; + File::create(path.join(".lock"))?; + + Self::open(path) } pub fn blobs(&self) -> &[Blob] { diff --git a/src/lib.rs b/src/lib.rs index 26dc886f8..fe1e19dfa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,18 +3,14 @@ #[macro_use] extern crate lazy_static; pub mod index; -pub mod pentium; pub mod blob; -pub mod doc_indexes; - +pub mod data; pub mod rank; -pub mod metadata; pub mod vec_read_only; pub mod automaton; pub mod tokenizer; mod common_words; -pub use self::metadata::{Metadata, MetadataBuilder}; pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; diff --git a/src/metadata/difference.rs b/src/metadata/difference.rs deleted file mode 100644 index 6e71d57d1..000000000 --- a/src/metadata/difference.rs +++ /dev/null @@ -1,126 +0,0 @@ -use fst::{Streamer, Automaton}; -use crate::metadata::ops::{self, IndexedDocIndexes}; -use crate::metadata::{stream_ops, Metadata}; - -fn union_with_automatons<'a, A>(metas: &'a [Metadata], autos: Vec) -> ops::Union -where A: 'a + Automaton + Clone, -{ - let mut op = ops::OpBuilder::with_automatons(autos); - for metadata in metas { - op.push(metadata); - } - op.union() -} - -pub struct Difference<'f> { - inner: stream_ops::Difference<'f>, -} - -impl<'f> Difference<'f> { - pub fn new(positives: &'f [Metadata], negatives: &'f [Metadata], automatons: Vec) -> Self - where A: 'f + Automaton + Clone - { - let positives = union_with_automatons(positives, automatons.clone()); - let negatives = union_with_automatons(negatives, automatons); - - let mut builder = stream_ops::OpBuilder::new(); - builder.push(positives); - builder.push(negatives); - - Difference { inner: builder.difference() } - } -} - -impl<'a, 'f> Streamer<'a> for Difference<'f> { - type Item = (&'a [u8], &'a [IndexedDocIndexes]); - - fn next(&'a mut self) -> Option { - self.inner.next() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use fst::automaton::AlwaysMatch; - use crate::metadata::{Metadata, MetadataBuilder}; - use crate::vec_read_only::VecReadOnly; - use crate::DocIndex; - - fn construct_metadata(documents: Vec<(String, DocIndex)>) -> Metadata { - let mapw = Vec::new(); - let indexesw = Vec::new(); - - let mut builder = MetadataBuilder::new(mapw, indexesw); - - for (string, doc_index) in documents { - builder.insert(string, doc_index); - } - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - } - - #[test] - fn empty() { - let positive_metas = construct_metadata(vec![ - ("chameau".into(), DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }), - ("chameau".into(), DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }), - ]); - - let negative_metas = construct_metadata(vec![ - ("chameau".into(), DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }), - ("chameau".into(), DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }), - ]); - - let positives = &[positive_metas]; - let negatives = &[negative_metas]; - let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]); - - assert_eq!(diff.next(), None); - } - - #[test] - fn one_positive() { - let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }; - let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }; - - let positive_metas = construct_metadata(vec![ - ("chameau".into(), di1), - ("chameau".into(), di2), - ]); - - let negative_metas = 
construct_metadata(vec![ - ("chameau".into(), di1), - ]); - - let positives = &[positive_metas]; - let negatives = &[negative_metas]; - let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]); - - let idi = IndexedDocIndexes{ index: 0, doc_indexes: VecReadOnly::new(vec![di2]) }; - assert_eq!(diff.next(), Some(("chameau".as_bytes(), &[idi][..]))); - assert_eq!(diff.next(), None); - } - - #[test] - fn more_negative_than_positive() { - let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }; - let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }; - - let positive_metas = construct_metadata(vec![ - ("chameau".into(), di1), - ]); - - let negative_metas = construct_metadata(vec![ - ("chameau".into(), di1), - ("chameau".into(), di2), - ]); - - let positives = &[positive_metas]; - let negatives = &[negative_metas]; - let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]); - - assert_eq!(diff.next(), None); - } -} diff --git a/src/metadata/doc_indexes.rs b/src/metadata/doc_indexes.rs deleted file mode 100644 index 5aef15baa..000000000 --- a/src/metadata/doc_indexes.rs +++ /dev/null @@ -1,200 +0,0 @@ -use std::collections::btree_map::{BTreeMap, Iter, Entry}; -use std::slice::from_raw_parts; -use std::io::{self, Write}; -use std::path::Path; -use std::ops::Deref; -use std::sync::Arc; -use std::mem; -use fst::raw::MmapReadOnly; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use crate::DocIndex; - -#[repr(C)] -struct Range { - start: u64, - end: u64, -} - -#[derive(Clone)] -enum DocIndexesData { - Shared { - vec: Arc>, - offset: usize, - len: usize, - }, - Mmap(MmapReadOnly), -} - -impl Deref for DocIndexesData { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - match self { - DocIndexesData::Shared { vec, offset, len } => { - &vec[*offset..offset + len] - }, - DocIndexesData::Mmap(m) => m.as_slice(), - } - } -} - -#[derive(Clone)] -pub struct DocIndexes { - ranges: DocIndexesData, - indexes: DocIndexesData, -} - -impl DocIndexes { - pub unsafe fn from_path>(path: P) -> io::Result { - let mmap = MmapReadOnly::open_path(path)?; - - let range_len = mmap.as_slice().read_u64::()?; - let range_len = range_len as usize * mem::size_of::(); - - let offset = mem::size_of::() as usize; - let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len)); - - let len = mmap.len() - range_len - offset; - let offset = offset + range_len; - let indexes = DocIndexesData::Mmap(mmap.range(offset, len)); - - Ok(DocIndexes { ranges, indexes }) - } - - pub fn from_bytes(vec: Vec) -> io::Result { - let vec = Arc::new(vec); - - let range_len = vec.as_slice().read_u64::()?; - let range_len = range_len as usize * mem::size_of::(); - - let offset = mem::size_of::() as usize; - let ranges = DocIndexesData::Shared { - vec: vec.clone(), - offset, - len: range_len - }; - - let len = vec.len() - range_len - offset; - let offset = offset + range_len; - let indexes = DocIndexesData::Shared { vec, offset, len }; - - Ok(DocIndexes { ranges, indexes }) - } - - pub fn get(&self, index: u64) -> Option<&[DocIndex]> { - self.ranges().get(index as usize).map(|Range { start, end }| { - let start = *start as usize; - let end = *end as usize; - &self.indexes()[start..end] - }) - } - - fn ranges(&self) -> &[Range] { - let slice = &self.ranges; - let ptr = slice.as_ptr() as *const Range; - let len = slice.len() / mem::size_of::(); - unsafe { from_raw_parts(ptr, len) } - } - - fn indexes(&self) -> &[DocIndex] { - let slice = &self.indexes; - let ptr = 
slice.as_ptr() as *const DocIndex; - let len = slice.len() / mem::size_of::(); - unsafe { from_raw_parts(ptr, len) } - } -} - -pub struct DocIndexesBuilder { - keys: BTreeMap, - indexes: Vec>, - number_docs: usize, - wtr: W, -} - -impl DocIndexesBuilder { - pub fn new(wtr: W) -> Self { - Self { - keys: BTreeMap::new(), - indexes: Vec::new(), - number_docs: 0, - wtr: wtr, - } - } - - pub fn number_doc_indexes(&self) -> usize { - self.number_docs - } - - pub fn insert(&mut self, key: String, value: DocIndex) { - match self.keys.entry(key) { - Entry::Vacant(e) => { - let index = self.indexes.len() as u64; - self.indexes.push(vec![value]); - e.insert(index); - }, - Entry::Occupied(e) => { - let index = *e.get(); - let vec = &mut self.indexes[index as usize]; - vec.push(value); - }, - } - self.number_docs += 1; - } - - pub fn keys(&self) -> Iter { - self.keys.iter() - } - - pub fn finish(self) -> io::Result<()> { - self.into_inner().map(|_| ()) - } - - pub fn into_inner(mut self) -> io::Result { - - for vec in &mut self.indexes { - vec.sort_unstable(); - } - - let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs); - let len = ranges.len() as u64; - - // TODO check if this is correct - self.wtr.write_u64::(len)?; - unsafe { - // write Ranges first - let slice = into_u8_slice(ranges.as_slice()); - self.wtr.write_all(slice)?; - - // write Values after - let slice = into_u8_slice(values.as_slice()); - self.wtr.write_all(slice)?; - } - - self.wtr.flush()?; - Ok(self.wtr) - } -} - -fn into_sliced_ranges(vecs: Vec>, number_docs: usize) -> (Vec, Vec) { - let cap = vecs.len(); - let mut ranges = Vec::with_capacity(cap); - let mut values = Vec::with_capacity(number_docs); - - for v in &vecs { - let len = v.len() as u64; - let start = ranges.last().map(|&Range { end, .. 
}| end).unwrap_or(0); - - let range = Range { start, end: start + len }; - ranges.push(range); - } - - values.extend(vecs.into_iter().flatten()); - - (ranges, values) -} - -unsafe fn into_u8_slice(slice: &[T]) -> &[u8] { - let ptr = slice.as_ptr() as *const u8; - let len = slice.len() * mem::size_of::(); - from_raw_parts(ptr, len) -} diff --git a/src/metadata/mod.rs b/src/metadata/mod.rs deleted file mode 100644 index a01d48bd7..000000000 --- a/src/metadata/mod.rs +++ /dev/null @@ -1,136 +0,0 @@ -pub mod ops; -pub mod stream_ops; -pub mod doc_indexes; -pub mod difference; -pub mod ops_indexed_value; - -use fst::{Map, MapBuilder}; -use std::error::Error; -use std::path::Path; -use std::io::Write; -use crate::DocIndex; -use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; - -pub struct Metadata { - map: Map, - indexes: DocIndexes, -} - -impl Metadata { - pub unsafe fn from_paths(map: P, indexes: Q) -> Result> - where P: AsRef, - Q: AsRef, - { - let map = Map::from_path(map)?; - let indexes = DocIndexes::from_path(indexes)?; - Ok(Metadata { map, indexes }) - } - - pub fn from_bytes(map: Vec, indexes: Vec) -> Result> { - let map = Map::from_bytes(map)?; - let indexes = DocIndexes::from_bytes(indexes)?; - Ok(Metadata { map, indexes }) - } - - pub fn get>(&self, key: K) -> Option<&[DocIndex]> { - self.map.get(key).and_then(|index| self.indexes.get(index)) - } - - pub fn as_map(&self) -> &Map { - &self.map - } - - pub fn as_indexes(&self) -> &DocIndexes { - &self.indexes - } - - pub fn explode(self) -> (Map, DocIndexes) { - (self.map, self.indexes) - } -} - -pub struct MetadataBuilder { - map: W, - indexes: DocIndexesBuilder, -} - -impl MetadataBuilder { - pub fn new(map: W, indexes: X) -> Self { - Self { map, indexes: DocIndexesBuilder::new(indexes) } - } - - pub fn insert(&mut self, key: String, index: DocIndex) { - self.indexes.insert(key, index) - } - - pub fn finish(self) -> Result<(), Box> { - self.into_inner().map(|_| ()) - } - - pub fn into_inner(self) -> Result<(W, X), Box> { - // FIXME insert a magic number that indicates if the endianess - // of the input is the same as the machine that is reading it. - - let map = { - let mut keys_builder = MapBuilder::new(self.map)?; - let keys = self.indexes.keys().map(|(s, v)| (s, *v)); - keys_builder.extend_iter(keys)?; - keys_builder.into_inner()? 
- }; - - let indexes = self.indexes.into_inner()?; - - Ok((map, indexes)) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn empty_serialize_deserialize() { - let mapw = Vec::new(); - let indexesw = Vec::new(); - - let builder = MetadataBuilder::new(mapw, indexesw); - let (map, indexes) = builder.into_inner().unwrap(); - - let metas = Metadata::from_bytes(map, indexes).unwrap(); - assert_eq!(metas.get("chameau"), None); - } - - #[test] - fn one_doc_serialize_deserialize() { - let mapw = Vec::new(); - let indexesw = Vec::new(); - - let mut builder = MetadataBuilder::new(mapw, indexesw); - - let doc = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; - builder.insert("chameau".into(), doc); - - let (map, indexes) = builder.into_inner().unwrap(); - - let metas = Metadata::from_bytes(map, indexes).unwrap(); - assert_eq!(metas.get("chameau"), Some(&[doc][..])); - } - - #[test] - fn multiple_docs_serialize_deserialize() { - let mapw = Vec::new(); - let indexesw = Vec::new(); - - let mut builder = MetadataBuilder::new(mapw, indexesw); - - let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; - builder.insert("chameau".into(), doc1); - builder.insert("chameau".into(), doc2); - - let (map, indexes) = builder.into_inner().unwrap(); - - let metas = Metadata::from_bytes(map, indexes).unwrap(); - assert_eq!(metas.get("chameau"), Some(&[doc1, doc2][..])); - } -} diff --git a/src/metadata/ops.rs b/src/metadata/ops.rs deleted file mode 100644 index a0d48773b..000000000 --- a/src/metadata/ops.rs +++ /dev/null @@ -1,329 +0,0 @@ -use std::collections::BTreeMap; -use fst::{map, Streamer, Automaton}; -use fst::automaton::AlwaysMatch; -use sdset::multi::OpBuilder as SdOpBuilder; -use sdset::{SetOperation, Set}; -use crate::metadata::ops_indexed_value::{ - OpIndexedValueBuilder, UnionIndexedValue, -}; -use crate::metadata::doc_indexes::DocIndexes; -use crate::metadata::Metadata; -use crate::vec_read_only::VecReadOnly; -use crate::DocIndex; - -pub struct OpBuilder<'m, A: Automaton> { - // the operation on the maps is always an union. - maps: OpIndexedValueBuilder<'m>, - automatons: Vec, - indexes: Vec<&'m DocIndexes>, -} - -impl<'m> OpBuilder<'m, AlwaysMatch> { - pub fn new() -> Self { - Self { - maps: OpIndexedValueBuilder::new(), - automatons: vec![AlwaysMatch], - indexes: Vec::new(), - } - } -} - -/// Do a set operation on multiple maps with the same automatons. 
-impl<'m, A: 'm + Automaton> OpBuilder<'m, A> { - pub fn with_automatons(automatons: Vec) -> Self { - Self { - maps: OpIndexedValueBuilder::new(), - automatons: automatons, - indexes: Vec::new(), - } - } - - pub fn add(mut self, metadata: &'m Metadata) -> Self where A: Clone { - self.push(metadata); - self - } - - pub fn push(&mut self, metadata: &'m Metadata) where A: Clone { - let mut op = map::OpBuilder::new(); - for automaton in self.automatons.iter().cloned() { - let stream = metadata.as_map().search(automaton); - op.push(stream); - } - - let stream = op.union(); - let indexes = metadata.as_indexes(); - - self.maps.push(stream); - self.indexes.push(indexes); - } - - pub fn union(self) -> Union<'m> { - Union::new(self.maps, self.indexes, self.automatons.len()) - } - - pub fn intersection(self) -> Intersection<'m> { - Intersection::new(self.maps, self.indexes, self.automatons.len()) - } - - pub fn difference(self) -> Difference<'m> { - Difference::new(self.maps, self.indexes, self.automatons.len()) - } - - pub fn symmetric_difference(self) -> SymmetricDifference<'m> { - SymmetricDifference::new(self.maps, self.indexes, self.automatons.len()) - } -} - -#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] -pub struct IndexedDocIndexes { - pub index: usize, - pub doc_indexes: VecReadOnly, -} - -struct SlotIndexedDocIndexes { - index: usize, - start: usize, - len: usize, -} - -macro_rules! logical_operation { - (struct $name:ident, $operation:ident) => { - -pub struct $name<'m> { - maps: UnionIndexedValue<'m>, - indexes: Vec<&'m DocIndexes>, - number_automatons: usize, - outs: Vec, -} - -impl<'m> $name<'m> { - fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self { - $name { - maps: maps.union(), - indexes: indexes, - number_automatons: number_automatons, - outs: Vec::new(), - } - } -} - -impl<'m, 'a> fst::Streamer<'a> for $name<'m> { - type Item = (&'a [u8], &'a [IndexedDocIndexes]); - - fn next(&'a mut self) -> Option { - match self.maps.next() { - Some((input, ivalues)) => { - self.outs.clear(); - - let mut builders = vec![BTreeMap::new(); self.number_automatons]; - for iv in ivalues { - let builder = &mut builders[iv.aut_index]; - builder.insert(iv.rdr_index, iv.value); - } - - let mut doc_indexes = Vec::new(); - let mut doc_indexes_slots = Vec::with_capacity(builders.len()); - for (aut_index, values) in builders.into_iter().enumerate() { - let mut builder = SdOpBuilder::with_capacity(values.len()); - for (rdr_index, value) in values { - let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes"); - let indexes = Set::new_unchecked(indexes); - builder.push(indexes); - } - - let start = doc_indexes.len(); - builder.$operation().extend_vec(&mut doc_indexes); - let len = doc_indexes.len() - start; - if len != 0 { - let slot = SlotIndexedDocIndexes { - index: aut_index, - start: start, - len: len, - }; - doc_indexes_slots.push(slot); - } - } - - let read_only = VecReadOnly::new(doc_indexes); - self.outs.reserve(doc_indexes_slots.len()); - for slot in doc_indexes_slots { - let indexes = IndexedDocIndexes { - index: slot.index, - doc_indexes: read_only.range(slot.start, slot.len), - }; - self.outs.push(indexes); - } - - if self.outs.is_empty() { return None } - Some((input, &self.outs)) - }, - None => None, - } - } -} -}} - -logical_operation!(struct Union, union); -logical_operation!(struct Intersection, intersection); -logical_operation!(struct Difference, difference); -logical_operation!(struct 
SymmetricDifference, symmetric_difference); - -#[cfg(test)] -mod tests { - use super::*; - use crate::metadata::MetadataBuilder; - - fn get_exact_key<'m, I, S>(stream: I, key: &[u8]) -> Option> - where - I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>, - S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>, - { - let mut stream = stream.into_stream(); - while let Some((string, indexes)) = stream.next() { - if string == key { - return Some(indexes[0].doc_indexes.clone()) - } - } - None - } - - #[test] - fn union_two_metadata() { - let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; - - let meta1 = { - let mapw = Vec::new(); - let indexesw = Vec::new(); - let mut builder = MetadataBuilder::new(mapw, indexesw); - - builder.insert("chameau".into(), doc1); - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - }; - - let meta2 = { - let mapw = Vec::new(); - let indexesw = Vec::new(); - let mut builder = MetadataBuilder::new(mapw, indexesw); - - builder.insert("chameau".into(), doc2); - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - }; - - let metas = OpBuilder::new().add(&meta1).add(&meta2).union(); - let value = get_exact_key(metas, b"chameau"); - - assert_eq!(&*value.unwrap(), &[doc1, doc2][..]); - } - - #[test] - fn intersection_two_metadata() { - let doc1 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; - let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; - - let meta1 = { - let mapw = Vec::new(); - let indexesw = Vec::new(); - let mut builder = MetadataBuilder::new(mapw, indexesw); - - builder.insert("chameau".into(), doc1); - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - }; - - let meta2 = { - let mapw = Vec::new(); - let indexesw = Vec::new(); - let mut builder = MetadataBuilder::new(mapw, indexesw); - - builder.insert("chameau".into(), doc2); - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - }; - - let metas = OpBuilder::new().add(&meta1).add(&meta2).intersection(); - let value = get_exact_key(metas, b"chameau"); - - assert_eq!(&*value.unwrap(), &[doc1][..]); - } - - #[test] - fn difference_two_metadata() { - let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; - let doc3 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; - - let meta1 = { - let mapw = Vec::new(); - let indexesw = Vec::new(); - let mut builder = MetadataBuilder::new(mapw, indexesw); - - builder.insert("chameau".into(), doc1); - builder.insert("chameau".into(), doc2); - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - }; - - let meta2 = { - let mapw = Vec::new(); - let indexesw = Vec::new(); - let mut builder = MetadataBuilder::new(mapw, indexesw); - - builder.insert("chameau".into(), doc3); - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - }; - - let metas = OpBuilder::new().add(&meta1).add(&meta2).difference(); - let value = get_exact_key(metas, b"chameau"); - - assert_eq!(&*value.unwrap(), &[doc1][..]); - } - - #[test] - fn symmetric_difference_two_metadata() { - let doc1 = DocIndex { 
document_id: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; - let doc3 = DocIndex { document_id: 32, attribute: 0, attribute_index: 1 }; - let doc4 = DocIndex { document_id: 34, attribute: 12, attribute_index: 1 }; - - let meta1 = { - let mapw = Vec::new(); - let indexesw = Vec::new(); - let mut builder = MetadataBuilder::new(mapw, indexesw); - - builder.insert("chameau".into(), doc1); - builder.insert("chameau".into(), doc2); - builder.insert("chameau".into(), doc3); - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - }; - - let meta2 = { - let mapw = Vec::new(); - let indexesw = Vec::new(); - let mut builder = MetadataBuilder::new(mapw, indexesw); - - builder.insert("chameau".into(), doc2); - builder.insert("chameau".into(), doc3); - builder.insert("chameau".into(), doc4); - - let (map, indexes) = builder.into_inner().unwrap(); - Metadata::from_bytes(map, indexes).unwrap() - }; - - let metas = OpBuilder::new().add(&meta1).add(&meta2).symmetric_difference(); - let value = get_exact_key(metas, b"chameau"); - - assert_eq!(&*value.unwrap(), &[doc1, doc4][..]); - } -} diff --git a/src/metadata/ops_indexed_value.rs b/src/metadata/ops_indexed_value.rs deleted file mode 100644 index 2c557f61c..000000000 --- a/src/metadata/ops_indexed_value.rs +++ /dev/null @@ -1,203 +0,0 @@ -use std::collections::BinaryHeap; -use std::rc::Rc; -use std::cmp; -use fst::raw::{self, Output}; -use fst::{self, IntoStreamer, Streamer}; - -type BoxedStream<'f> = Box Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])> + 'f>; - -pub struct OpIndexedValueBuilder<'f> { - streams: Vec>, -} - -impl<'f> OpIndexedValueBuilder<'f> { - pub fn new() -> Self { - Self { streams: Vec::new() } - } - - pub fn push(&mut self, stream: I) - where - I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [raw::IndexedValue])>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])>, - { - self.streams.push(Box::new(stream.into_stream())); - } - - pub fn union(self) -> UnionIndexedValue<'f> { - UnionIndexedValue { - heap: StreamIndexedValueHeap::new(self.streams), - outs: Vec::new(), - cur_slot: None, - } - } -} - -pub struct UnionIndexedValue<'f> { - heap: StreamIndexedValueHeap<'f>, - outs: Vec, - cur_slot: Option, -} - -impl<'f> UnionIndexedValue<'f> { - pub fn len(&self) -> usize { - self.heap.num_slots() - } -} - -impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> { - type Item = (&'a [u8], &'a [IndexedValue]); - - fn next(&'a mut self) -> Option { - if let Some(slot) = self.cur_slot.take() { - self.heap.refill(slot); - } - let slot = match self.heap.pop() { - None => return None, - Some(slot) => { - self.cur_slot = Some(slot); - self.cur_slot.as_mut().unwrap() - } - }; - self.outs.clear(); - self.outs.push(slot.indexed_value()); - while let Some(slot2) = self.heap.pop_if_equal(slot.input()) { - self.outs.push(slot2.indexed_value()); - self.heap.refill(slot2); - } - Some((slot.input(), &self.outs)) - } -} - -struct StreamIndexedValueHeap<'f> { - rdrs: Vec>, - heap: BinaryHeap, -} - -impl<'f> StreamIndexedValueHeap<'f> { - fn new(streams: Vec>) -> StreamIndexedValueHeap<'f> { - let mut u = StreamIndexedValueHeap { - rdrs: streams, - heap: BinaryHeap::new(), - }; - for i in 0..u.rdrs.len() { - u.refill(SlotIndexedValue::new(i)); - } - u - } - - fn pop(&mut self) -> Option { - self.heap.pop() - } - - fn peek_is_duplicate(&self, key: &[u8]) -> bool { - self.heap.peek().map(|s| s.input() == 
key).unwrap_or(false) - } - - fn pop_if_equal(&mut self, key: &[u8]) -> Option { - if self.peek_is_duplicate(key) { - self.pop() - } else { - None - } - } - - fn pop_if_le(&mut self, key: &[u8]) -> Option { - if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) { - self.pop() - } else { - None - } - } - - fn num_slots(&self) -> usize { - self.rdrs.len() - } - - fn refill(&mut self, mut slot: SlotIndexedValue) { - if let Some((input, ivalues)) = self.rdrs[slot.rdr_index].next() { - slot.set_input(input); - for values in ivalues { - slot.set_aut_index(values.index); - slot.set_output(values.value); - self.heap.push(slot.clone()); - } - } - } -} - -#[derive(Debug, Clone)] -struct SlotIndexedValue { - rdr_index: usize, - aut_index: usize, - input: Rc>, - output: Output, -} - -#[derive(Debug)] -pub struct IndexedValue { - pub rdr_index: usize, - pub aut_index: usize, - pub value: u64, -} - -impl PartialEq for SlotIndexedValue { - fn eq(&self, other: &Self) -> bool { - (&self.input, self.rdr_index, self.aut_index, self.output) - .eq(&(&other.input, other.rdr_index, other.aut_index, other.output)) - } -} - -impl Eq for SlotIndexedValue { } - -impl PartialOrd for SlotIndexedValue { - fn partial_cmp(&self, other: &Self) -> Option { - (&self.input, self.rdr_index, self.aut_index, self.output) - .partial_cmp(&(&other.input, other.rdr_index, other.aut_index, other.output)) - .map(|ord| ord.reverse()) - } -} - -impl Ord for SlotIndexedValue { - fn cmp(&self, other: &Self) -> cmp::Ordering { - self.partial_cmp(other).unwrap() - } -} - -impl SlotIndexedValue { - fn new(rdr_index: usize) -> SlotIndexedValue { - SlotIndexedValue { - rdr_index: rdr_index, - aut_index: 0, - input: Rc::new(Vec::with_capacity(64)), - output: Output::zero(), - } - } - - fn indexed_value(&self) -> IndexedValue { - IndexedValue { - rdr_index: self.rdr_index, - aut_index: self.aut_index, - value: self.output.value(), - } - } - - fn input(&self) -> &[u8] { - &self.input - } - - fn set_aut_index(&mut self, aut_index: usize) { - self.aut_index = aut_index; - } - - fn set_input(&mut self, input: &[u8]) { - if *self.input != input { - let inner = Rc::make_mut(&mut self.input); - inner.clear(); - inner.extend(input); - } - } - - fn set_output(&mut self, output: u64) { - self.output = Output::new(output); - } -} diff --git a/src/metadata/stream_ops.rs b/src/metadata/stream_ops.rs deleted file mode 100644 index 230a54b07..000000000 --- a/src/metadata/stream_ops.rs +++ /dev/null @@ -1,309 +0,0 @@ -use std::rc::Rc; -use std::collections::{BinaryHeap, HashMap, BTreeMap}; -use std::cmp; -use fst::{IntoStreamer, Streamer}; -use sdset::multi::OpBuilder as SdOpBuilder; -use sdset::{SetOperation, Set}; -use crate::metadata::ops::IndexedDocIndexes; -use crate::vec_read_only::VecReadOnly; -use crate::DocIndex; - -type BoxedStream<'f> = Box Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])> + 'f>; - -pub struct OpBuilder<'f> { - streams: Vec>, -} - -impl<'f> OpBuilder<'f> { - pub fn new() -> Self { - Self { streams: Vec::new() } - } - - /// Push a stream of `IndexedDocIndexes`. - /// - /// # Warning - /// - /// You must ensure yourself that the automatons are - /// all the same in the same order for each stream you push. 
- pub fn push(&mut self, stream: I) - where - I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>, - { - self.streams.push(Box::new(stream.into_stream())); - } - - pub fn union(self) -> Union<'f> { - Union { - heap: StreamHeap::new(self.streams), - outs: Vec::new(), - cur_slot: None, - } - } - - pub fn intersection(self) -> Intersection<'f> { - Intersection { - heap: StreamHeap::new(self.streams), - outs: Vec::new(), - cur_slot: None, - } - } - - pub fn difference(self) -> Difference<'f> { - Difference { - heap: StreamHeap::new(self.streams), - outs: Vec::new(), - cur_slot: None, - } - } - - pub fn symmetric_difference(self) -> SymmetricDifference<'f> { - SymmetricDifference { - heap: StreamHeap::new(self.streams), - outs: Vec::new(), - cur_slot: None, - } - } -} - -// FIXME reuse it from metadata::ops -struct SlotIndexedDocIndexes { - aut_index: usize, - start: usize, - len: usize, -} - -macro_rules! logical_operation { - (struct $name:ident, $operation:ident) => { - -pub struct $name<'f> { - heap: StreamHeap<'f>, - outs: Vec, - cur_slot: Option, -} - -impl<'a, 'f> Streamer<'a> for $name<'f> { - type Item = (&'a [u8], &'a [IndexedDocIndexes]); - - // The Metadata could be types as "key-values present" and "key-values possibly not present" - // in other words Metadata that "needs" to have key-values and other that doesn't needs. - // - // We could probably allow the user to define in Metadata some Document - // that needs to be deleted and only declare the DocumentId, and not every DocIndex of each words. - fn next(&'a mut self) -> Option { - if let Some(slot) = self.cur_slot.take() { - self.heap.refill(slot); - } - let slot = match self.heap.pop() { - None => return None, - Some(slot) => { - self.cur_slot = Some(slot); - self.cur_slot.as_mut().unwrap() - } - }; - - self.outs.clear(); - - // retrieve all the doc_indexes of all the streams, - // store them in an HashMap which the key is - // the aut_index (associated with the state that is ignored), - // the doc_indexes must be stored in another BTreeMap which the key - // is the rdr_index. - // - // This will permit us to do set operations on readers (using the rdr_index) - // the BTreeMap will gives the rdr_index in order and the final result - // will be aggregated in a Vec of IndexedDocIndexes which the aut_index and state - // are the key of the first HashMap - - // TODO use the fnv Hasher! 
- - let mut builders = HashMap::new(); - let iv = slot.indexed_value(); - let builder = builders.entry(iv.index).or_insert_with(BTreeMap::new); - builder.insert(slot.rdr_index, iv.doc_indexes); - - while let Some(mut slot) = self.heap.pop_if_equal(slot.input()) { - let iv = slot.indexed_value(); - let builder = builders.entry(iv.index).or_insert_with(BTreeMap::new); - builder.insert(slot.rdr_index, iv.doc_indexes); - - self.heap.refill(slot); - } - - // now that we have accumulated all the doc_indexes like so: - // HashMap<(aut_index, state*), BtreeMap> - // we will be able to retrieve, for each aut_index, the doc_indexes - // that are needed to do the set operation - - let mut doc_indexes = Vec::new(); - let mut doc_indexes_slots = Vec::with_capacity(builders.len()); - for (aut_index, values) in builders { - - let sets = values.iter().map(|(_, v)| Set::new_unchecked(v.as_slice())).collect(); - let builder = SdOpBuilder::from_vec(sets); - - let start = doc_indexes.len(); - builder.$operation().extend_vec(&mut doc_indexes); - let len = doc_indexes.len() - start; - if len == 0 { continue } - - let slot = SlotIndexedDocIndexes { - aut_index: aut_index, - start: start, - len: len, - }; - doc_indexes_slots.push(slot); - } - - let read_only = VecReadOnly::new(doc_indexes); - self.outs.reserve(doc_indexes_slots.len()); - for slot in doc_indexes_slots { - let indexes = IndexedDocIndexes { - index: slot.aut_index, - doc_indexes: read_only.range(slot.start, slot.len), - }; - self.outs.push(indexes); - } - - if self.outs.is_empty() { return None } - Some((slot.input(), &self.outs)) - } -} -}} - -logical_operation!(struct Union, union); -logical_operation!(struct Intersection, intersection); -logical_operation!(struct Difference, difference); -logical_operation!(struct SymmetricDifference, symmetric_difference); - -struct StreamHeap<'f> { - rdrs: Vec>, - heap: BinaryHeap, -} - -impl<'f> StreamHeap<'f> { - fn new(streams: Vec>) -> StreamHeap<'f> { - let mut heap = StreamHeap { - rdrs: streams, - heap: BinaryHeap::new(), - }; - for i in 0..heap.rdrs.len() { - heap.refill(Slot::new(i)); - } - heap - } - - fn pop(&mut self) -> Option { - self.heap.pop() - } - - fn peek_is_duplicate(&self, key: &[u8]) -> bool { - self.heap.peek().map(|s| s.input() == key).unwrap_or(false) - } - - fn pop_if_equal(&mut self, key: &[u8]) -> Option { - if self.peek_is_duplicate(key) { - self.pop() - } else { - None - } - } - - fn pop_if_le(&mut self, key: &[u8]) -> Option { - if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) { - self.pop() - } else { - None - } - } - - fn num_slots(&self) -> usize { - self.rdrs.len() - } - - fn refill(&mut self, mut slot: Slot) { - if let Some((input, outputs)) = self.rdrs[slot.rdr_index].next() { - slot.set_input(input); - for output in outputs { - slot.set_aut_index(output.index); - slot.set_output(output.doc_indexes.clone()); - self.heap.push(slot.clone()); - } - } - } -} - -#[derive(Debug, Clone)] -struct Slot { - rdr_index: usize, - aut_index: usize, - input: Rc>, - output: Option>, -} - -impl PartialEq for Slot { - fn eq(&self, other: &Self) -> bool { - (&self.input, self.rdr_index, self.aut_index) - .eq(&(&other.input, other.rdr_index, other.aut_index)) - } -} - -impl Eq for Slot { } - -impl PartialOrd for Slot { - fn partial_cmp(&self, other: &Self) -> Option { - (&self.input, self.rdr_index, self.aut_index) - .partial_cmp(&(&other.input, other.rdr_index, other.aut_index)) - .map(|ord| ord.reverse()) - } -} - -impl Ord for Slot { - fn cmp(&self, other: &Self) -> 
cmp::Ordering { - self.partial_cmp(other).unwrap() - } -} - -impl Slot { - fn new(rdr_index: usize) -> Self { - Slot { - rdr_index: rdr_index, - aut_index: 0, - input: Rc::new(Vec::with_capacity(64)), - output: None, - } - } - - fn indexed_value(&mut self) -> IndexedDocIndexes { - IndexedDocIndexes { - index: self.aut_index, - doc_indexes: self.output.take().unwrap(), - } - } - - fn input(&self) -> &[u8] { - &self.input - } - - fn set_input(&mut self, input: &[u8]) { - if *self.input != input { - let inner = Rc::make_mut(&mut self.input); - inner.clear(); - inner.extend(input); - } - } - - fn set_aut_index(&mut self, aut_index: usize) { - self.aut_index = aut_index; - } - - fn set_output(&mut self, output: VecReadOnly) { - self.output = Some(output); - } -} - -#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] -pub struct IndexedValueWithState { - pub index: usize, - pub value: u64, -} diff --git a/src/pentium.rs b/src/pentium.rs deleted file mode 100644 index c9421ca46..000000000 --- a/src/pentium.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::error::Error; - -use crate::automaton; -use crate::rank::Document; -use crate::index::Index; - -pub struct Pentium { - index: Index, -} - -impl Pentium { - pub fn from_index(index: Index) -> Result> { - unimplemented!() - } - - pub fn search(&self, query: &str) -> Vec { - - let mut automatons = Vec::new(); - for word in query.split_whitespace().map(str::to_lowercase) { - let dfa = automaton::build_prefix_dfa(&word); - automatons.push(dfa); - } - - let stream = unimplemented!(); - - unimplemented!() - } -}
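
---

Usage sketch (not part of the patch): a minimal example of the negative-blob API this commit introduces, mirroring the signatures added in src/blob/negative_blob.rs, src/blob/mod.rs and src/data/doc_ids.rs. It assumes `DocumentId` is the crate's `u64` alias and that the crate root is named `pentium`; both are assumptions, as neither is shown in this diff.

    use std::error::Error;

    // Hypothetical crate/module paths; adjust to the real crate name.
    use pentium::blob::{Blob, NegativeBlobBuilder, Sign};

    fn main() -> Result<(), Box<dyn Error>> {
        // Collect the identifiers of the documents this blob removes.
        // `Vec::new()` is the in-memory writer used by `build()`.
        let mut builder = NegativeBlobBuilder::new(Vec::new());
        builder.insert(1);
        builder.insert(4);

        // `build()` serializes the sorted ids and reloads them as a NegativeBlob.
        let blob = Blob::Negative(builder.build()?);
        assert_eq!(blob.sign(), Sign::Negative);

        if let Blob::Negative(negative) = blob {
            // DocIds keeps the ids sorted, so membership is a binary search.
            assert!(negative.as_ids().contains(4));
            assert!(!negative.as_ids().contains(2));
        }

        Ok(())
    }

The same builder pattern backs the negative-blob tests rewritten in src/blob/merge.rs above: a negative blob carries only document ids (via `DocIds`), while positive blobs keep the fst `Map` plus `DocIndexes`, which is why `OpBuilder::push` in src/blob/ops.rs now matches on the blob's sign.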