From 9b58ffe2d91c0a46df08cee019d05a9251b15f74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 27 Nov 2018 19:11:33 +0100 Subject: [PATCH] feat: Introduce the QueryBuilder struct --- src/blob/mod.rs | 2 +- src/database.rs | 42 +++++++ src/index/mod.rs | 6 +- src/index/update/positive_update.rs | 2 +- src/lib.rs | 7 +- src/rank/ranked_stream.rs | 175 +++++++++++++++++----------- 6 files changed, 159 insertions(+), 75 deletions(-) create mode 100644 src/database.rs diff --git a/src/blob/mod.rs b/src/blob/mod.rs index 37646f789..af0b52625 100644 --- a/src/blob/mod.rs +++ b/src/blob/mod.rs @@ -1,5 +1,5 @@ mod merge; -mod ops; +pub mod ops; mod ops_indexed_value; mod positive_blob; mod negative_blob; diff --git a/src/database.rs b/src/database.rs new file mode 100644 index 000000000..bcc25661e --- /dev/null +++ b/src/database.rs @@ -0,0 +1,42 @@ +use std::error::Error; +use std::ops::Deref; + +use ::rocksdb::rocksdb::{DB, Snapshot}; + +use crate::index::schema::Schema; +use crate::blob::PositiveBlob; +use crate::DocumentId; + +pub trait Retrieve { + fn schema(&self) -> Result, Box>; + fn data_index(&self) -> Result>; + fn get_documents(&self, ids: &[DocumentId]) -> Result, Box>; +} + +impl Retrieve for Snapshot +where T: Deref, +{ + fn schema(&self) -> Result, Box> { + match self.deref().get(b"data-schema")? { + Some(value) => Ok(Some(Schema::read_from(&*value)?)), + None => Ok(None), + } + } + + fn data_index(&self) -> Result> { + match self.deref().get(b"data-index")? { + Some(value) => Ok(bincode::deserialize(&value)?), + None => Ok(PositiveBlob::default()), + } + } + + fn get_documents(&self, ids: &[DocumentId]) -> Result, Box> { + if ids.is_empty() { return Ok(Vec::new()) } + let schema = match self.schema()? { + Some(schema) => schema, + None => return Err(String::from("BUG: could not find schema").into()), + }; + + unimplemented!() + } +} diff --git a/src/index/mod.rs b/src/index/mod.rs index 3e005e85e..842d6fb58 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -238,9 +238,9 @@ impl Index { let snapshot = self.database.snapshot(); let index_key = Identifier::data().index().build(); - let map = match snapshot.get(&index_key)? { + let blob = match snapshot.get(&index_key)? { Some(value) => bincode::deserialize(&value)?, - None => Vec::new(), + None => PositiveBlob::default(), }; let mut automatons = Vec::new(); @@ -250,7 +250,7 @@ impl Index { } let config = Config { - map: map, + blob: blob, automatons: automatons, criteria: criterion::default(), distinct: ((), 1), diff --git a/src/index/update/positive_update.rs b/src/index/update/positive_update.rs index d8a5a3796..2825406c1 100644 --- a/src/index/update/positive_update.rs +++ b/src/index/update/positive_update.rs @@ -8,7 +8,7 @@ use crate::index::update::Update; use crate::index::identifier::Identifier; use crate::index::schema::{SchemaProps, Schema, SchemaAttr}; use crate::tokenizer::TokenizerBuilder; -use crate::blob::{BlobInfo, PositiveBlobBuilder}; +use crate::blob::PositiveBlobBuilder; use crate::{DocIndex, DocumentId}; pub enum NewState { diff --git a/src/lib.rs b/src/lib.rs index 61edcc7e7..834971a40 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,13 +3,14 @@ #[macro_use] extern crate lazy_static; #[macro_use] extern crate serde_derive; -pub mod index; +pub mod automaton; pub mod blob; pub mod data; +pub mod database; +pub mod index; pub mod rank; -pub mod vec_read_only; -pub mod automaton; pub mod tokenizer; +pub mod vec_read_only; mod common_words; pub use self::tokenizer::Tokenizer; diff --git a/src/rank/ranked_stream.rs b/src/rank/ranked_stream.rs index 9e8fdcd54..9288dca8f 100644 --- a/src/rank/ranked_stream.rs +++ b/src/rank/ranked_stream.rs @@ -1,19 +1,24 @@ +use std::ops::{Deref, Range, RangeBounds}; use std::collections::HashMap; +use std::{mem, vec, str}; +use std::ops::Bound::*; +use std::error::Error; use std::hash::Hash; -use std::ops::Range; use std::rc::Rc; -use std::{mem, vec}; use fnv::FnvHashMap; use fst::Streamer; use group_by::GroupByMut; +use ::rocksdb::rocksdb::{DB, Snapshot}; -use crate::automaton::{DfaExt, AutomatonExt}; -use crate::index::Index; -use crate::blob::{Blob, Merge}; -use crate::rank::criterion::Criterion; -use crate::rank::Document; +use crate::automaton::{self, DfaExt, AutomatonExt}; +use crate::rank::criterion::{self, Criterion}; +use crate::blob::{PositiveBlob, Merge}; +use crate::blob::ops::Union; use crate::{Match, DocumentId}; +use crate::database::Retrieve; +use crate::rank::Document; +use crate::index::Index; fn clamp_range(range: Range, big: Range) -> Range { Range { @@ -22,40 +27,58 @@ fn clamp_range(range: Range, big: Range) -> Range { } } -pub struct Config<'a, C, F> { - pub blobs: &'a [Blob], - pub automatons: Vec, - pub criteria: Vec, - pub distinct: (F, usize), +fn split_whitespace_automatons(query: &str) -> Vec { + let mut automatons = Vec::new(); + for query in query.split_whitespace().map(str::to_lowercase) { + let lev = automaton::build_prefix_dfa(&query); + automatons.push(lev); + } + automatons } -pub struct RankedStream<'m, C, F> { - stream: crate::blob::Merge<'m>, - automatons: Vec>, +pub struct QueryBuilder, C> { + snapshot: Snapshot, + blob: PositiveBlob, criteria: Vec, - distinct: (F, usize), } -impl<'m, C, F> RankedStream<'m, C, F> { - pub fn new(config: Config<'m, C, F>) -> Self { - let automatons: Vec<_> = config.automatons.into_iter().map(Rc::new).collect(); - - RankedStream { - stream: Merge::with_automatons(automatons.clone(), config.blobs), - automatons: automatons, - criteria: config.criteria, - distinct: config.distinct, - } +impl> QueryBuilder> { + pub fn new(snapshot: Snapshot) -> Result> { + QueryBuilder::with_criteria(snapshot, criterion::default()) } } -impl<'m, C, F> RankedStream<'m, C, F> { - fn retrieve_all_documents(&mut self) -> Vec { +impl QueryBuilder +where T: Deref, +{ + pub fn with_criteria(snapshot: Snapshot, criteria: Vec) -> Result> { + let blob = snapshot.data_index()?; + Ok(QueryBuilder { snapshot, blob, criteria }) + } + + pub fn criteria(&mut self, criteria: Vec) -> &mut Self { + self.criteria = criteria; + self + } + + pub fn with_distinct(self, function: F, size: usize) -> DistinctQueryBuilder { + DistinctQueryBuilder { + snapshot: self.snapshot, + blob: self.blob, + criteria: self.criteria, + function: function, + size: size + } + } + + fn query_all(&self, query: &str) -> Vec { + let automatons = split_whitespace_automatons(query); + let mut stream: Union = unimplemented!(); let mut matches = FnvHashMap::default(); - while let Some((string, indexed_values)) = self.stream.next() { + while let Some((string, indexed_values)) = stream.next() { for iv in indexed_values { - let automaton = &self.automatons[iv.index]; + let automaton = &automatons[iv.index]; let distance = automaton.eval(string).to_u8(); let is_exact = distance == 0 && string.len() == automaton.query_len(); @@ -76,12 +99,12 @@ impl<'m, C, F> RankedStream<'m, C, F> { } } -impl<'a, C, F> RankedStream<'a, C, F> -where C: Criterion +impl QueryBuilder +where T: Deref, + C: Criterion, { - // TODO don't sort to much documents, we can skip useless sorts - pub fn retrieve_documents(mut self, range: Range) -> Vec { - let mut documents = self.retrieve_all_documents(); + pub fn query(&self, query: &str, range: impl RangeBounds) -> Vec { + let mut documents = self.query_all(query); let mut groups = vec![documents.as_mut_slice()]; for criterion in self.criteria { @@ -95,47 +118,65 @@ where C: Criterion } } - let range = clamp_range(range, 0..documents.len()); + // let range = clamp_range(range, 0..documents.len()); + let range: Range = unimplemented!(); documents[range].to_vec() } +} - pub fn retrieve_distinct_documents(mut self, range: Range) -> Vec - where F: Fn(&DocumentId) -> Option, - K: Hash + Eq, - { - let mut documents = self.retrieve_all_documents(); - let mut groups = vec![documents.as_mut_slice()]; +pub struct DistinctQueryBuilder, F, C> { + snapshot: Snapshot, + blob: PositiveBlob, + criteria: Vec, + function: F, + size: usize, +} - for criterion in self.criteria { - let tmp_groups = mem::replace(&mut groups, Vec::new()); +// pub struct Schema; +// pub struct DocDatabase; +// where F: Fn(&Schema, &DocDatabase) -> Option, +// K: Hash + Eq, - for group in tmp_groups { - group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); - for group in GroupByMut::new(group, |a, b| criterion.eq(a, b)) { - groups.push(group); - } - } - } +impl, F, C> DistinctQueryBuilder +where T: Deref, + C: Criterion, +{ + pub fn query(&self, query: &str, range: impl RangeBounds) -> Vec { + // let mut documents = self.retrieve_all_documents(); + // let mut groups = vec![documents.as_mut_slice()]; - let mut out_documents = Vec::with_capacity(range.len()); - let (distinct, limit) = self.distinct; - let mut seen = DistinctMap::new(limit); + // for criterion in self.criteria { + // let tmp_groups = mem::replace(&mut groups, Vec::new()); - for document in documents { - let accepted = match distinct(&document.id) { - Some(key) => seen.digest(key), - None => seen.accept_without_key(), - }; + // for group in tmp_groups { + // group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); + // for group in GroupByMut::new(group, |a, b| criterion.eq(a, b)) { + // groups.push(group); + // } + // } + // } - if accepted { - if seen.len() == range.end { break } - if seen.len() >= range.start { - out_documents.push(document); - } - } - } + // let mut out_documents = Vec::with_capacity(range.len()); + // let (distinct, limit) = self.distinct; + // let mut seen = DistinctMap::new(limit); - out_documents + // for document in documents { + // let accepted = match distinct(&document.id) { + // Some(key) => seen.digest(key), + // None => seen.accept_without_key(), + // }; + + // if accepted { + // if seen.len() == range.end { break } + // if seen.len() >= range.start { + // out_documents.push(document); + // } + // } + // } + + // out_documents + + unimplemented!() } }