feat(search): Accept multiple words and do a simple union

This commit is contained in:
Kerollmops 2018-05-06 12:23:42 +02:00 committed by Clément Renault
parent 758baeb8e1
commit 1476aa3dba
5 changed files with 208 additions and 37 deletions

View file

@ -13,14 +13,14 @@ use std::path::Path;
use std::fs::File;
use std::io::{Read, BufReader};
use fst::{IntoStreamer, Streamer};
use fst::Streamer;
use futures::future;
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use tokio_minihttp::{Request, Response, Http};
use tokio_proto::TcpServer;
use tokio_service::Service;
use raptor::FstMap;
use raptor::{FstMap, OpWithStateBuilder};
static mut MAP: Option<FstMap<u64>> = None;
static mut LEV_BUILDER_0: Option<LevBuilder> = None;
@ -52,25 +52,40 @@ impl<'a> Service for MainService<'a> {
if let Some((_, query)) = url.query_pairs().find(|&(ref k, _)| k == "q") {
let query = query.to_lowercase();
let lev = if query.len() <= 4 {
self.lev_builder_0.build_dfa(&query)
} else if query.len() <= 8 {
self.lev_builder_1.build_dfa(&query)
} else {
self.lev_builder_2.build_dfa(&query)
};
let mut automatons = Vec::new();
let mut stream = self.map.search(&lev).with_state().into_stream();
for query in query.split_whitespace() {
let lev = if query.len() <= 4 {
self.lev_builder_0.build_dfa(&query)
} else if query.len() <= 8 {
self.lev_builder_1.build_dfa(&query)
} else {
self.lev_builder_2.build_dfa(&query)
};
automatons.push(lev);
}
let mut op = OpWithStateBuilder::new(self.map.values());
for automaton in automatons.iter().cloned() {
let stream = self.map.as_map().search(automaton).with_state();
op.push(stream);
}
let mut stream = op.union();
let mut body = String::new();
body.push_str("<html><body>");
while let Some((key, values, state)) = stream.next() {
while let Some((key, ivalues)) = stream.next() {
match std::str::from_utf8(key) {
Ok(key) => {
let values = &values[..values.len().min(10)];
let distance = lev.distance(state);
body.push_str(&format!("<p>{:?} (dist: {:?}) {:?}</p>", key, distance, values));
for ivalue in ivalues {
let i = ivalue.index;
let state = ivalue.state;
let distance = automatons[i].distance(state);
body.push_str(&format!("<p>{:?} (dist: {:?}) {:?}</p>", key, distance, ivalue.values));
}
},
Err(e) => eprintln!("{:?}", e),
}

View file

@ -1,5 +1,5 @@
use bincode;
use fst::{self, Map, MapBuilder, Automaton};
use fst::{self, Automaton};
use serde::de::DeserializeOwned;
use serde::ser::Serialize;
use std::fs::File;
@ -10,7 +10,7 @@ use {StreamBuilder, Stream};
#[derive(Debug)]
pub struct FstMap<T> {
inner: Map,
inner: fst::Map,
values: Values<T>,
}
@ -21,7 +21,7 @@ impl<T> FstMap<T> {
P: AsRef<Path>,
Q: AsRef<Path>
{
let inner = Map::from_path(map)?;
let inner = fst::Map::from_path(map)?;
// TODO handle errors !!!
let values = File::open(values).unwrap();
@ -35,7 +35,7 @@ impl<T> FstMap<T> {
where
T: DeserializeOwned
{
let inner = Map::from_bytes(map)?;
let inner = fst::Map::from_bytes(map)?;
let values = bincode::deserialize(values).unwrap();
Ok(Self { inner, values })
@ -62,6 +62,19 @@ impl<T> FstMap<T> {
values: &self.values,
}
}
/// Placeholder for building an `OpBuilder` over this map and its values.
///
/// The intended implementation is kept as a comment in the body.
///
/// # Panics
///
/// Always panics: the body is `unimplemented!()`.
pub fn op(&self) -> OpBuilder<T> {
// OpBuilder::new(&self.values).add(self.as_inner())
unimplemented!()
}
/// Borrows the underlying `fst::Map` that stores the keys.
pub fn as_map(&self) -> &fst::Map {
    &self.inner
}
/// Borrows the `Values` store held next to the key map.
pub fn values(&self) -> &Values<T> {
    &self.values
}
}
#[derive(Debug, Serialize, Deserialize)]
@ -137,7 +150,7 @@ impl<T> FstMapBuilder<T> {
pub fn build_memory(self) -> fst::Result<FstMap<T>> {
Ok(FstMap {
inner: Map::from_iter(self.map)?,
inner: fst::Map::from_iter(self.map)?,
values: Values::new(self.values),
})
}
@ -148,7 +161,7 @@ impl<T> FstMapBuilder<T> {
W: Write,
X: Write
{
let mut builder = MapBuilder::new(map_wrt)?;
let mut builder = fst::MapBuilder::new(map_wrt)?;
builder.extend_iter(self.map)?;
let map = builder.into_inner()?;
let values = Values::new(self.values);
@ -159,3 +172,148 @@ impl<T> FstMapBuilder<T> {
Ok((map, values_wrt))
}
}
/// Collects key streams so they can be combined with a set operation.
///
/// Wraps `fst::map::OpBuilder` and keeps a reference to a `Values` store,
/// which is handed over to the `Union` produced by `union`.
pub struct OpBuilder<'m, 'v, T: 'v> {
    inner: fst::map::OpBuilder<'m>,
    values: &'v Values<T>,
}

impl<'m, 'v, T: 'v> OpBuilder<'m, 'v, T> {
    /// Creates a builder with no stream registered, bound to `values`.
    pub fn new(values: &'v Values<T>) -> Self {
        let inner = fst::map::OpBuilder::new();
        Self { inner, values }
    }

    /// Registers `streamable` and returns the builder, for call chaining.
    ///
    /// Same effect as `push`, but takes and gives back ownership.
    pub fn add<I, S>(mut self, streamable: I) -> Self
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
    {
        self.inner.push(streamable);
        self
    }

    /// Registers `streamable` with the wrapped `fst` builder.
    pub fn push<I, S>(&mut self, streamable: I)
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
    {
        self.inner.push(streamable);
    }

    /// Consumes the builder and returns the union stream of everything
    /// registered so far.
    pub fn union(self) -> Union<'m, 'v, T> {
        let OpBuilder { inner, values } = self;
        Union {
            inner: inner.union(),
            // Starts empty; refilled by the stream on each `next` call.
            outs: Vec::new(),
            values,
        }
    }
}
/// Union stream created by `OpBuilder::union`.
///
/// Each item pairs a key with one `IndexedValues` entry per output reported
/// by the wrapped `fst::map::Union` for that key.
pub struct Union<'m, 'v, T: 'v> {
    inner: fst::map::Union<'m>,
    // Scratch buffer: cleared and refilled for every yielded key; the slice
    // returned by `next` borrows from it.
    outs: Vec<IndexedValues<'v, T>>,
    values: &'v Values<T>,
}

impl<'a, 'm, 'v, T: 'v + 'a> fst::Streamer<'a> for Union<'m, 'v, T> {
    type Item = (&'a [u8], &'a [IndexedValues<'a, T>]);

    /// Advances the wrapped union and resolves each raw output to its values.
    ///
    /// Returns `None` once the wrapped stream is exhausted; the scratch
    /// buffer is left untouched in that case.
    fn next(&'a mut self) -> Option<Self::Item> {
        let (key, raw_outputs) = self.inner.next()?;

        self.outs.clear();
        for raw in raw_outputs {
            let entry = IndexedValues {
                index: raw.index,
                // NOTE(review): unchecked lookup; presumably relies on every
                // output stored in the map being a valid position in
                // `values` (TODO confirm against `Values::get_unchecked`).
                values: unsafe { self.values.get_unchecked(raw.value as usize) },
            };
            self.outs.push(entry);
        }

        Some((key, &self.outs))
    }
}
/// Values attached to a key, tagged with an `index`.
///
/// `Union` fills `index` from the `index` field of the raw output reported
/// by the wrapped `fst` union.
#[derive(Debug)]
pub struct IndexedValues<'a, T: 'a> {
    /// Index copied from the raw output of the wrapped union.
    pub index: usize,
    /// Borrowed slice of values for the key.
    pub values: &'a [T],
}
/// Collects key streams that also yield a state `U` with every item, so they
/// can be combined with a set operation.
///
/// Wraps `fst::map::OpWithStateBuilder` and keeps a reference to a `Values`
/// store, which is handed over to the `UnionWithState` produced by `union`.
pub struct OpWithStateBuilder<'m, 'v, T: 'v, U> {
    inner: fst::map::OpWithStateBuilder<'m, U>,
    values: &'v Values<T>,
}

impl<'m, 'v, T: 'v, U: 'static> OpWithStateBuilder<'m, 'v, T, U> {
    /// Creates a builder with no stream registered, bound to `values`.
    pub fn new(values: &'v Values<T>) -> Self {
        let inner = fst::map::OpWithStateBuilder::new();
        Self { inner, values }
    }

    /// Registers `streamable` and returns the builder, for call chaining.
    ///
    /// Same effect as `push`, but takes and gives back ownership.
    pub fn add<I, S>(mut self, streamable: I) -> Self
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
    {
        self.inner.push(streamable);
        self
    }

    /// Registers `streamable` with the wrapped `fst` builder.
    pub fn push<I, S>(&mut self, streamable: I)
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
    {
        self.inner.push(streamable);
    }

    /// Consumes the builder and returns the union stream of everything
    /// registered so far.
    pub fn union(self) -> UnionWithState<'m, 'v, T, U> {
        let OpWithStateBuilder { inner, values } = self;
        UnionWithState {
            inner: inner.union(),
            // Starts empty; refilled by the stream on each `next` call.
            outs: Vec::new(),
            values,
        }
    }
}
/// Union stream created by `OpWithStateBuilder::union`.
///
/// Each item pairs a key with one `IndexedValuesWithState` entry per output
/// reported by the wrapped `fst::map::UnionWithState` for that key.
pub struct UnionWithState<'m, 'v, T: 'v, U> {
    inner: fst::map::UnionWithState<'m, U>,
    // Scratch buffer: cleared and refilled for every yielded key; the slice
    // returned by `next` borrows from it.
    outs: Vec<IndexedValuesWithState<'v, T, U>>,
    values: &'v Values<T>,
}

impl<'a, 'm, 'v, T: 'v + 'a, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, T, U>
where
    U: Clone,
{
    type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, T, U>]);

    /// Advances the wrapped union, resolving each raw output to its values
    /// and cloning the state that came with it.
    ///
    /// Returns `None` once the wrapped stream is exhausted; the scratch
    /// buffer is left untouched in that case.
    fn next(&'a mut self) -> Option<Self::Item> {
        let (key, raw_outputs) = self.inner.next()?;

        self.outs.clear();
        for raw in raw_outputs {
            let index = raw.index;
            // NOTE(review): unchecked lookup; presumably relies on every
            // output stored in the map being a valid position in `values`
            // (TODO confirm against `Values::get_unchecked`).
            let values = unsafe { self.values.get_unchecked(raw.value as usize) };
            // The raw output is only borrowed, so the state must be cloned
            // to be stored in the scratch buffer.
            let state = raw.state.clone();
            self.outs.push(IndexedValuesWithState { index, values, state });
        }

        Some((key, &self.outs))
    }
}
/// Values attached to a key, tagged with an `index` and a state `U`.
///
/// `UnionWithState` fills `index` and `state` from the raw output reported
/// by the wrapped `fst` union.
#[derive(Debug)]
pub struct IndexedValuesWithState<'a, T: 'a, U> {
    /// Index copied from the raw output of the wrapped union.
    pub index: usize,
    /// Borrowed slice of values for the key.
    pub values: &'a [T],
    /// State cloned from the raw output of the wrapped union.
    pub state: U,
}

View file

@ -5,13 +5,13 @@ extern crate serde;
mod fst_map;
use std::ops::Range;
use std::io::{Write, BufReader};
use std::fs::File;
use std::path::Path;
use fst::Automaton;
pub use self::fst_map::{FstMap, FstMapBuilder};
pub use self::fst_map::{
OpBuilder, IndexedValues,
OpWithStateBuilder, IndexedValuesWithState,
};
use self::fst_map::Values;
pub struct StreamBuilder<'m, 'v, T: 'v, A> {