From 96d2fbcd3d274473d5e51a6ab0346fb837e4bccb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 22 Apr 2018 20:06:56 +0200 Subject: [PATCH] map: Allow using the Levenshtein algorithm to search --- Cargo.lock | 17 +++++++++++++++++ Cargo.toml | 9 +++++---- src/bin/raptor.rs | 18 ++++++++++++++++-- src/lib.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2c1c20b8..43c292aa8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -99,6 +99,15 @@ dependencies = [ "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "fst-levenshtein" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "fuchsia-zircon" version = "0.3.3" @@ -306,6 +315,7 @@ dependencies = [ "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "fst-levenshtein 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", @@ -598,6 +608,11 @@ dependencies = [ "percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "utf8-ranges" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi" version = "0.2.8" @@ -650,6 +665,7 @@ dependencies = [ "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d94485a00b1827b861dd9d1a2cc9764f9044d4c535514c0760a5a2012ef3399f" +"checksum fst-levenshtein 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "64f12af1569dd78afbefe476034bbdce0372d18e9dc75b634bde0e7b8bf994c8" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c" @@ -706,6 +722,7 @@ dependencies = [ "checksum unicode-normalization 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "51ccda9ef9efa3f7ef5d91e8f9b83bbe6955f9bf86aec89d5cce2c874625920f" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum url 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f808aadd8cfec6ef90e4a14eb46f24511824d1ac596b9682703c87056c8678b7" +"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/Cargo.toml b/Cargo.toml index 8cfc26b34..b179172c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,12 +7,13 @@ authors = ["Kerollmops "] bincode = "1.0" env_logger = { version = "0.3", default-features = false } fst = "0.3" +fst-levenshtein = "0.2" futures = "0.1" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0" +smallvec = { version = "0.6", features = ["serde"] } tokio-minihttp = { git = "https://github.com/tokio-rs/tokio-minihttp.git" } tokio-proto = "0.1" tokio-service = "0.1" -serde = "1.0" -serde_json = "1.0" -serde_derive = "1.0" -smallvec = { version = "0.6", features = ["serde"] } url = "1.7" diff --git a/src/bin/raptor.rs b/src/bin/raptor.rs index 38c2b96be..bf6da7357 100644 --- a/src/bin/raptor.rs +++ b/src/bin/raptor.rs @@ -1,4 +1,6 @@ extern crate env_logger; +extern crate fst; +extern crate fst_levenshtein; extern crate futures; extern crate raptor; extern crate tokio_minihttp; @@ -8,6 +10,8 @@ extern crate url; use std::io; +use fst_levenshtein::Levenshtein; +use fst::{IntoStreamer, Streamer}; use futures::future; use tokio_minihttp::{Request, Response, Http}; use tokio_proto::TcpServer; @@ -34,8 +38,18 @@ impl Service for MainService { if let Some((_, key)) = url.query_pairs().find(|&(ref k, _)| k == "q") { let key = key.to_lowercase(); - let values = self.map.get(&key).map(|a| &a[..10]); - resp.body(&format!("{:?}", values)); + + let lev = Levenshtein::new(&key, 2).unwrap(); + + let mut body = String::new(); + + let mut stream = self.map.search(lev).into_stream(); + while let Some((key, values)) = stream.next() { + let values = &values[..values.len().min(10)]; + body.push_str(&format!("{:?} {:?}\n", key, values)); + } + + resp.body(&body); } future::ok(resp) diff --git a/src/lib.rs b/src/lib.rs index 1928a4e70..9d494c80e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ extern crate serde_json; #[macro_use] extern crate serde_derive; extern crate smallvec; +use std::ops::{Deref, DerefMut}; use std::io::Write; use std::fs::File; use std::path::Path; @@ -50,6 +51,45 @@ impl MultiMap { pub fn get>(&self, key: K) -> Option<&[u64]> { self.map.get(key).map(|i| &*self.values[i as usize]) } + + pub fn search(&self, aut: A) -> StreamBuilder { + StreamBuilder { + inner: self.map.search(aut), + values: &self.values, + } + } +} + +pub struct StreamBuilder<'a, A: fst::Automaton> { + inner: fst::map::StreamBuilder<'a, A>, + values: &'a [SmallVec32], +} + +impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> { + type Target = fst::map::StreamBuilder<'a, A>; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl<'a, A: fst::Automaton> DerefMut for StreamBuilder<'a, A> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> { + type Item = (&'a str, &'a [u64]); + + type Into = Stream<'a, A>; + + fn into_stream(self) -> Self::Into { + Stream { + inner: self.inner.into_stream(), + values: self.values, + } + } } pub struct Stream<'a, A: fst::Automaton = fst::automaton::AlwaysMatch> {