From 31e04f01204f1331c54438ff97f62ee0dd5b2321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 9 Sep 2018 11:13:58 +0200 Subject: [PATCH] feat: Simplify the levenshtein construction --- Cargo.lock | 34 ++++++++---- raptor-indexer/Cargo.toml | 3 ++ raptor-indexer/src/main.rs | 7 +-- raptor-search/Cargo.toml | 5 +- raptor-search/src/main.rs | 16 ++---- raptor/Cargo.toml | 8 ++- raptor/src/automaton.rs | 50 ++++++++++++++++++ raptor/src/levenshtein.rs | 37 ------------- raptor/src/lib.rs | 20 +++---- raptor/src/metadata.rs | 64 ++++++++++------------- raptor/src/rank/exact.rs | 4 +- raptor/src/rank/mod.rs | 29 +++++----- raptor/src/rank/number_of_words.rs | 4 +- raptor/src/rank/sum_of_typos.rs | 4 +- raptor/src/rank/sum_of_words_attribute.rs | 4 +- raptor/src/rank/sum_of_words_position.rs | 4 +- raptor/src/rank/words_proximity.rs | 4 +- 17 files changed, 156 insertions(+), 141 deletions(-) create mode 100644 raptor/src/automaton.rs delete mode 100644 raptor/src/levenshtein.rs diff --git a/Cargo.lock b/Cargo.lock index 27df80fe1..7e9ff0d16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -72,8 +72,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "fst" -version = "0.3.0" -source = "git+https://github.com/Kerollmops/fst.git?branch=always-match-clone#56eb2221d1534883d4e10887d945a982b780fccd" +version = "0.3.2" +source = "git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref#ca3a1ebb60a6f9123f1284de380c7a5fc05d16bb" dependencies = [ "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -113,12 +113,20 @@ name = "itoa" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "lazy_static" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "levenshtein_automata" version = "0.1.1" -source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#ed1244d1731b0f81e880f0c9daa860970d7752c3" +source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst#01400dfc181425a482cb6cad66f2a61b78b59e14" dependencies = [ - "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)", + "fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)", ] [[package]] @@ -225,9 +233,10 @@ version = "0.1.0" dependencies = [ "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)", "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)", + "fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)", "group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)", - "levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", + "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst)", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", ] @@ -249,7 +258,7 @@ name = "raptor-search" version = "0.1.0" dependencies = [ "elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)", + "fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)", "raptor 0.1.0", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", ] @@ -329,6 +338,11 @@ name = "vcpkg" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "version_check" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi" version = "0.3.5" @@ -371,14 +385,15 @@ dependencies = [ "checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" "checksum elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f4e5af126dafd0741c2ad62d47f68b28602550102e5f0dd45c8a97fc8b49c29" "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" -"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)" = "" +"checksum fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)" = "" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb" "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "" "checksum itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5adb58558dcd1d786b5f0bd15f3226ee23486e24b7b58304b60f64dc68e62606" -"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" +"checksum lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ca488b89a5657b0a2ecd45b95609b3e848cf1755da332a0da46e2b2b1cb371a7" +"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst)" = "" "checksum libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)" = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d" "checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "" "checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "" @@ -400,6 +415,7 @@ dependencies = [ "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc" "checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d" +"checksum version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7716c242968ee87e5542f8021178248f267f295a5c4803beae8b8b7fd9bc6051" "checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/raptor-indexer/Cargo.toml b/raptor-indexer/Cargo.toml index d44f98698..bb8da0325 100644 --- a/raptor-indexer/Cargo.toml +++ b/raptor-indexer/Cargo.toml @@ -1,4 +1,7 @@ +cargo-features = ["edition"] + [package] +edition = "2018" name = "raptor-indexer" version = "0.1.0" authors = ["Kerollmops "] diff --git a/raptor-indexer/src/main.rs b/raptor-indexer/src/main.rs index 76d383758..f99d68383 100644 --- a/raptor-indexer/src/main.rs +++ b/raptor-indexer/src/main.rs @@ -1,12 +1,7 @@ // TODO make the raptor binary expose multiple subcommand // make only one binary -extern crate raptor; -extern crate rocksdb; -extern crate serde_json; #[macro_use] extern crate serde_derive; -extern crate unidecode; -extern crate moby_name_gen; use std::path::Path; use std::collections::{HashSet, BTreeMap}; @@ -129,7 +124,7 @@ fn main() { for (key, value) in fields { sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap(); } - let sst_file_info = sst_file_writer.finish().unwrap(); + let _sst_file_info = sst_file_writer.finish().unwrap(); builder.finish().unwrap(); diff --git a/raptor-search/Cargo.toml b/raptor-search/Cargo.toml index db66a60a1..fb16a0ee2 100644 --- a/raptor-search/Cargo.toml +++ b/raptor-search/Cargo.toml @@ -1,4 +1,7 @@ +cargo-features = ["edition"] + [package] +edition = "2018" name = "raptor-search" version = "0.1.0" authors = ["Kerollmops "] @@ -9,7 +12,7 @@ elapsed = "0.1" [dependencies.fst] git = "https://github.com/Kerollmops/fst.git" -branch = "always-match-clone" +branch = "automaton-for-deref" [dependencies.rocksdb] git = "https://github.com/pingcap/rust-rocksdb.git" diff --git a/raptor-search/src/main.rs b/raptor-search/src/main.rs index cebe70b8e..1262d1c62 100644 --- a/raptor-search/src/main.rs +++ b/raptor-search/src/main.rs @@ -1,20 +1,15 @@ -extern crate rocksdb; -extern crate fst; -extern crate raptor; -extern crate elapsed; - use std::env; use std::str::from_utf8_unchecked; use std::io::{self, Write}; use elapsed::measure_time; use fst::Streamer; use rocksdb::{DB, DBOptions, IngestExternalFileOptions}; -use raptor::{Metadata, RankedStream, LevBuilder}; +use raptor::{automaton, Metadata, RankedStream}; -fn search(metadata: &Metadata, database: &DB, lev_builder: &LevBuilder, query: &str) { +fn search(metadata: &Metadata, database: &DB, query: &str) { let mut automatons = Vec::new(); for query in query.split_whitespace() { - let lev = lev_builder.get_automaton(query); + let lev = automaton::build(query); automatons.push(lev); } @@ -55,9 +50,6 @@ fn main() { }); println!("{} to load the SST file in RocksDB and reopen it for read-only", elapsed); - let (elapsed, lev_builder) = measure_time(|| LevBuilder::new()); - println!("{} to load the levenshtein automaton", elapsed); - loop { print!("Searching for: "); io::stdout().flush().unwrap(); @@ -68,7 +60,7 @@ fn main() { if query.is_empty() { break } - let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query)); + let (elapsed, _) = measure_time(|| search(&meta, &db, &query)); println!("Finished in {}", elapsed); } } diff --git a/raptor/Cargo.toml b/raptor/Cargo.toml index 006bb4a29..c1b83a6d9 100644 --- a/raptor/Cargo.toml +++ b/raptor/Cargo.toml @@ -1,4 +1,7 @@ +cargo-features = ["edition"] + [package] +edition = "2018" name = "raptor" version = "0.1.0" authors = ["Kerollmops "] @@ -6,14 +9,15 @@ authors = ["Kerollmops "] [dependencies] byteorder = "1.2" fnv = "1.0" +lazy_static = "1.1" [dependencies.fst] git = "https://github.com/Kerollmops/fst.git" -branch = "always-match-clone" +branch = "automaton-for-deref" [dependencies.levenshtein_automata] git = "https://github.com/Kerollmops/levenshtein-automata.git" -branch = "custom-fst" +branch = "new-custom-fst" features = ["fst_automaton"] [dependencies.rocksdb] diff --git a/raptor/src/automaton.rs b/raptor/src/automaton.rs new file mode 100644 index 000000000..c91b729e2 --- /dev/null +++ b/raptor/src/automaton.rs @@ -0,0 +1,50 @@ +use std::ops::Deref; +use fst::Automaton; +use levenshtein_automata::{ + LevenshteinAutomatonBuilder as LevBuilder, + DFA, Distance, +}; + +lazy_static! { + static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false); + static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false); + static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false); +} + +pub struct DfaExt { + query_len: usize, + automaton: DFA, +} + +impl Deref for DfaExt { + type Target = DFA; + + fn deref(&self) -> &Self::Target { + &self.automaton + } +} + +pub fn build(query: &str) -> DfaExt { + let dfa = match query.len() { + 0 ..= 4 => LEVDIST0.build_prefix_dfa(query), + 5 ..= 8 => LEVDIST1.build_prefix_dfa(query), + _ => LEVDIST2.build_prefix_dfa(query), + }; + + DfaExt { query_len: query.len(), automaton: dfa } +} + +pub trait AutomatonExt: Automaton { + fn eval>(&self, s: B) -> Distance; + fn query_len(&self) -> usize; +} + +impl AutomatonExt for DfaExt { + fn eval>(&self, s: B) -> Distance { + self.automaton.eval(s) + } + + fn query_len(&self) -> usize { + self.query_len + } +} diff --git a/raptor/src/levenshtein.rs b/raptor/src/levenshtein.rs deleted file mode 100644 index bf6ddbf03..000000000 --- a/raptor/src/levenshtein.rs +++ /dev/null @@ -1,37 +0,0 @@ -use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA}; - -pub struct LevBuilder { - automatons: [LevenshteinAutomatonBuilder; 3], -} - -impl LevBuilder { - pub fn new() -> Self { - Self { - automatons: [ - LevenshteinAutomatonBuilder::new(0, false), - LevenshteinAutomatonBuilder::new(1, false), - LevenshteinAutomatonBuilder::new(2, false), - ], - } - } - - pub fn get_automaton(&self, query: &str) -> Levenshtein { - assert!(!query.is_empty()); - - let dfa = if query.len() <= 4 { - self.automatons[0].build_prefix_dfa(query) - } else if query.len() <= 8 { - self.automatons[1].build_prefix_dfa(query) - } else { - self.automatons[2].build_prefix_dfa(query) - }; - - Levenshtein { dfa, query_len: query.len() } - } -} - -#[derive(Clone)] -pub struct Levenshtein { - pub dfa: DFA, - pub query_len: usize, -} diff --git a/raptor/src/lib.rs b/raptor/src/lib.rs index 7e6de58e7..269495118 100644 --- a/raptor/src/lib.rs +++ b/raptor/src/lib.rs @@ -1,24 +1,16 @@ -#![feature(nll)] - -extern crate fst; -extern crate fnv; -extern crate group_by; -extern crate levenshtein_automata; -extern crate byteorder; -extern crate rocksdb; +#[macro_use] extern crate lazy_static; pub mod rank; pub mod metadata; -pub mod levenshtein; +pub mod automaton; pub use self::metadata::{ Metadata, MetadataBuilder, - StreamWithState, StreamWithStateBuilder, - UnionWithState, OpWithStateBuilder, - IndexedValuesWithState, + Stream, StreamBuilder, + Union, OpBuilder, + IndexedValues, }; -pub use self::rank::{RankedStream}; -pub use self::levenshtein::LevBuilder; +pub use self::rank::RankedStream; pub type DocumentId = u64; diff --git a/raptor/src/metadata.rs b/raptor/src/metadata.rs index 3958e6647..9be54e268 100644 --- a/raptor/src/metadata.rs +++ b/raptor/src/metadata.rs @@ -9,7 +9,7 @@ use std::mem; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use fst::{self, Map, MapBuilder, Automaton}; use fst::raw::MmapReadOnly; -use DocIndex; +use crate::DocIndex; #[repr(C)] struct Range { @@ -256,23 +256,23 @@ unsafe fn into_u8_slice(slice: &[T]) -> &[u8] { from_raw_parts(ptr, len) } -pub struct OpWithStateBuilder<'m, 'v, U> { - inner: fst::map::OpWithStateBuilder<'m, U>, +pub struct OpBuilder<'m, 'v> { + inner: fst::map::OpBuilder<'m>, indexes: &'v DocIndexes, } -impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> { +impl<'m, 'v> OpBuilder<'m, 'v> { pub fn new(indexes: &'v DocIndexes) -> Self { Self { - inner: fst::map::OpWithStateBuilder::new(), + inner: fst::map::OpBuilder::new(), indexes: indexes, } } pub fn add(mut self, streamable: I) -> Self where - I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>, - S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>, + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, { self.push(streamable); self @@ -280,14 +280,14 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> { pub fn push(&mut self, streamable: I) where - I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>, - S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>, + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, { self.inner.push(streamable); } - pub fn union(self) -> UnionWithState<'m, 'v, U> { - UnionWithState { + pub fn union(self) -> Union<'m, 'v> { + Union { inner: self.inner.union(), outs: Vec::new(), indexes: self.indexes, @@ -296,23 +296,19 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> { } #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] -pub struct IndexedValuesWithState<'a, U> { +pub struct IndexedValues<'a> { pub index: usize, pub values: &'a [DocIndex], - pub state: U, } -pub struct UnionWithState<'m, 'v, U> { - inner: fst::map::UnionWithState<'m, U>, - outs: Vec>, +pub struct Union<'m, 'v> { + inner: fst::map::Union<'m>, + outs: Vec>, indexes: &'v DocIndexes, } -impl<'a, 'm, 'v, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, U> -where - U: Clone, -{ - type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, U>]); +impl<'a, 'm, 'v> fst::Streamer<'a> for Union<'m, 'v> { + type Item = (&'a [u8], &'a [IndexedValues<'a>]); fn next(&'a mut self) -> Option { match self.inner.next() { @@ -322,8 +318,7 @@ where for ivalue in ivalues { if let Some(values) = self.indexes.get(ivalue.value) { let index = ivalue.index; - let state = ivalue.state.clone(); - self.outs.push(IndexedValuesWithState { index, values, state }) + self.outs.push(IndexedValues { index, values }) } } Some((s, &self.outs)) @@ -333,44 +328,43 @@ where } } -pub struct StreamWithStateBuilder<'m, 'v, A> { - inner: fst::map::StreamWithStateBuilder<'m, A>, +pub struct StreamBuilder<'m, 'v, A> { + inner: fst::map::StreamBuilder<'m, A>, indexes: &'v DocIndexes, } -impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, A> +impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, A> where A: Automaton, A::State: Clone, { type Item = >::Item; - type Into = StreamWithState<'m, 'v, A>; + type Into = Stream<'m, 'v, A>; fn into_stream(self) -> Self::Into { - StreamWithState { + Stream { inner: self.inner.into_stream(), indexes: self.indexes, } } } -pub struct StreamWithState<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> { - inner: fst::map::StreamWithState<'m, A>, +pub struct Stream<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> { + inner: fst::map::Stream<'m, A>, indexes: &'v DocIndexes, } -impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, A> +impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for Stream<'m, 'v, A> where A: Automaton, - A::State: Clone, { - type Item = (&'a [u8], &'a [DocIndex], A::State); + type Item = (&'a [u8], &'a [DocIndex]); fn next(&'a mut self) -> Option { match self.inner.next() { - Some((key, i, state)) => { + Some((key, i)) => { match self.indexes.get(i) { - Some(values) => Some((key, values, state)), + Some(values) => Some((key, values)), None => None, } }, diff --git a/raptor/src/rank/exact.rs b/raptor/src/rank/exact.rs index 6fec39696..680369a0b 100644 --- a/raptor/src/rank/exact.rs +++ b/raptor/src/rank/exact.rs @@ -1,7 +1,7 @@ use std::cmp::Ordering; -use Match; -use rank::{match_query_index, Document}; use group_by::GroupBy; +use crate::Match; +use crate::rank::{match_query_index, Document}; #[inline] fn contains_exact(matches: &[Match]) -> bool { diff --git a/raptor/src/rank/mod.rs b/raptor/src/rank/mod.rs index e86f88fa4..0c857e6aa 100644 --- a/raptor/src/rank/mod.rs +++ b/raptor/src/rank/mod.rs @@ -6,13 +6,14 @@ mod sum_of_words_position; mod exact; use std::cmp::Ordering; +use std::rc::Rc; use std::{mem, vec}; use fst; use fnv::FnvHashMap; -use levenshtein::Levenshtein; -use metadata::{DocIndexes, OpWithStateBuilder, UnionWithState}; -use {Match, DocumentId}; use group_by::GroupByMut; +use crate::automaton::{DfaExt, AutomatonExt}; +use crate::metadata::{DocIndexes, OpBuilder, Union}; +use crate::{Match, DocumentId}; use self::{ sum_of_typos::sum_of_typos, @@ -85,11 +86,12 @@ fn matches_into_iter(matches: FnvHashMap>, limit: usize) pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>); impl<'m, 'v> RankedStream<'m, 'v> { - pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec, limit: usize) -> Self { - let mut op = OpWithStateBuilder::new(indexes); + pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec, limit: usize) -> Self { + let mut op = OpBuilder::new(indexes); - for automaton in automatons.iter().map(|l| l.dfa.clone()) { - let stream = map.search(automaton).with_state(); + let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect(); + for automaton in automatons.iter().cloned() { + let stream = map.search(automaton); op.push(stream); } @@ -114,8 +116,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> { enum RankedStreamInner<'m, 'v> { Fed { - inner: UnionWithState<'m, 'v, u32>, - automatons: Vec, + inner: Union<'m, 'v>, + automatons: Vec>, limit: usize, matches: FnvHashMap>, }, @@ -136,7 +138,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> { for iv in indexed_values { let automaton = &automatons[iv.index]; - let distance = automaton.dfa.distance(iv.state).to_u8(); + let distance = automaton.eval(string).to_u8(); + let same_length = string.len() == automaton.query_len(); for di in iv.values { let match_ = Match { @@ -144,11 +147,11 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> { distance: distance, attribute: di.attribute, attribute_index: di.attribute_index, - is_exact: distance == 0 && string.len() == automaton.query_len, + is_exact: distance == 0 && same_length, }; matches.entry(di.document) - .or_insert_with(Vec::new) - .push(match_); + .or_insert_with(Vec::new) + .push(match_); } } }, diff --git a/raptor/src/rank/number_of_words.rs b/raptor/src/rank/number_of_words.rs index 02066cf6f..e64b3f5e0 100644 --- a/raptor/src/rank/number_of_words.rs +++ b/raptor/src/rank/number_of_words.rs @@ -1,7 +1,7 @@ use std::cmp::Ordering; -use Match; -use rank::{match_query_index, Document}; use group_by::GroupBy; +use crate::Match; +use crate::rank::{match_query_index, Document}; #[inline] fn number_of_query_words(matches: &[Match]) -> usize { diff --git a/raptor/src/rank/sum_of_typos.rs b/raptor/src/rank/sum_of_typos.rs index f4bc151c8..0b340ddc8 100644 --- a/raptor/src/rank/sum_of_typos.rs +++ b/raptor/src/rank/sum_of_typos.rs @@ -1,7 +1,7 @@ use std::cmp::Ordering; -use Match; -use rank::{match_query_index, Document}; use group_by::GroupBy; +use crate::Match; +use crate::rank::{match_query_index, Document}; #[inline] fn sum_matches_typos(matches: &[Match]) -> u8 { diff --git a/raptor/src/rank/sum_of_words_attribute.rs b/raptor/src/rank/sum_of_words_attribute.rs index eddd8a009..3666df3f2 100644 --- a/raptor/src/rank/sum_of_words_attribute.rs +++ b/raptor/src/rank/sum_of_words_attribute.rs @@ -1,7 +1,7 @@ use std::cmp::Ordering; -use Match; -use rank::{match_query_index, Document}; use group_by::GroupBy; +use crate::Match; +use crate::rank::{match_query_index, Document}; #[inline] fn sum_matches_attributes(matches: &[Match]) -> u8 { diff --git a/raptor/src/rank/sum_of_words_position.rs b/raptor/src/rank/sum_of_words_position.rs index 1aab9eff4..ccf075b8a 100644 --- a/raptor/src/rank/sum_of_words_position.rs +++ b/raptor/src/rank/sum_of_words_position.rs @@ -1,7 +1,7 @@ use std::cmp::Ordering; -use Match; -use rank::{match_query_index, Document}; use group_by::GroupBy; +use crate::Match; +use crate::rank::{match_query_index, Document}; #[inline] fn sum_matches_attribute_index(matches: &[Match]) -> u32 { diff --git a/raptor/src/rank/words_proximity.rs b/raptor/src/rank/words_proximity.rs index 463431911..601897c3d 100644 --- a/raptor/src/rank/words_proximity.rs +++ b/raptor/src/rank/words_proximity.rs @@ -1,7 +1,7 @@ use std::cmp::{self, Ordering}; -use Match; -use rank::{match_query_index, Document}; use group_by::GroupBy; +use crate::Match; +use crate::rank::{match_query_index, Document}; const MAX_DISTANCE: u32 = 8;