diff --git a/Cargo.lock b/Cargo.lock index ab7d9c23e..4be02d973 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,6 +21,11 @@ dependencies = [ "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "group-by" +version = "0.1.0" +source = "git+https://github.com/Kerollmops/group-by.git#7e432aa232834b650ca85ecd46056a43a0094dec" + [[package]] name = "levenshtein_automata" version = "0.1.0" @@ -65,6 +70,7 @@ version = "0.1.0" dependencies = [ "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", + "group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)", "levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", "serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", @@ -123,6 +129,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0" "checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87" "checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "" +"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "" "checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" "checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b" "checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" diff --git a/Cargo.toml b/Cargo.toml index 6afd53fd3..5fd342f88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,3 +16,6 @@ branch = "op-builder-with-state" git = "https://github.com/Kerollmops/levenshtein-automata.git" branch = "custom-fst" features = ["fst_automaton"] + +[dependencies.group-by] +git = "https://github.com/Kerollmops/group-by.git" diff --git a/raptor-bin/Cargo.lock b/raptor-bin/Cargo.lock index d9ed788a9..9cdc1b50b 100644 --- a/raptor-bin/Cargo.lock +++ b/raptor-bin/Cargo.lock @@ -26,6 +26,11 @@ dependencies = [ "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "group-by" +version = "0.1.0" +source = "git+https://github.com/Kerollmops/group-by.git#7e432aa232834b650ca85ecd46056a43a0094dec" + [[package]] name = "itoa" version = "0.4.1" @@ -75,6 +80,7 @@ version = "0.1.0" dependencies = [ "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", + "group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)", "levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", "serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", @@ -154,6 +160,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87" "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" "checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "" +"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "" "checksum itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c069bbec61e1ca5a596166e55dfe4773ff745c3d16b700013bcaff9a6df2c682" "checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" "checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b" diff --git a/raptor-bin/src/main.rs b/raptor-bin/src/main.rs index 63b236a98..306e372af 100644 --- a/raptor-bin/src/main.rs +++ b/raptor-bin/src/main.rs @@ -56,7 +56,7 @@ fn main() { attribute: attr, attribute_index: i as u32, }; - builder.insert(word, doc_index); + builder.insert(word.to_lowercase(), doc_index); } } diff --git a/raptor-http/src/main.rs b/raptor-http/src/main.rs index 74c6d39fc..18b6cfdb3 100644 --- a/raptor-http/src/main.rs +++ b/raptor-http/src/main.rs @@ -16,7 +16,7 @@ use tokio_minihttp::{Request, Response, Http}; use tokio_proto::TcpServer; use tokio_service::Service; -use raptor::{DocIndexMap, OpWithStateBuilder, LevBuilder}; +use raptor::{DocIndexMap, RankedStream, LevBuilder}; struct MainService { map: Arc, @@ -48,30 +48,18 @@ impl Service for MainService { automatons.push(lev); } - let mut op = OpWithStateBuilder::new(self.map.values()); - - for automaton in automatons.iter().cloned() { - let stream = self.map.as_map().search(automaton).with_state(); - op.push(stream); - } - - let mut stream = op.union(); + let mut limit = 20; + let mut stream = RankedStream::new(&self.map, self.map.values(), automatons.clone()); let mut body = String::new(); body.push_str(""); - while let Some((key, ivalues)) = stream.next() { - match std::str::from_utf8(key) { - Ok(key) => { - for ivalue in ivalues { - let i = ivalue.index; - let state = ivalue.state; - let distance = automatons[i].distance(state); - body.push_str(&format!("

{:?} (dist: {:?}) {:?}

", key, distance, ivalue.values)); - } - }, - Err(e) => eprintln!("{:?}", e), - } + while let Some(document_id) = stream.next() { + if limit == 0 { break } + + body.push_str(&format!("

{:?}

", document_id)); + + limit -= 1; } body.push_str(""); diff --git a/raptor-search/.gitignore b/raptor-search/.gitignore new file mode 100644 index 000000000..70e3cae73 --- /dev/null +++ b/raptor-search/.gitignore @@ -0,0 +1,3 @@ + +/target +**/*.rs.bk diff --git a/raptor-search/Cargo.lock b/raptor-search/Cargo.lock new file mode 100644 index 000000000..8a94a05a5 --- /dev/null +++ b/raptor-search/Cargo.lock @@ -0,0 +1,188 @@ +[[package]] +name = "bincode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "byteorder" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cfg-if" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "env_logger" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fst" +version = "0.3.0" +source = "git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state#6e0ab4e4ee5443cc55079996bf9f703086322c33" +dependencies = [ + "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "group-by" +version = "0.1.0" +source = "git+https://github.com/Kerollmops/group-by.git#7e432aa232834b650ca85ecd46056a43a0094dec" + +[[package]] +name = "levenshtein_automata" +version = "0.1.0" +source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#5e8183a7634c4a0182ea7bb398140b2fe9854f77" +dependencies = [ + "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", +] + +[[package]] +name = "libc" +version = "0.2.42" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "log" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "log 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "log" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memmap" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "proc-macro2" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quote" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "raptor" +version = "0.1.0" +dependencies = [ + "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", + "group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)", + "levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", + "serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "raptor-search" +version = "0.1.0" +dependencies = [ + "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", + "raptor 0.1.0", + "serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde" +version = "1.0.66" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde_derive" +version = "1.0.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "syn" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0" +"checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9" +"checksum cfg-if 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "405216fd8fe65f718daa7102ea808a946b6ce40c742998fbfd3463645552de18" +"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" +"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "" +"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "" +"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" +"checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1" +"checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" +"checksum log 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6fddaa003a65722a7fb9e26b0ce95921fe4ba590542ced664d8ce2fa26f9f3ac" +"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" +"checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6" +"checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035" +"checksum serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "e9a2d9a9ac5120e0f768801ca2b58ad6eec929dc9d1d616c162f208869c2ce95" +"checksum serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "0a90213fa7e0f5eac3f7afe2d5ff6b088af515052cc7303bd68c7e3b91a3fb79" +"checksum syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c67da57e61ebc7b7b6fff56bb34440ca3a83db037320b0507af4c10368deda7d" +"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" +"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/raptor-search/Cargo.toml b/raptor-search/Cargo.toml new file mode 100644 index 000000000..7b6e6d68d --- /dev/null +++ b/raptor-search/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "raptor-search" +version = "0.1.0" +authors = ["Kerollmops "] + +[dependencies] +env_logger = { version = "0.3", default-features = false } +raptor = { path = ".." } +serde = "1.0" +serde_derive = "1.0" + +[dependencies.fst] +git = "https://github.com/Kerollmops/fst.git" +branch = "op-builder-with-state" + +[profile.release] +lto = true +debug = true diff --git a/raptor-search/src/main.rs b/raptor-search/src/main.rs new file mode 100644 index 000000000..caa118b8a --- /dev/null +++ b/raptor-search/src/main.rs @@ -0,0 +1,34 @@ +extern crate env_logger; +extern crate fst; +extern crate raptor; + +use std::{fs, env}; +use fst::Streamer; +use raptor::{DocIndexMap, RankedStream, LevBuilder}; + +fn main() { + drop(env_logger::init()); + + let lev_builder = LevBuilder::new(); + let map = { + let fst = fs::read("map.fst").unwrap(); + let values = fs::read("values.vecs").unwrap(); + DocIndexMap::from_bytes(fst, &values).unwrap() + }; + + let query = env::args().nth(1).expect("Please enter query words!"); + let query = query.to_lowercase(); + + println!("Searching for: {:?}", query); + + let mut automatons = Vec::new(); + for query in query.split_whitespace() { + let lev = lev_builder.build_automaton(query); + automatons.push(lev); + } + + let mut stream = RankedStream::new(&map, map.values(), automatons); + while let Some(document_id) = stream.next() { + println!("{:?}", document_id); + } +} diff --git a/src/lib.rs b/src/lib.rs index 0c2a78822..eb1ec0402 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,10 +1,12 @@ #[macro_use] extern crate serde_derive; extern crate bincode; extern crate fst; +extern crate group_by; extern crate levenshtein_automata; extern crate serde; pub mod map; +pub mod rank; mod levenshtein; pub use self::map::{Map, MapBuilder, Values}; @@ -12,12 +14,14 @@ pub use self::map::{ OpBuilder, IndexedValues, OpWithStateBuilder, IndexedValuesWithState, }; - +pub use self::rank::{RankedStream}; pub use self::levenshtein::LevBuilder; pub type DocIndexMap = Map; pub type DocIndexMapBuilder = MapBuilder; +pub type DocumentId = u64; + /// This structure represent the position of a word /// in a document and its attributes. /// @@ -27,7 +31,7 @@ pub type DocIndexMapBuilder = MapBuilder; pub struct DocIndex { /// The document identifier where the word was found. - pub document: u64, + pub document: DocumentId, /// The attribute identifier in the document /// where the word was found. @@ -49,9 +53,16 @@ pub struct DocIndex { /// the way these structures are ordered between themselves. /// /// The word in itself is not important. +// TODO do data oriented programming ? very arrays ? #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] pub struct Match { + /// The word index in the query sentence. + /// Same as the `attribute_index` but for the query words. + /// + /// Used to retrieve the automaton that match this word. + pub query_index: u32, + /// The distance the word has with the query word /// (i.e. the Levenshtein distance). pub distance: u8, @@ -63,12 +74,6 @@ pub struct Match { /// can not have more than `2^8` attributes. pub attribute: u8, - /// The word index in the query sentence. - /// Same as the `attribute_index` but for the query words. - /// - /// Used to retrieve the automaton that match this word. - pub query_index: u32, - /// Where does this word is located in the attribute string /// (i.e. at the start or the end of the attribute). /// @@ -76,3 +81,23 @@ pub struct Match { /// this is because we index only the first 1000 words in an attribute. pub attribute_index: u32, } + +impl Match { + pub fn zero() -> Self { + Match { + query_index: 0, + distance: 0, + attribute: 0, + attribute_index: 0, + } + } + + pub fn max() -> Self { + Match { + query_index: u32::max_value(), + distance: u8::max_value(), + attribute: u8::max_value(), + attribute_index: u32::max_value(), + } + } +} diff --git a/src/map.rs b/src/map.rs index 7f2575b81..9b32918e9 100644 --- a/src/map.rs +++ b/src/map.rs @@ -285,12 +285,14 @@ impl<'a, 'm, 'v, T: 'v + 'a, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, where U: Clone, { + // TODO prefer returning (&[u8], index, value T, state) one by one type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, T, U>]); fn next(&'a mut self) -> Option { match self.inner.next() { Some((s, ivalues)) => { self.outs.clear(); + self.outs.reserve(ivalues.len()); for ivalue in ivalues { let index = ivalue.index; let values = unsafe { self.values.get_unchecked(ivalue.value as usize) }; diff --git a/src/rank.rs b/src/rank.rs new file mode 100644 index 000000000..cc2ab94ae --- /dev/null +++ b/src/rank.rs @@ -0,0 +1,372 @@ +use std::cmp::{self, Ordering}; +use std::{mem, vec}; +use std::collections::{HashSet, HashMap}; +use DocIndexMap; +use fst; +use levenshtein_automata::DFA; +use map::{ + OpWithStateBuilder, UnionWithState, + StreamWithStateBuilder, + Values, +}; +use {Match, DocIndex, DocumentId}; +use group_by::GroupBy; + +const MAX_DISTANCE: usize = 8; + +#[derive(Debug, Eq, Clone)] +pub struct Document { + document_id: DocumentId, + matches: Vec, +} + +impl Document { + pub fn new(doc: DocumentId, match_: Match) -> Self { + Self::from_sorted_matches(doc, vec![match_]) + } + + pub fn from_sorted_matches(doc: DocumentId, matches: Vec) -> Self { + Self { + document_id: doc, + matches: matches, + } + } +} + +impl PartialEq for Document { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl PartialOrd for Document { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Document { + fn cmp(&self, other: &Self) -> Ordering { + let lhs = DocumentScore::new(&self.matches); + let rhs = DocumentScore::new(&other.matches); + lhs.cmp(&rhs) + } +} + +#[derive(Debug, Default, Eq, PartialEq, PartialOrd)] +struct DocumentScore { + typo: usize, + words: usize, + proximity: usize, + attribute: usize, + words_position: usize, +} + +impl Ord for DocumentScore { + fn cmp(&self, other: &Self) -> Ordering { + self.typo.cmp(&other.typo) + .then(self.words.cmp(&other.words).reverse()) + .then(self.proximity.cmp(&other.proximity)) + .then(self.attribute.cmp(&other.attribute)) + .then(self.words_position.cmp(&other.words_position)) + // ~exact~ (see prefix option of the `DFA` builder) + } +} + +fn min_attribute(matches: &[Match]) -> usize { + let mut attribute = usize::max_value(); + for match_ in matches { + if match_.attribute == 0 { return 0 } + attribute = cmp::min(match_.attribute as usize, attribute); + } + attribute +} + +fn min_attribute_index(matches: &[Match]) -> usize { + let mut attribute_index = usize::max_value(); + for match_ in matches { + if match_.attribute_index == 0 { return 0 } + attribute_index = cmp::min(match_.attribute_index as usize, attribute_index); + } + attribute_index +} + +impl DocumentScore { + fn new(matches: &[Match]) -> Self { + let mut score = DocumentScore::default(); + + let mut index = 0; // FIXME could be replaced by the `GroupBy::remaining` method + for group in GroupBy::new(matches, |a, b| a.query_index == b.query_index) { + index += group.len(); + + score.typo = cmp::max(group[0].distance as usize, score.typo); + score.words += 1; + + // FIXME distance is wrong if 2 different attributes matches + if let Some(first_next_group) = (&matches[index..]).first() { + score.proximity += attribute_proximity(first_next_group, &group[0]); + } + + score.attribute += min_attribute(group); + score.words_position += min_attribute_index(group); + } + + score + } +} + +fn proximity(first: usize, second: usize) -> usize { + if first < second { + cmp::min(second - first, MAX_DISTANCE) + } else { + cmp::min(first - second, MAX_DISTANCE) + 1 + } +} + +fn attribute_proximity(lhs: &Match, rhs: &Match) -> usize { + if lhs.attribute != rhs.attribute { + MAX_DISTANCE + } else { + let lhs_attr = lhs.attribute_index as usize; + let rhs_attr = rhs.attribute_index as usize; + proximity(lhs_attr, rhs_attr) + } +} + +pub struct Pool { + returned_documents: HashSet, + documents: Vec, + limitation: Limitation, +} + +#[derive(Debug, Copy, Clone)] +enum Limitation { + /// No limitation is specified. + Unspecified { + query_size: usize, + }, + + /// The limitation is specified but not reached. + Specified { + /// The maximum number of results to return. + limit: usize, + + /// documents with a distance of zero which can be used + /// in the step-by-step sort-and-return. + /// + /// this field must be equal to the limit to reach + /// the limitation + matching_documents: usize, + }, + + /// No more documents with a distance of zero + /// can never be returned now. + Reached { + /// The number of remaining documents to return in order. + remaining: usize, + }, +} + +impl Limitation { + fn reached(&self) -> Option { + match self { + Limitation::Reached { remaining } => Some(*remaining), + _ => None, + } + } + + fn is_reached(&self) -> bool { + self.reached().is_some() + } + + fn query_size(&self) -> usize { + match *self { + Limitation::Unspecified { query_size } => query_size, + _ => 1, + } + } +} + +impl Pool { + pub fn new(query_size: usize) -> Self { + Self { + returned_documents: HashSet::new(), + documents: Vec::new(), + limitation: Limitation::Unspecified { query_size }, + } + } + + pub fn with_output_limit(query_size: usize, limit: usize) -> Self { + assert_eq!(query_size, 1, "limit can only be specified if the query size is 1"); + Self { + returned_documents: HashSet::new(), + documents: Vec::new(), + limitation: Limitation::Specified { + limit: limit, + matching_documents: 0, + }, + } + } + + pub fn extend(&mut self, mut matches: HashMap>) { + for doc in self.documents.iter_mut() { + if let Some(matches) = matches.remove(&doc.document_id) { + doc.matches.extend(matches); + doc.matches.sort_unstable(); + } + } + + matches.retain(|id, _| !self.returned_documents.contains(id)); + self.documents.reserve(matches.len()); + + let mut new_matches = 0; + for (id, mut matches) in matches.into_iter() { + matches.sort_unstable(); + if matches[0].distance == 0 { new_matches += 1 } + + if self.limitation.is_reached() { + match matches.iter().position(|match_| match_.distance > 0) { + Some(pos) if pos == 0 => continue, + Some(pos) => matches.truncate(pos), + None => (), + } + } + + let document = Document::from_sorted_matches(id, matches); + self.documents.push(document); + } + self.documents.sort_unstable(); + + self.limitation = match self.limitation { + Limitation::Specified { limit, matching_documents } if matching_documents + new_matches >= limit => { + // this is the biggest valid match + // used to find the next smallest invalid match + let biggest_valid = Match { query_index: 0, distance: 0, ..Match::max() }; + + // documents which does not have a match with a distance of 0 can be removed. + // note that documents have a query size of 1. + match self.documents.binary_search_by(|d| d.matches[0].cmp(&biggest_valid)) { + Ok(index) => self.documents.truncate(index + 1), // this will never happen :) + Err(index) => self.documents.truncate(index), + } + + Limitation::Reached { remaining: limit } + }, + Limitation::Specified { limit, matching_documents } => { + Limitation::Specified { + limit: limit, + matching_documents: matching_documents + new_matches + } + }, + limitation => limitation, + }; + } +} + +impl IntoIterator for Pool { + type Item = Document; + type IntoIter = vec::IntoIter; + + fn into_iter(mut self) -> Self::IntoIter { + match self.limitation { + Limitation::Unspecified { .. } => self.documents.into_iter(), + Limitation::Specified { limit, .. } => { + self.documents.truncate(limit); + self.documents.into_iter() + }, + Limitation::Reached { remaining } => { + self.documents.truncate(remaining); + self.documents.into_iter() + }, + } + } +} + +pub enum RankedStream<'m, 'v> { + Fed { + inner: UnionWithState<'m, 'v, DocIndex, u32>, + automatons: Vec, + pool: Pool, + }, + Pours { + inner: vec::IntoIter, + }, +} + +impl<'m, 'v> RankedStream<'m, 'v> { + pub fn new(map: &'m DocIndexMap, values: &'v Values, automatons: Vec) -> Self { + let mut op = OpWithStateBuilder::new(values); + + for automaton in automatons.iter().cloned() { + let stream = map.as_map().search(automaton).with_state(); + op.push(stream); + } + + let pool = match automatons.len() { + 1 => Pool::with_output_limit(automatons.len(), 20), + _ => Pool::new(automatons.len()), + }; + + RankedStream::Fed { + inner: op.union(), + automatons: automatons, + pool: pool, + } + } +} + +impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> { + type Item = DocumentId; + + fn next(&'a mut self) -> Option { + loop { + // TODO remove that when NLL are here ! + let mut transfert_pool = None; + + match self { + RankedStream::Fed { inner, automatons, pool } => { + match inner.next() { + Some((_string, indexed_values)) => { + for iv in indexed_values { + + let distance = automatons[iv.index].distance(iv.state).to_u8(); + + // TODO remove the Pool system ! + // this is an internal Pool rule but + // it is more efficient to test that here + if pool.limitation.reached().is_some() && distance != 0 { continue } + + let mut matches = HashMap::with_capacity(iv.values.len() / 2); + for di in iv.values { + let match_ = Match { + query_index: iv.index as u32, + distance: distance, + attribute: di.attribute, + attribute_index: di.attribute_index, + }; + matches.entry(di.document) + .and_modify(|matches: &mut Vec<_>| matches.push(match_)) + .or_insert_with(|| vec![match_]); + } + pool.extend(matches); + } + }, + None => { + transfert_pool = Some(mem::replace(pool, Pool::new(0))); + }, + } + }, + RankedStream::Pours { inner } => { + return inner.next().map(|d| d.document_id) + }, + } + + // transform the `RankedStream` into a `Pours` + if let Some(pool) = transfert_pool { + *self = RankedStream::Pours { + inner: pool.into_iter(), + } + } + } + } +}