From 26dcfe1e541f487efe6321038c1658bcfe1aaaaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 21 Oct 2018 16:42:19 +0200 Subject: [PATCH] fix: Remove stop-words from the serve examples --- examples/serve-console.rs | 18 +++++------------- examples/serve-http.rs | 18 ++++-------------- src/automaton.rs | 33 +++++++++++++++++++++++++++++---- src/rank/criterion/mod.rs | 2 ++ src/rank/mod.rs | 5 +++++ src/rank/ranked_stream.rs | 21 +++++++++++---------- 6 files changed, 56 insertions(+), 41 deletions(-) diff --git a/examples/serve-console.rs b/examples/serve-console.rs index 408a4b3b5..5196bc8b0 100644 --- a/examples/serve-console.rs +++ b/examples/serve-console.rs @@ -10,25 +10,18 @@ use pentium::{automaton, DocumentId, Metadata}; #[derive(Debug, StructOpt)] pub struct CommandConsole { - /// The stop word file, each word must be separated by a newline. - #[structopt(long = "stop-words", parse(from_os_str))] - pub stop_words: PathBuf, - /// Meta file name (e.g. relaxed-colden). #[structopt(parse(from_os_str))] pub meta_name: PathBuf, } pub struct ConsoleSearch { - common_words: CommonWords, metadata: Metadata, db: DB, } impl ConsoleSearch { pub fn from_command(command: CommandConsole) -> io::Result { - let common_words = CommonWords::from_file(command.stop_words)?; - let map_file = command.meta_name.with_extension("map"); let idx_file = command.meta_name.with_extension("idx"); let sst_file = command.meta_name.with_extension("sst"); @@ -42,7 +35,7 @@ impl ConsoleSearch { drop(db); let db = DB::open_for_read_only(DBOptions::default(), rocksdb, false).unwrap(); - Ok(ConsoleSearch { common_words, metadata, db }) + Ok(ConsoleSearch { metadata, db }) } pub fn serve(self) { @@ -52,20 +45,19 @@ impl ConsoleSearch { let mut query = String::new(); io::stdin().read_line(&mut query).unwrap(); - let query = query.trim().to_lowercase(); if query.is_empty() { break } - let (elapsed, _) = measure_time(|| search(&self.metadata, &self.db, &self.common_words, &query)); + let (elapsed, _) = measure_time(|| search(&self.metadata, &self.db, &query)); println!("Finished in {}", elapsed); } } } -fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query: &str) { +fn search(metadata: &Metadata, database: &DB, query: &str) { let mut automatons = Vec::new(); - for query in query.split_whitespace().filter(|q| !common_words.contains(*q)) { - let lev = automaton::build(query); + for query in query.split_whitespace().map(str::to_lowercase) { + let lev = automaton::build_prefix_dfa(&query); automatons.push(lev); } diff --git a/examples/serve-http.rs b/examples/serve-http.rs index e37d094d1..3ef5244da 100644 --- a/examples/serve-http.rs +++ b/examples/serve-http.rs @@ -19,10 +19,6 @@ pub struct CommandHttp { #[structopt(short = "l", default_value = "127.0.0.1:3030")] pub listen_addr: SocketAddr, - /// The stop word file, each word must be separated by a newline. - #[structopt(long = "stop-words", parse(from_os_str))] - pub stop_words: PathBuf, - /// Meta file name (e.g. relaxed-colden). #[structopt(parse(from_os_str))] pub meta_name: PathBuf, @@ -41,15 +37,12 @@ struct SearchQuery { q: String } pub struct HttpServer { listen_addr: SocketAddr, - common_words: Arc, metadata: Arc, db: Arc, } impl HttpServer { pub fn from_command(command: CommandHttp) -> io::Result { - let common_words = CommonWords::from_file(command.stop_words)?; - let map_file = command.meta_name.with_extension("map"); let idx_file = command.meta_name.with_extension("idx"); let sst_file = command.meta_name.with_extension("sst"); @@ -64,19 +57,18 @@ impl HttpServer { Ok(HttpServer { listen_addr: command.listen_addr, - common_words: Arc::new(common_words), metadata: Arc::new(metadata), db: Arc::new(db), }) } pub fn serve(self) { - let HttpServer { listen_addr, common_words, metadata, db } = self; + let HttpServer { listen_addr, metadata, db } = self; let routes = warp::path("search") .and(warp::query()) .map(move |query: SearchQuery| { - let body = search(metadata.clone(), db.clone(), common_words.clone(), &query.q).unwrap(); + let body = search(metadata.clone(), db.clone(), &query.q).unwrap(); body }) .with(warp::reply::with::header("Content-Type", "application/json")) @@ -86,15 +78,13 @@ impl HttpServer { } } -fn search(metadata: M, database: D, common_words: C, query: &str) -> Result> +fn search(metadata: M, database: D, query: &str) -> Result> where M: AsRef, D: AsRef, - C: AsRef, { let mut automatons = Vec::new(); for query in query.split_whitespace().map(str::to_lowercase) { - if common_words.as_ref().contains(&query) { continue } - let lev = automaton::build(&query); + let lev = automaton::build_prefix_dfa(&query); automatons.push(lev); } diff --git a/src/automaton.rs b/src/automaton.rs index b3815d301..90ceb407a 100644 --- a/src/automaton.rs +++ b/src/automaton.rs @@ -1,4 +1,5 @@ use std::ops::Deref; + use fst::Automaton; use levenshtein_automata::{ LevenshteinAutomatonBuilder as LevBuilder, @@ -50,16 +51,40 @@ impl AutomatonExt for DfaExt { } } -pub fn build(query: &str) -> DfaExt { +enum PrefixSetting { + Prefix, + NoPrefix, +} + +fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt { + use self::PrefixSetting::{Prefix, NoPrefix}; + let dfa = match query.len() { - 0 ..= 4 => LEVDIST0.build_prefix_dfa(query), - 5 ..= 8 => LEVDIST1.build_prefix_dfa(query), - _ => LEVDIST2.build_prefix_dfa(query), + 0 ..= 4 => match setting { + Prefix => LEVDIST0.build_prefix_dfa(query), + NoPrefix => LEVDIST0.build_dfa(query), + }, + 5 ..= 8 => match setting { + Prefix => LEVDIST1.build_prefix_dfa(query), + NoPrefix => LEVDIST1.build_dfa(query), + }, + _ => match setting { + Prefix => LEVDIST2.build_prefix_dfa(query), + NoPrefix => LEVDIST2.build_dfa(query), + }, }; DfaExt { query_len: query.len(), automaton: dfa } } +pub fn build_prefix_dfa(query: &str) -> DfaExt { + build_dfa_with_setting(query, PrefixSetting::Prefix) +} + +pub fn build_dfa(query: &str) -> DfaExt { + build_dfa_with_setting(query, PrefixSetting::NoPrefix) +} + pub trait AutomatonExt: Automaton { fn eval>(&self, s: B) -> Distance; fn query_len(&self) -> usize; diff --git a/src/rank/criterion/mod.rs b/src/rank/criterion/mod.rs index 79143cd8d..bf54e9863 100644 --- a/src/rank/criterion/mod.rs +++ b/src/rank/criterion/mod.rs @@ -57,6 +57,8 @@ impl Criterion for DocumentId { } } +// TODO there is too much Box here, can we use +// static references or static closures pub fn default() -> Vec> { vec![ Box::new(SumOfTypos), diff --git a/src/rank/mod.rs b/src/rank/mod.rs index 413c7566e..39edb8976 100644 --- a/src/rank/mod.rs +++ b/src/rank/mod.rs @@ -21,6 +21,11 @@ impl Document { unsafe { Self::from_sorted_matches(doc, vec![match_]) } } + pub fn from_matches(doc: DocumentId, mut matches: Vec) -> Self { + matches.sort_unstable(); + unsafe { Self::from_sorted_matches(doc, matches) } + } + pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec) -> Self { Self { id, matches } } diff --git a/src/rank/ranked_stream.rs b/src/rank/ranked_stream.rs index e3b28f25e..c0e39f36f 100644 --- a/src/rank/ranked_stream.rs +++ b/src/rank/ranked_stream.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::hash::Hash; use std::ops::Range; use std::rc::Rc; -use std::{mem, vec, cmp}; +use std::{mem, vec}; use fnv::FnvHashMap; use fst::Streamer; @@ -11,10 +11,17 @@ use group_by::GroupByMut; use crate::automaton::{DfaExt, AutomatonExt}; use crate::metadata::Metadata; use crate::metadata::ops::OpBuilder; -use crate::rank::criterion::{self, Criterion}; +use crate::rank::criterion::Criterion; use crate::rank::Document; use crate::{Match, DocumentId}; +fn clamp_range(range: Range, big: Range) -> Range { + Range { + start: range.start.min(big.end).max(big.start), + end: range.end.min(big.end).max(big.start), + } +} + pub struct Config<'m, C, F> { pub metadata: &'m Metadata, pub automatons: Vec, @@ -67,10 +74,7 @@ impl<'m, C, F> RankedStream<'m, C, F> { } } - matches.into_iter().map(|(id, mut matches)| { - matches.sort_unstable(); - unsafe { Document::from_sorted_matches(id, matches) } - }).collect() + matches.into_iter().map(|(id, matches)| Document::from_matches(id, matches)).collect() } } @@ -92,10 +96,7 @@ where C: Criterion } } - let range = Range { - start: cmp::min(range.start, documents.len()), - end: cmp::min(range.end, documents.len()), - }; + let range = clamp_range(range, 0..documents.len()); documents[range].to_vec() }