diff --git a/Cargo.lock b/Cargo.lock index 8d9370fe8..576db9e62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,15 +6,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10" -[[package]] -name = "aho-corasick" -version = "0.7.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" -dependencies = [ - "memchr", -] - [[package]] name = "anyhow" version = "1.0.31" @@ -997,7 +988,6 @@ dependencies = [ "once_cell", "oxidized-mtbl", "rayon", - "regex", "roaring", "serde", "slice-group-by", @@ -1601,10 +1591,7 @@ version = "1.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" dependencies = [ - "aho-corasick", - "memchr", "regex-syntax", - "thread_local 1.0.1", ] [[package]] @@ -1822,7 +1809,7 @@ dependencies = [ "chrono", "log 0.4.8", "termcolor", - "thread_local 0.3.4", + "thread_local", ] [[package]] @@ -1937,15 +1924,6 @@ dependencies = [ "unreachable", ] -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static 1.4.0", -] - [[package]] name = "time" version = "0.1.43" diff --git a/Cargo.toml b/Cargo.toml index e11e31158..d7961a46c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,9 +28,6 @@ smallvec = "1.4.0" structopt = { version = "0.3.14", default-features = false } tempfile = "3.1.0" -# to highlight the documents -regex = "1.3.9" - # logging log = "0.4.8" stderrlog = "0.4.3" diff --git a/src/bin/serve.rs b/src/bin/serve.rs index 21470f84e..1d0dc6277 100644 --- a/src/bin/serve.rs +++ b/src/bin/serve.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashSet; use std::fs::File; use 
std::net::SocketAddr; use std::path::PathBuf; @@ -7,8 +8,8 @@ use std::time::Instant; use askama_warp::Template; use heed::EnvOpenOptions; -use regex::Regex; use serde::Deserialize; +use slice_group_by::StrGroupBy; use structopt::StructOpt; use warp::{Filter, http::Response}; @@ -44,6 +45,18 @@ struct Opt { http_listen_addr: String, } +fn highlight_string(string: &str, words: &HashSet<String>) -> String { + let mut output = String::new(); + for token in string.linear_group_by_key(|c| c.is_alphanumeric()) { + let lowercase_token = token.to_lowercase(); + let to_highlight = words.contains(&lowercase_token); + if to_highlight { output.push_str("") } + output.push_str(token); + if to_highlight { output.push_str("") } + } + output +} + #[derive(Template)] #[template(path = "index.html")] struct IndexTemplate { @@ -173,15 +186,6 @@ async fn main() -> anyhow::Result<()> { // We write the headers body.extend_from_slice(headers); - let mut regex = format!(r"(?i)\b("); - let number_of_words = words.len(); - words.into_iter().enumerate().for_each(|(i, w)| { - regex.push_str(&w); - if i != number_of_words - 1 { regex.push('|') } - }); - regex.push_str(r")\b"); - let re = Regex::new(&regex).unwrap(); - for id in documents_ids { let content = index.documents.get(&rtxn, &BEU32::new(id)).unwrap(); let content = content.expect(&format!("could not find document {}", id)); @@ -190,7 +194,7 @@ async fn main() -> anyhow::Result<()> { let content = if disable_highlighting { Cow::from(content) } else { - re.replace_all(content, "$1") + Cow::from(highlight_string(content, &words)) }; body.extend_from_slice(content.as_bytes());