From 4f7f7538f7b9a6d0b619f251d2f2af4ed926a641 Mon Sep 17 00:00:00 2001
From: mpostma
Date: Wed, 23 Dec 2020 20:04:19 +0100
Subject: [PATCH] highlight with new tokenizer

---
 Cargo.lock                          |  2 +-
 http-ui/Cargo.lock                  |  4 +-
 http-ui/Cargo.toml                  |  1 +
 http-ui/src/main.rs                 | 60 ++++++++++++++++++-----------
 src/query_tokens.rs                 |  4 +-
 src/update/index_documents/store.rs |  8 ++--
 6 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 843bba4e4..4a46d30a8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -711,7 +711,7 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
+source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#147b6154b1b34cb8f5da2df6a416b7da191bc850"
 dependencies = [
  "character_converter",
  "cow-utils",
diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock
index cc2e2f852..b00e234ab 100644
--- a/http-ui/Cargo.lock
+++ b/http-ui/Cargo.lock
@@ -803,10 +803,12 @@ dependencies = [
  "byte-unit",
  "bytes",
  "flate2",
+ "fst",
  "futures",
  "grenad",
  "heed",
  "log",
+ "meilisearch-tokenizer",
  "memmap",
  "milli",
  "once_cell",
@@ -1039,7 +1041,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#8d91cd52f30aa4b651a085c15056938f7b599646"
+source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
 dependencies = [
  "character_converter",
  "cow-utils",
diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index ba094c79e..5fcdf9caf 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -32,3 +32,4 @@ warp = "0.2.2"
 # logging
 log = "0.4.11"
 stderrlog = "0.5.0"
+fst = "0.4.5"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 51e6e9f85..f6a975e7c 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -27,8 +27,9 @@ use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
 use warp::filters::ws::Message;
 use warp::{Filter, http::Response};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+use fst::Set;
 
-use milli::tokenizer::{simple_tokenizer, TokenType};
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
 use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
@@ -121,49 +122,61 @@ pub struct IndexerOpt {
     pub indexing_jobs: Option<usize>,
 }
 
-fn highlight_record(
-    object: &mut Map<String, Value>,
-    words_to_highlight: &HashSet<String>,
-    attributes_to_highlight: &HashSet<String>,
-) {
-    // TODO do we need to create a string for element that are not and needs to be highlight?
-    fn highlight_value(value: Value, words_to_highlight: &HashSet<String>) -> Value {
+struct Highlighter<'a, A> {
+    analyzer: Analyzer<'a, A>,
+}
+
+impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
+    fn new(stop_words: &'a fst::Set<A>) -> Self {
+        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        Self { analyzer }
+    }
+
+    fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value {
         match value {
             Value::Null => Value::Null,
             Value::Bool(boolean) => Value::Bool(boolean),
             Value::Number(number) => Value::Number(number),
             Value::String(old_string) => {
                 let mut string = String::new();
-                for (token_type, token) in simple_tokenizer(&old_string) {
-                    if token_type == TokenType::Word {
-                        let lowercase_token = token.to_lowercase();
-                        let to_highlight = words_to_highlight.contains(&lowercase_token);
+                let analyzed = self.analyzer.analyze(&old_string);
+                for (word, token) in analyzed.reconstruct() {
+                    if token.is_word() {
+                        let to_highlight = words_to_highlight.contains(token.text());
                         if to_highlight { string.push_str("<mark>") }
-                        string.push_str(token);
+                        string.push_str(word);
                         if to_highlight { string.push_str("</mark>") }
                     } else {
-                        string.push_str(token);
+                        string.push_str(word);
                     }
                 }
                 Value::String(string)
             },
             Value::Array(values) => {
                 Value::Array(values.into_iter()
-                    .map(|v| highlight_value(v, words_to_highlight))
+                    .map(|v| self.highlight_value(v, words_to_highlight))
                     .collect())
             },
             Value::Object(object) => {
                 Value::Object(object.into_iter()
-                    .map(|(k, v)| (k, highlight_value(v, words_to_highlight)))
+                    .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight)))
                     .collect())
             },
         }
     }
 
-    for (key, value) in object.iter_mut() {
-        if attributes_to_highlight.contains(key) {
-            let old_value = mem::take(value);
-            *value = highlight_value(old_value, words_to_highlight);
+    fn highlight_record(
+        &self,
+        object: &mut Map<String, Value>,
+        words_to_highlight: &HashSet<String>,
+        attributes_to_highlight: &HashSet<String>,
+    ) {
+        // TODO do we need to create a string for element that are not and needs to be highlight?
+        for (key, value) in object.iter_mut() {
+            if attributes_to_highlight.contains(key) {
+                let old_value = mem::take(value);
+                *value = self.highlight_value(old_value, words_to_highlight);
+            }
         }
     }
 }
@@ -651,10 +664,13 @@ async fn main() -> anyhow::Result<()> {
             None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(),
         };
 
+        let stop_words = fst::Set::default();
+        let highlighter = Highlighter::new(&stop_words);
+
         for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
             let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
             if !disable_highlighting {
-                highlight_record(&mut object, &found_words, &attributes_to_highlight);
+                highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight);
             }
 
             documents.push(object);
@@ -716,7 +732,7 @@ async fn main() -> anyhow::Result<()> {
         }
 
         let file = file.into_std().await;
-        let mmap = unsafe { memmap::Mmap::map(&file).unwrap() };
+        let mmap = unsafe { memmap::Mmap::map(&file).expect("can't map file") };
 
         let method = match update_method.as_deref() {
             Some("replace") => String::from("replace"),
diff --git a/src/query_tokens.rs b/src/query_tokens.rs
index ee15b15ea..258c90765 100644
--- a/src/query_tokens.rs
+++ b/src/query_tokens.rs
@@ -54,8 +54,8 @@ mod tests {
             match $test {
                 Quoted(val) => assert_eq!(val.text(), $val),
                 Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
-            }
-        };
+            }
+        };
 
         ($test:expr, Free($val:literal)) => {
             match $test {
diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs
index 2b57d3b8d..96d1098f9 100644
--- a/src/update/index_documents/store.rs
+++ b/src/update/index_documents/store.rs
@@ -8,21 +8,21 @@
 use std::{cmp, iter};
 
 use anyhow::{bail, Context};
 use bstr::ByteSlice as _;
+use fst::Set;
 use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
 use heed::BytesEncode;
 use linked_hash_map::LinkedHashMap;
 use log::{debug, info};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::Value;
 use tempfile::tempfile;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, TokenKind};
-use fst::Set;
 
 use crate::facet::FacetType;
-use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
 use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
+use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::update::UpdateIndexingStep;
 use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
@@ -167,7 +167,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             // MTBL writers
             docid_word_positions_writer,
             documents_writer,
-            //tokenizer
+            // tokenizer
             analyzer,
         })
     }
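
Usage note (illustration, not part of the patch): the new `Highlighter`
replaces the old free `highlight_record` function and runs every string
through the same meilisearch-tokenizer `Analyzer` that indexing uses. A
minimal sketch of how it is driven, assuming the `Highlighter` from the
http-ui/src/main.rs hunk above is in scope; the document and query words
are invented for illustration, and only calls the patch itself makes
(`fst::Set::default`, `Highlighter::new`, `highlight_record`) are used:

    use std::collections::HashSet;

    use serde_json::{json, Map, Value};

    fn main() {
        // An empty stop-word set, exactly as the patched `main` builds one.
        let stop_words = fst::Set::default();
        let highlighter = Highlighter::new(&stop_words);

        // A made-up record, shaped like the objects `obkv_to_json` returns.
        let mut object: Map<String, Value> = match json!({
            "id": 42,
            "title": "The quick brown fox",
        }) {
            Value::Object(map) => map,
            _ => unreachable!(),
        };

        // Words matched by the search, and the fields allowed to be decorated.
        let words_to_highlight: HashSet<String> =
            vec!["quick".to_string()].into_iter().collect();
        let attributes_to_highlight: HashSet<String> =
            vec!["title".to_string()].into_iter().collect();

        highlighter.highlight_record(&mut object, &words_to_highlight, &attributes_to_highlight);

        // "title" is now "The <mark>quick</mark> brown fox"; "id" is left
        // alone because it is not in `attributes_to_highlight`.
        println!("{}", object["title"]);
    }

Design-wise, the `reconstruct()` loop splits the work: matching compares
`token.text()`, the analyzer's normalized form of each token (presumably
why the manual `to_lowercase` call disappears), while the original `word`
slice is what gets written back out, so the `<mark>` tags wrap the user's
original spelling.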