From 4f7f7538f7b9a6d0b619f251d2f2af4ed926a641 Mon Sep 17 00:00:00 2001
From: mpostma
Date: Wed, 23 Dec 2020 20:04:19 +0100
Subject: [PATCH] highlight with new tokenizer

---
 Cargo.lock                          |  2 +-
 http-ui/Cargo.lock                  |  4 +-
 http-ui/Cargo.toml                  |  1 +
 http-ui/src/main.rs                 | 60 ++++++++++++++++++-----------
 src/query_tokens.rs                 |  4 +-
 src/update/index_documents/store.rs |  8 ++--
 6 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 843bba4e4..4a46d30a8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -711,7 +711,7 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
+source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#147b6154b1b34cb8f5da2df6a416b7da191bc850"
 dependencies = [
  "character_converter",
  "cow-utils",
diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock
index cc2e2f852..b00e234ab 100644
--- a/http-ui/Cargo.lock
+++ b/http-ui/Cargo.lock
@@ -803,10 +803,12 @@ dependencies = [
  "byte-unit",
  "bytes",
  "flate2",
+ "fst",
  "futures",
  "grenad",
  "heed",
  "log",
+ "meilisearch-tokenizer",
  "memmap",
  "milli",
  "once_cell",
@@ -1039,7 +1041,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#8d91cd52f30aa4b651a085c15056938f7b599646"
+source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
 dependencies = [
  "character_converter",
  "cow-utils",
diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index ba094c79e..5fcdf9caf 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -32,3 +32,4 @@ warp = "0.2.2"
 # logging
 log = "0.4.11"
 stderrlog = "0.5.0"
+fst = "0.4.5"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 51e6e9f85..f6a975e7c 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -27,8 +27,9 @@ use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
 use warp::filters::ws::Message;
 use warp::{Filter, http::Response};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+use fst::Set;
 
-use milli::tokenizer::{simple_tokenizer, TokenType};
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
 use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
@@ -121,49 +122,61 @@ pub struct IndexerOpt {
     pub indexing_jobs: Option<usize>,
 }
 
-fn highlight_record(
-    object: &mut Map<String, Value>,
-    words_to_highlight: &HashSet<String>,
-    attributes_to_highlight: &HashSet<String>,
-) {
-    // TODO do we need to create a string for element that are not and needs to be highlight?
-    fn highlight_value(value: Value, words_to_highlight: &HashSet<String>) -> Value {
+struct Highlighter<'a, A> {
+    analyzer: Analyzer<'a, A>,
+}
+
+impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
+    fn new(stop_words: &'a fst::Set<A>) -> Self {
+        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        Self { analyzer }
+    }
+
+    fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value {
         match value {
             Value::Null => Value::Null,
             Value::Bool(boolean) => Value::Bool(boolean),
             Value::Number(number) => Value::Number(number),
             Value::String(old_string) => {
                 let mut string = String::new();
-                for (token_type, token) in simple_tokenizer(&old_string) {
-                    if token_type == TokenType::Word {
-                        let lowercase_token = token.to_lowercase();
-                        let to_highlight = words_to_highlight.contains(&lowercase_token);
+                let analyzed = self.analyzer.analyze(&old_string);
+                for (word, token) in analyzed.reconstruct() {
+                    if token.is_word() {
+                        let to_highlight = words_to_highlight.contains(token.text());
                         if to_highlight { string.push_str("<mark>") }
-                        string.push_str(token);
+                        string.push_str(word);
                         if to_highlight { string.push_str("</mark>") }
                     } else {
-                        string.push_str(token);
+                        string.push_str(word);
                     }
                 }
                 Value::String(string)
             },
             Value::Array(values) => {
                 Value::Array(values.into_iter()
-                    .map(|v| highlight_value(v, words_to_highlight))
+                    .map(|v| self.highlight_value(v, words_to_highlight))
                     .collect())
             },
             Value::Object(object) => {
                 Value::Object(object.into_iter()
-                    .map(|(k, v)| (k, highlight_value(v, words_to_highlight)))
+                    .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight)))
                     .collect())
             },
         }
     }
 
-    for (key, value) in object.iter_mut() {
-        if attributes_to_highlight.contains(key) {
-            let old_value = mem::take(value);
-            *value = highlight_value(old_value, words_to_highlight);
+    fn highlight_record(
+        &self,
+        object: &mut Map<String, Value>,
+        words_to_highlight: &HashSet<String>,
+        attributes_to_highlight: &HashSet<String>,
+    ) {
+        // TODO do we need to create a string for element that are not and needs to be highlight?
+        for (key, value) in object.iter_mut() {
+            if attributes_to_highlight.contains(key) {
+                let old_value = mem::take(value);
+                *value = self.highlight_value(old_value, words_to_highlight);
+            }
         }
     }
 }
@@ -651,10 +664,13 @@ async fn main() -> anyhow::Result<()> {
             None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(),
         };
 
+        let stop_words = fst::Set::default();
+        let highlighter = Highlighter::new(&stop_words);
+
         for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
             let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
             if !disable_highlighting {
-                highlight_record(&mut object, &found_words, &attributes_to_highlight);
+                highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight);
             }
 
             documents.push(object);
@@ -716,7 +732,7 @@ async fn main() -> anyhow::Result<()> {
         }
 
         let file = file.into_std().await;
-        let mmap = unsafe { memmap::Mmap::map(&file).unwrap() };
+        let mmap = unsafe { memmap::Mmap::map(&file).expect("can't map file") };
 
         let method = match update_method.as_deref() {
             Some("replace") => String::from("replace"),
diff --git a/src/query_tokens.rs b/src/query_tokens.rs
index ee15b15ea..258c90765 100644
--- a/src/query_tokens.rs
+++ b/src/query_tokens.rs
@@ -54,8 +54,8 @@ mod tests {
             match $test {
                 Quoted(val) => assert_eq!(val.text(), $val),
                 Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
-            }
-        };
+            }
+        };
 
         ($test:expr, Free($val:literal)) => {
             match $test {
diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs
index 2b57d3b8d..96d1098f9 100644
--- a/src/update/index_documents/store.rs
+++ b/src/update/index_documents/store.rs
@@ -8,21 +8,21 @@
 use std::{cmp, iter};
 
 use anyhow::{bail, Context};
 use bstr::ByteSlice as _;
+use fst::Set;
 use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
 use heed::BytesEncode;
 use linked_hash_map::LinkedHashMap;
 use log::{debug, info};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::Value;
 use tempfile::tempfile;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, TokenKind};
-use fst::Set;
 
 use crate::facet::FacetType;
-use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
 use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
+use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::update::UpdateIndexingStep;
 use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
@@ -167,7 +167,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             // MTBL writers
             docid_word_positions_writer,
             documents_writer,
-            //tokenizer
+            // tokenizer
             analyzer,
         })
     }
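
Usage note (illustration, not part of the patch): the new `Highlighter`
replaces the old free `highlight_record` function and runs every string
through the same meilisearch-tokenizer `Analyzer` that indexing uses. A
minimal sketch of how it is driven, assuming the `Highlighter` from the
http-ui/src/main.rs hunk above is in scope; the document and query words
are invented for illustration, and only calls the patch itself makes
(`fst::Set::default`, `Highlighter::new`, `highlight_record`) are used:

    use std::collections::HashSet;

    use serde_json::{json, Map, Value};

    fn main() {
        // An empty stop-word set, exactly as the patched `main` builds one.
        let stop_words = fst::Set::default();
        let highlighter = Highlighter::new(&stop_words);

        // A made-up record, shaped like the objects `obkv_to_json` returns.
        let mut object: Map<String, Value> = match json!({
            "id": 42,
            "title": "The quick brown fox",
        }) {
            Value::Object(map) => map,
            _ => unreachable!(),
        };

        // Words matched by the search, and the fields allowed to be decorated.
        let words_to_highlight: HashSet<String> =
            vec!["quick".to_string()].into_iter().collect();
        let attributes_to_highlight: HashSet<String> =
            vec!["title".to_string()].into_iter().collect();

        highlighter.highlight_record(&mut object, &words_to_highlight, &attributes_to_highlight);

        // "title" is now "The <mark>quick</mark> brown fox"; "id" is left
        // alone because it is not in `attributes_to_highlight`.
        println!("{}", object["title"]);
    }

Design-wise, the `reconstruct()` loop splits the work: matching compares
`token.text()`, the analyzer's normalized form of each token (presumably
why the manual `to_lowercase` call disappears), while the original `word`
slice is what gets written back out, so the `<mark>` tags wrap the user's
original spelling.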