From c94bc59d7e70991ace9b4371381b277d2f2c42e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Thu, 5 Nov 2020 13:34:15 +0100
Subject: [PATCH 1/3] Introduce a function to transform an obk into a JSON

---
 src/lib.rs | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
diff --git a/src/lib.rs b/src/lib.rs
index edeee2563..173d96737 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,7 +14,9 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::hash::BuildHasherDefault;
 
+use anyhow::Context;
 use fxhash::{FxHasher32, FxHasher64};
+use serde_json::{Map, Value};
 
 pub use self::criterion::{Criterion, default_criteria};
 pub use self::fields_ids_map::FieldsIdsMap;
@@ -38,3 +40,21 @@ pub type Attribute = u32;
 pub type Position = u32;
 
 type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result<Vec<u8>>;
+
+/// Transform a raw obkv store into a JSON Object.
+pub fn obkv_to_json(
+    displayed_fields: &[u8],
+    fields_ids_map: &FieldsIdsMap,
+    obkv: obkv::KvReader,
+) -> anyhow::Result<Map<String, Value>>
+{
+    displayed_fields.iter()
+        .copied()
+        .flat_map(|id| obkv.get(id).map(|value| (id, value)))
+        .map(|(id, value)| {
+            let name = fields_ids_map.name(id).context("unknown obkv field id")?;
+            let value = serde_json::from_slice(value)?;
+            Ok((name.to_owned(), value))
+        })
+        .collect()
+}

From 640c7d748aa956329134955cd2600a593c95997c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Thu, 5 Nov 2020 13:58:07 +0100
Subject: [PATCH 2/3] Modify the highlight function to support any JSON type

---
 Cargo.lock          |  2 --
 Cargo.toml          |  1 -
 http-ui/Cargo.lock  |  1 -
 http-ui/Cargo.toml  |  1 -
 http-ui/src/main.rs | 81 +++++++++++++++++++++++++++++----------------
 5 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0d16ce9f7..63e30b65c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -447,7 +447,6 @@ checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2"
 dependencies = [
  "autocfg",
  "hashbrown",
- "serde",
 ]
 
 [[package]]
@@ -604,7 +603,6 @@ dependencies = [
  "grenad",
  "heed",
  "human_format",
- "indexmap",
  "itertools",
  "jemallocator",
  "levenshtein_automata",
diff --git a/Cargo.toml b/Cargo.toml
index 70a65623f..441397275 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,6 @@ fxhash = "0.2.1"
 grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
 heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
 human_format = "1.0.3"
-indexmap = { version = "1.6.0", features = ["serde-1"] }
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.3"
diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock
index e3c265377..b6539085f 100644
--- a/http-ui/Cargo.lock
+++ b/http-ui/Cargo.lock
@@ -731,7 +731,6 @@ dependencies = [
  "futures",
  "grenad",
  "heed",
- "indexmap",
  "log",
  "memmap",
  "milli",
diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 184e05a98..7e28e1211 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -9,7 +9,6 @@ edition = "2018"
 anyhow = "1.0.28"
 grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
 heed = "0.10.1"
-indexmap = "1.6.0"
 memmap = "0.7.0"
 milli = { path = ".." }
 once_cell = "1.4.1"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 2a9f9de69..bfd326837 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -14,10 +14,10 @@ use futures::stream;
 use futures::{FutureExt, StreamExt};
 use grenad::CompressionType;
 use heed::EnvOpenOptions;
-use indexmap::IndexMap;
 use once_cell::sync::OnceCell;
 use rayon::ThreadPool;
 use serde::{Serialize, Deserialize, Deserializer};
+use serde_json::{Map, Value};
 use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
@@ -27,7 +27,7 @@ use warp::{Filter, http::Response};
 
 use milli::tokenizer::{simple_tokenizer, TokenType};
 use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
-use milli::{Index, UpdateStore, SearchResult};
+use milli::{obkv_to_json, Index, UpdateStore, SearchResult};
 
 static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
 
@@ -117,19 +117,49 @@ pub struct IndexerOpt {
     pub indexing_jobs: Option<usize>,
 }
 
-fn highlight_record(record: &mut IndexMap<String, String>, words: &HashSet<String>) {
-    for (_key, value) in record.iter_mut() {
-        let old_value = mem::take(value);
-        for (token_type, token) in simple_tokenizer(&old_value) {
-            if token_type == TokenType::Word {
-                let lowercase_token = token.to_lowercase();
-                let to_highlight = words.contains(&lowercase_token);
-                if to_highlight { value.push_str("<mark>") }
-                value.push_str(token);
-                if to_highlight { value.push_str("</mark>") }
-            } else {
-                value.push_str(token);
-            }
+fn highlight_record(
+    object: &mut Map<String, Value>,
+    words_to_highlight: &HashSet<String>,
+    attributes_to_highlight: &HashSet<String>,
+) {
+    // TODO do we need to create a string for element that are not and needs to be highlight?
+    fn highlight_value(value: Value, words_to_highlight: &HashSet<String>) -> Value {
+        match value {
+            Value::Null => Value::Null,
+            Value::Bool(boolean) => Value::Bool(boolean),
+            Value::Number(number) => Value::Number(number),
+            Value::String(old_string) => {
+                let mut string = String::new();
+                for (token_type, token) in simple_tokenizer(&old_string) {
+                    if token_type == TokenType::Word {
+                        let lowercase_token = token.to_lowercase();
+                        let to_highlight = words_to_highlight.contains(&lowercase_token);
+                        if to_highlight { string.push_str("<mark>") }
+                        string.push_str(token);
+                        if to_highlight { string.push_str("</mark>") }
+                    } else {
+                        string.push_str(token);
+                    }
+                }
+                Value::String(string)
+            },
+            Value::Array(values) => {
+                Value::Array(values.into_iter()
+                    .map(|v| highlight_value(v, words_to_highlight))
+                    .collect())
+            },
+            Value::Object(object) => {
+                Value::Object(object.into_iter()
+                    .map(|(k, v)| (k, highlight_value(v, words_to_highlight)))
+                    .collect())
+            },
+        }
+    }
+
+    for (key, value) in object.iter_mut() {
+        if attributes_to_highlight.contains(key) {
+            let old_value = mem::take(value);
+            *value = highlight_value(old_value, words_to_highlight);
         }
     }
 }
@@ -517,23 +547,18 @@ async fn main() -> anyhow::Result<()> {
                 Some(fields) => Cow::Borrowed(fields),
                 None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()),
             };
+            let attributes_to_highlight = match index.searchable_fields(&rtxn).unwrap() {
+                Some(fields) => fields.iter().flat_map(|id| fields_ids_map.name(*id)).map(ToOwned::to_owned).collect(),
+                None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(),
+            };
 
-            for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
-                let mut record = displayed_fields.iter()
-                    .flat_map(|&id| record.get(id).map(|val| (id, val)))
-                    .map(|(key_id, value)| {
-                        let key = fields_ids_map.name(key_id).unwrap().to_owned();
-                        // TODO we must deserialize a Json Value and highlight it.
-                        let value = serde_json::from_slice(value).unwrap();
-                        (key, value)
-                    })
-                    .collect();
-
+            for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
+                let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
                 if !disable_highlighting {
-                    highlight_record(&mut record, &found_words);
+                    highlight_record(&mut object, &found_words, &attributes_to_highlight);
                 }
 
-                documents.push(record);
+                documents.push(object);
             }
 
             Response::builder()

From 4fb138c42eb498ee5ea28808d5fe76287dd61fe1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Fri, 6 Nov 2020 16:15:07 +0100
Subject: [PATCH 3/3] Make sure we index all kind of JSON types

---
 src/lib.rs                          | 83 +++++++++++++++++++++++++++++
 src/update/index_documents/mod.rs   | 37 +++++++++++++
 src/update/index_documents/store.rs | 35 ++++++------
 3 files changed, 135 insertions(+), 20 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 173d96737..bea05e68a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -58,3 +58,86 @@ pub fn obkv_to_json(
         })
         .collect()
 }
+
+/// Transform a JSON value into a string that can be indexed.
+pub fn json_to_string(value: Value) -> Option<String> {
+
+    fn inner(value: Value, output: &mut String) -> bool {
+        use std::fmt::Write;
+        match value {
+            Value::Null => false,
+            Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
+            Value::Number(number) => write!(output, "{}", number).is_ok(),
+            Value::String(string) => write!(output, "{}", string).is_ok(),
+            Value::Array(array) => {
+                let mut count = 0;
+                for value in array {
+                    if inner(value, output) {
+                        output.push_str(". ");
+                        count += 1;
+                    }
+                }
+                // check that at least one value was written
+                count != 0
+            },
+            Value::Object(object) => {
+                let mut buffer = String::new();
+                let mut count = 0;
+                for (key, value) in object {
+                    buffer.clear();
+                    let _ = write!(&mut buffer, "{}: ", key);
+                    if inner(value, &mut buffer) {
+                        buffer.push_str(". ");
+                        // We write the "key: value. " pair only when
+                        // we are sure that the value can be written.
+                        output.push_str(&buffer);
+                        count += 1;
+                    }
+                }
+                // check that at least one value was written
+                count != 0
+            },
+        }
+    }
+
+    let mut string = String::new();
+    if inner(value, &mut string) {
+        Some(string)
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn json_to_string_object() {
+        let value = json!({
+            "name": "John Doe",
+            "age": 43,
+            "not_there": null,
+        });
+
+        let string = json_to_string(value).unwrap();
+        assert_eq!(string, "name: John Doe. age: 43. ");
+    }
+
+    #[test]
+    fn json_to_string_array() {
+        let value = json!([
+            { "name": "John Doe" },
+            43,
+            "hello",
+            [ "I", "am", "fine" ],
+            null,
+        ]);
+
+        let string = json_to_string(value).unwrap();
+        // We don't care about having two point (.) after the other as
+        // the distance of hard separators is clamped to 8 anyway.
+        assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
+    }
+}
diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs
index 82582dac8..5259ec662 100644
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@@ -901,4 +901,41 @@ mod tests {
         assert_eq!(count, 1);
         drop(rtxn);
     }
+
+    #[test]
+    fn complex_json_documents() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with an id for only one of them.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &br#"[
+            { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
+            { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
+            { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
+        ]"#[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Check that there is 1 documents now.
+        let rtxn = index.read_txn().unwrap();
+
+        // Search for a sub object value
+        let result = index.search(&rtxn).query(r#""value2""#).execute().unwrap();
+        assert_eq!(result.documents_ids, vec![0]);
+
+        // Search for a sub array value
+        let result = index.search(&rtxn).query(r#""fine""#).execute().unwrap();
+        assert_eq!(result.documents_ids, vec![1]);
+
+        // Search for a sub array sub object key
+        let result = index.search(&rtxn).query(r#""wow""#).execute().unwrap();
+        assert_eq!(result.documents_ids, vec![2]);
+
+        drop(rtxn);
+    }
 }
diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs
index 43e37d9cd..1503ef5fb 100644
--- a/src/update/index_documents/store.rs
+++ b/src/update/index_documents/store.rs
@@ -1,4 +1,3 @@
-use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap, HashSet};
 use std::convert::{TryFrom, TryInto};
 use std::fs::File;
@@ -17,7 +16,7 @@ use tempfile::tempfile;
 
 use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::tokenizer::{simple_tokenizer, only_token};
-use crate::{SmallVec32, Position, DocumentId};
+use crate::{json_to_string, SmallVec32, Position, DocumentId};
 
 use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
 use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge};
@@ -317,25 +316,21 @@ impl Store {
                 }
 
                 for (attr, content) in document.iter() {
-                    if self.searchable_fields.contains(&attr) {
-                        use serde_json::Value;
-                        let content: Cow<str> = match serde_json::from_slice(content) {
-                            Ok(string) => string,
-                            Err(_) => match serde_json::from_slice(content)? {
-                                Value::Null => continue,
-                                Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
-                                Value::Number(number) => Cow::Owned(number.to_string()),
-                                Value::String(string) => Cow::Owned(string),
-                                Value::Array(_array) => continue,
-                                Value::Object(_object) => continue,
-                            }
-                        };
+                    if !self.searchable_fields.contains(&attr) {
+                        continue;
+                    }
 
-                        for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
-                            let word = token.to_lowercase();
-                            let position = (attr as usize * MAX_POSITION + pos) as u32;
-                            words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
-                        }
+                    let value = serde_json::from_slice(content)?;
+                    let content = match json_to_string(value) {
+                        Some(content) => content,
+                        None => continue,
+                    };
+
+                    let tokens = simple_tokenizer(&content).filter_map(only_token);
+                    for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
+                        let word = token.to_lowercase();
+                        let position = (attr as usize * MAX_POSITION + pos) as u32;
+                        words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
                     }
                 }