From c94bc59d7e70991ace9b4371381b277d2f2c42e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Nov 2020 13:34:15 +0100 Subject: [PATCH 1/3] Introduce a function to transform an obk into a JSON --- src/lib.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index edeee2563..173d96737 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,7 +14,9 @@ use std::borrow::Cow; use std::collections::HashMap; use std::hash::BuildHasherDefault; +use anyhow::Context; use fxhash::{FxHasher32, FxHasher64}; +use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; pub use self::fields_ids_map::FieldsIdsMap; @@ -38,3 +40,21 @@ pub type Attribute = u32; pub type Position = u32; type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result>; + +/// Transform a raw obkv store into a JSON Object. +pub fn obkv_to_json( + displayed_fields: &[u8], + fields_ids_map: &FieldsIdsMap, + obkv: obkv::KvReader, +) -> anyhow::Result> +{ + displayed_fields.iter() + .copied() + .flat_map(|id| obkv.get(id).map(|value| (id, value))) + .map(|(id, value)| { + let name = fields_ids_map.name(id).context("unknown obkv field id")?; + let value = serde_json::from_slice(value)?; + Ok((name.to_owned(), value)) + }) + .collect() +} From 640c7d748aa956329134955cd2600a593c95997c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Nov 2020 13:58:07 +0100 Subject: [PATCH 2/3] Modify the highlight function to support any JSON type --- Cargo.lock | 2 -- Cargo.toml | 1 - http-ui/Cargo.lock | 1 - http-ui/Cargo.toml | 1 - http-ui/src/main.rs | 81 +++++++++++++++++++++++++++++---------------- 5 files changed, 53 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0d16ce9f7..63e30b65c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -447,7 +447,6 @@ checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" dependencies = [ "autocfg", "hashbrown", - "serde", ] [[package]] @@ -604,7 +603,6 @@ dependencies = [ "grenad", "heed", "human_format", - "indexmap", "itertools", "jemallocator", "levenshtein_automata", diff --git a/Cargo.toml b/Cargo.toml index 70a65623f..441397275 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,6 @@ fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" -indexmap = { version = "1.6.0", features = ["serde-1"] } jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.3" diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index e3c265377..b6539085f 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -731,7 +731,6 @@ dependencies = [ "futures", "grenad", "heed", - "indexmap", "log", "memmap", "milli", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 184e05a98..7e28e1211 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -9,7 +9,6 @@ edition = "2018" anyhow = "1.0.28" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } heed = "0.10.1" -indexmap = "1.6.0" memmap = "0.7.0" milli = { path = ".." } once_cell = "1.4.1" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 2a9f9de69..bfd326837 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -14,10 +14,10 @@ use futures::stream; use futures::{FutureExt, StreamExt}; use grenad::CompressionType; use heed::EnvOpenOptions; -use indexmap::IndexMap; use once_cell::sync::OnceCell; use rayon::ThreadPool; use serde::{Serialize, Deserialize, Deserializer}; +use serde_json::{Map, Value}; use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; @@ -27,7 +27,7 @@ use warp::{Filter, http::Response}; use milli::tokenizer::{simple_tokenizer, TokenType}; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; -use milli::{Index, UpdateStore, SearchResult}; +use milli::{obkv_to_json, Index, UpdateStore, SearchResult}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -117,19 +117,49 @@ pub struct IndexerOpt { pub indexing_jobs: Option, } -fn highlight_record(record: &mut IndexMap, words: &HashSet) { - for (_key, value) in record.iter_mut() { - let old_value = mem::take(value); - for (token_type, token) in simple_tokenizer(&old_value) { - if token_type == TokenType::Word { - let lowercase_token = token.to_lowercase(); - let to_highlight = words.contains(&lowercase_token); - if to_highlight { value.push_str("") } - value.push_str(token); - if to_highlight { value.push_str("") } - } else { - value.push_str(token); - } +fn highlight_record( + object: &mut Map, + words_to_highlight: &HashSet, + attributes_to_highlight: &HashSet, +) { + // TODO do we need to create a string for element that are not and needs to be highlight? + fn highlight_value(value: Value, words_to_highlight: &HashSet) -> Value { + match value { + Value::Null => Value::Null, + Value::Bool(boolean) => Value::Bool(boolean), + Value::Number(number) => Value::Number(number), + Value::String(old_string) => { + let mut string = String::new(); + for (token_type, token) in simple_tokenizer(&old_string) { + if token_type == TokenType::Word { + let lowercase_token = token.to_lowercase(); + let to_highlight = words_to_highlight.contains(&lowercase_token); + if to_highlight { string.push_str("") } + string.push_str(token); + if to_highlight { string.push_str("") } + } else { + string.push_str(token); + } + } + Value::String(string) + }, + Value::Array(values) => { + Value::Array(values.into_iter() + .map(|v| highlight_value(v, words_to_highlight)) + .collect()) + }, + Value::Object(object) => { + Value::Object(object.into_iter() + .map(|(k, v)| (k, highlight_value(v, words_to_highlight))) + .collect()) + }, + } + } + + for (key, value) in object.iter_mut() { + if attributes_to_highlight.contains(key) { + let old_value = mem::take(value); + *value = highlight_value(old_value, words_to_highlight); } } } @@ -517,23 +547,18 @@ async fn main() -> anyhow::Result<()> { Some(fields) => Cow::Borrowed(fields), None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()), }; + let attributes_to_highlight = match index.searchable_fields(&rtxn).unwrap() { + Some(fields) => fields.iter().flat_map(|id| fields_ids_map.name(*id)).map(ToOwned::to_owned).collect(), + None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(), + }; - for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() { - let mut record = displayed_fields.iter() - .flat_map(|&id| record.get(id).map(|val| (id, val))) - .map(|(key_id, value)| { - let key = fields_ids_map.name(key_id).unwrap().to_owned(); - // TODO we must deserialize a Json Value and highlight it. - let value = serde_json::from_slice(value).unwrap(); - (key, value) - }) - .collect(); - + for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { + let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { - highlight_record(&mut record, &found_words); + highlight_record(&mut object, &found_words, &attributes_to_highlight); } - documents.push(record); + documents.push(object); } Response::builder() From 4fb138c42eb498ee5ea28808d5fe76287dd61fe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 6 Nov 2020 16:15:07 +0100 Subject: [PATCH 3/3] Make sure we index all kind of JSON types --- src/lib.rs | 83 +++++++++++++++++++++++++++++ src/update/index_documents/mod.rs | 37 +++++++++++++ src/update/index_documents/store.rs | 35 ++++++------ 3 files changed, 135 insertions(+), 20 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 173d96737..bea05e68a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -58,3 +58,86 @@ pub fn obkv_to_json( }) .collect() } + +/// Transform a JSON value into a string that can be indexed. +pub fn json_to_string(value: Value) -> Option { + + fn inner(value: Value, output: &mut String) -> bool { + use std::fmt::Write; + match value { + Value::Null => false, + Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), + Value::Number(number) => write!(output, "{}", number).is_ok(), + Value::String(string) => write!(output, "{}", string).is_ok(), + Value::Array(array) => { + let mut count = 0; + for value in array { + if inner(value, output) { + output.push_str(". "); + count += 1; + } + } + // check that at least one value was written + count != 0 + }, + Value::Object(object) => { + let mut buffer = String::new(); + let mut count = 0; + for (key, value) in object { + buffer.clear(); + let _ = write!(&mut buffer, "{}: ", key); + if inner(value, &mut buffer) { + buffer.push_str(". "); + // We write the "key: value. " pair only when + // we are sure that the value can be written. + output.push_str(&buffer); + count += 1; + } + } + // check that at least one value was written + count != 0 + }, + } + } + + let mut string = String::new(); + if inner(value, &mut string) { + Some(string) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn json_to_string_object() { + let value = json!({ + "name": "John Doe", + "age": 43, + "not_there": null, + }); + + let string = json_to_string(value).unwrap(); + assert_eq!(string, "name: John Doe. age: 43. "); + } + + #[test] + fn json_to_string_array() { + let value = json!([ + { "name": "John Doe" }, + 43, + "hello", + [ "I", "am", "fine" ], + null, + ]); + + let string = json_to_string(value).unwrap(); + // We don't care about having two point (.) after the other as + // the distance of hard separators is clamped to 8 anyway. + assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); + } +} diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 82582dac8..5259ec662 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -901,4 +901,41 @@ mod tests { assert_eq!(count, 1); drop(rtxn); } + + #[test] + fn complex_json_documents() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents with an id for only one of them. + let mut wtxn = index.write_txn().unwrap(); + let content = &br#"[ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that there is 1 documents now. + let rtxn = index.read_txn().unwrap(); + + // Search for a sub object value + let result = index.search(&rtxn).query(r#""value2""#).execute().unwrap(); + assert_eq!(result.documents_ids, vec![0]); + + // Search for a sub array value + let result = index.search(&rtxn).query(r#""fine""#).execute().unwrap(); + assert_eq!(result.documents_ids, vec![1]); + + // Search for a sub array sub object key + let result = index.search(&rtxn).query(r#""wow""#).execute().unwrap(); + assert_eq!(result.documents_ids, vec![2]); + + drop(rtxn); + } } diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 43e37d9cd..1503ef5fb 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::collections::{BTreeMap, HashMap, HashSet}; use std::convert::{TryFrom, TryInto}; use std::fs::File; @@ -17,7 +16,7 @@ use tempfile::tempfile; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::tokenizer::{simple_tokenizer, only_token}; -use crate::{SmallVec32, Position, DocumentId}; +use crate::{json_to_string, SmallVec32, Position, DocumentId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge}; @@ -317,25 +316,21 @@ impl Store { } for (attr, content) in document.iter() { - if self.searchable_fields.contains(&attr) { - use serde_json::Value; - let content: Cow = match serde_json::from_slice(content) { - Ok(string) => string, - Err(_) => match serde_json::from_slice(content)? { - Value::Null => continue, - Value::Bool(boolean) => Cow::Owned(boolean.to_string()), - Value::Number(number) => Cow::Owned(number.to_string()), - Value::String(string) => Cow::Owned(string), - Value::Array(_array) => continue, - Value::Object(_object) => continue, - } - }; + if !self.searchable_fields.contains(&attr) { + continue; + } - for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { - let word = token.to_lowercase(); - let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); - } + let value = serde_json::from_slice(content)?; + let content = match json_to_string(value) { + Some(content) => content, + None => continue, + }; + + let tokens = simple_tokenizer(&content).filter_map(only_token); + for (pos, token) in tokens.enumerate().take(MAX_POSITION) { + let word = token.to_lowercase(); + let position = (attr as usize * MAX_POSITION + pos) as u32; + words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); } }