mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
Merge pull request #28 from meilisearch/highlight-json-value
Make the engine able to highlight any json type
This commit is contained in:
commit
8ffdfa72e3
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -447,7 +447,6 @@ checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg",
|
"autocfg",
|
||||||
"hashbrown",
|
"hashbrown",
|
||||||
"serde",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -604,7 +603,6 @@ dependencies = [
|
|||||||
"grenad",
|
"grenad",
|
||||||
"heed",
|
"heed",
|
||||||
"human_format",
|
"human_format",
|
||||||
"indexmap",
|
|
||||||
"itertools",
|
"itertools",
|
||||||
"jemallocator",
|
"jemallocator",
|
||||||
"levenshtein_automata",
|
"levenshtein_automata",
|
||||||
|
@ -16,7 +16,6 @@ fxhash = "0.2.1"
|
|||||||
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
|
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
|
||||||
heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
||||||
human_format = "1.0.3"
|
human_format = "1.0.3"
|
||||||
indexmap = { version = "1.6.0", features = ["serde-1"] }
|
|
||||||
jemallocator = "0.3.2"
|
jemallocator = "0.3.2"
|
||||||
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||||
linked-hash-map = "0.5.3"
|
linked-hash-map = "0.5.3"
|
||||||
|
1
http-ui/Cargo.lock
generated
1
http-ui/Cargo.lock
generated
@ -731,7 +731,6 @@ dependencies = [
|
|||||||
"futures",
|
"futures",
|
||||||
"grenad",
|
"grenad",
|
||||||
"heed",
|
"heed",
|
||||||
"indexmap",
|
|
||||||
"log",
|
"log",
|
||||||
"memmap",
|
"memmap",
|
||||||
"milli",
|
"milli",
|
||||||
|
@ -9,7 +9,6 @@ edition = "2018"
|
|||||||
anyhow = "1.0.28"
|
anyhow = "1.0.28"
|
||||||
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
|
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
|
||||||
heed = "0.10.1"
|
heed = "0.10.1"
|
||||||
indexmap = "1.6.0"
|
|
||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
milli = { path = ".." }
|
milli = { path = ".." }
|
||||||
once_cell = "1.4.1"
|
once_cell = "1.4.1"
|
||||||
|
@ -14,10 +14,10 @@ use futures::stream;
|
|||||||
use futures::{FutureExt, StreamExt};
|
use futures::{FutureExt, StreamExt};
|
||||||
use grenad::CompressionType;
|
use grenad::CompressionType;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use indexmap::IndexMap;
|
|
||||||
use once_cell::sync::OnceCell;
|
use once_cell::sync::OnceCell;
|
||||||
use rayon::ThreadPool;
|
use rayon::ThreadPool;
|
||||||
use serde::{Serialize, Deserialize, Deserializer};
|
use serde::{Serialize, Deserialize, Deserializer};
|
||||||
|
use serde_json::{Map, Value};
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
use tokio::fs::File as TFile;
|
use tokio::fs::File as TFile;
|
||||||
use tokio::io::AsyncWriteExt;
|
use tokio::io::AsyncWriteExt;
|
||||||
@ -27,7 +27,7 @@ use warp::{Filter, http::Response};
|
|||||||
|
|
||||||
use milli::tokenizer::{simple_tokenizer, TokenType};
|
use milli::tokenizer::{simple_tokenizer, TokenType};
|
||||||
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
|
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
|
||||||
use milli::{Index, UpdateStore, SearchResult};
|
use milli::{obkv_to_json, Index, UpdateStore, SearchResult};
|
||||||
|
|
||||||
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
|
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
|
||||||
|
|
||||||
@ -117,20 +117,50 @@ pub struct IndexerOpt {
|
|||||||
pub indexing_jobs: Option<usize>,
|
pub indexing_jobs: Option<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn highlight_record(record: &mut IndexMap<String, String>, words: &HashSet<String>) {
|
fn highlight_record(
|
||||||
for (_key, value) in record.iter_mut() {
|
object: &mut Map<String, Value>,
|
||||||
let old_value = mem::take(value);
|
words_to_highlight: &HashSet<String>,
|
||||||
for (token_type, token) in simple_tokenizer(&old_value) {
|
attributes_to_highlight: &HashSet<String>,
|
||||||
|
) {
|
||||||
|
// TODO do we need to create a string for element that are not and needs to be highlight?
|
||||||
|
fn highlight_value(value: Value, words_to_highlight: &HashSet<String>) -> Value {
|
||||||
|
match value {
|
||||||
|
Value::Null => Value::Null,
|
||||||
|
Value::Bool(boolean) => Value::Bool(boolean),
|
||||||
|
Value::Number(number) => Value::Number(number),
|
||||||
|
Value::String(old_string) => {
|
||||||
|
let mut string = String::new();
|
||||||
|
for (token_type, token) in simple_tokenizer(&old_string) {
|
||||||
if token_type == TokenType::Word {
|
if token_type == TokenType::Word {
|
||||||
let lowercase_token = token.to_lowercase();
|
let lowercase_token = token.to_lowercase();
|
||||||
let to_highlight = words.contains(&lowercase_token);
|
let to_highlight = words_to_highlight.contains(&lowercase_token);
|
||||||
if to_highlight { value.push_str("<mark>") }
|
if to_highlight { string.push_str("<mark>") }
|
||||||
value.push_str(token);
|
string.push_str(token);
|
||||||
if to_highlight { value.push_str("</mark>") }
|
if to_highlight { string.push_str("</mark>") }
|
||||||
} else {
|
} else {
|
||||||
value.push_str(token);
|
string.push_str(token);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Value::String(string)
|
||||||
|
},
|
||||||
|
Value::Array(values) => {
|
||||||
|
Value::Array(values.into_iter()
|
||||||
|
.map(|v| highlight_value(v, words_to_highlight))
|
||||||
|
.collect())
|
||||||
|
},
|
||||||
|
Value::Object(object) => {
|
||||||
|
Value::Object(object.into_iter()
|
||||||
|
.map(|(k, v)| (k, highlight_value(v, words_to_highlight)))
|
||||||
|
.collect())
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (key, value) in object.iter_mut() {
|
||||||
|
if attributes_to_highlight.contains(key) {
|
||||||
|
let old_value = mem::take(value);
|
||||||
|
*value = highlight_value(old_value, words_to_highlight);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -517,23 +547,18 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
Some(fields) => Cow::Borrowed(fields),
|
Some(fields) => Cow::Borrowed(fields),
|
||||||
None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()),
|
None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()),
|
||||||
};
|
};
|
||||||
|
let attributes_to_highlight = match index.searchable_fields(&rtxn).unwrap() {
|
||||||
|
Some(fields) => fields.iter().flat_map(|id| fields_ids_map.name(*id)).map(ToOwned::to_owned).collect(),
|
||||||
|
None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(),
|
||||||
|
};
|
||||||
|
|
||||||
for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
|
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
|
||||||
let mut record = displayed_fields.iter()
|
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
|
||||||
.flat_map(|&id| record.get(id).map(|val| (id, val)))
|
|
||||||
.map(|(key_id, value)| {
|
|
||||||
let key = fields_ids_map.name(key_id).unwrap().to_owned();
|
|
||||||
// TODO we must deserialize a Json Value and highlight it.
|
|
||||||
let value = serde_json::from_slice(value).unwrap();
|
|
||||||
(key, value)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if !disable_highlighting {
|
if !disable_highlighting {
|
||||||
highlight_record(&mut record, &found_words);
|
highlight_record(&mut object, &found_words, &attributes_to_highlight);
|
||||||
}
|
}
|
||||||
|
|
||||||
documents.push(record);
|
documents.push(object);
|
||||||
}
|
}
|
||||||
|
|
||||||
Response::builder()
|
Response::builder()
|
||||||
|
103
src/lib.rs
103
src/lib.rs
@ -14,7 +14,9 @@ use std::borrow::Cow;
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::hash::BuildHasherDefault;
|
use std::hash::BuildHasherDefault;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
use fxhash::{FxHasher32, FxHasher64};
|
use fxhash::{FxHasher32, FxHasher64};
|
||||||
|
use serde_json::{Map, Value};
|
||||||
|
|
||||||
pub use self::criterion::{Criterion, default_criteria};
|
pub use self::criterion::{Criterion, default_criteria};
|
||||||
pub use self::fields_ids_map::FieldsIdsMap;
|
pub use self::fields_ids_map::FieldsIdsMap;
|
||||||
@ -38,3 +40,104 @@ pub type Attribute = u32;
|
|||||||
pub type Position = u32;
|
pub type Position = u32;
|
||||||
|
|
||||||
type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result<Vec<u8>>;
|
type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result<Vec<u8>>;
|
||||||
|
|
||||||
|
/// Transform a raw obkv store into a JSON Object.
|
||||||
|
pub fn obkv_to_json(
|
||||||
|
displayed_fields: &[u8],
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
obkv: obkv::KvReader,
|
||||||
|
) -> anyhow::Result<Map<String, Value>>
|
||||||
|
{
|
||||||
|
displayed_fields.iter()
|
||||||
|
.copied()
|
||||||
|
.flat_map(|id| obkv.get(id).map(|value| (id, value)))
|
||||||
|
.map(|(id, value)| {
|
||||||
|
let name = fields_ids_map.name(id).context("unknown obkv field id")?;
|
||||||
|
let value = serde_json::from_slice(value)?;
|
||||||
|
Ok((name.to_owned(), value))
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Transform a JSON value into a string that can be indexed.
|
||||||
|
pub fn json_to_string(value: Value) -> Option<String> {
|
||||||
|
|
||||||
|
fn inner(value: Value, output: &mut String) -> bool {
|
||||||
|
use std::fmt::Write;
|
||||||
|
match value {
|
||||||
|
Value::Null => false,
|
||||||
|
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||||
|
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||||
|
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||||
|
Value::Array(array) => {
|
||||||
|
let mut count = 0;
|
||||||
|
for value in array {
|
||||||
|
if inner(value, output) {
|
||||||
|
output.push_str(". ");
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// check that at least one value was written
|
||||||
|
count != 0
|
||||||
|
},
|
||||||
|
Value::Object(object) => {
|
||||||
|
let mut buffer = String::new();
|
||||||
|
let mut count = 0;
|
||||||
|
for (key, value) in object {
|
||||||
|
buffer.clear();
|
||||||
|
let _ = write!(&mut buffer, "{}: ", key);
|
||||||
|
if inner(value, &mut buffer) {
|
||||||
|
buffer.push_str(". ");
|
||||||
|
// We write the "key: value. " pair only when
|
||||||
|
// we are sure that the value can be written.
|
||||||
|
output.push_str(&buffer);
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// check that at least one value was written
|
||||||
|
count != 0
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut string = String::new();
|
||||||
|
if inner(value, &mut string) {
|
||||||
|
Some(string)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use serde_json::json;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn json_to_string_object() {
|
||||||
|
let value = json!({
|
||||||
|
"name": "John Doe",
|
||||||
|
"age": 43,
|
||||||
|
"not_there": null,
|
||||||
|
});
|
||||||
|
|
||||||
|
let string = json_to_string(value).unwrap();
|
||||||
|
assert_eq!(string, "name: John Doe. age: 43. ");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn json_to_string_array() {
|
||||||
|
let value = json!([
|
||||||
|
{ "name": "John Doe" },
|
||||||
|
43,
|
||||||
|
"hello",
|
||||||
|
[ "I", "am", "fine" ],
|
||||||
|
null,
|
||||||
|
]);
|
||||||
|
|
||||||
|
let string = json_to_string(value).unwrap();
|
||||||
|
// We don't care about having two point (.) after the other as
|
||||||
|
// the distance of hard separators is clamped to 8 anyway.
|
||||||
|
assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -901,4 +901,41 @@ mod tests {
|
|||||||
assert_eq!(count, 1);
|
assert_eq!(count, 1);
|
||||||
drop(rtxn);
|
drop(rtxn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn complex_json_documents() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
// First we send 3 documents with an id for only one of them.
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let content = &br#"[
|
||||||
|
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
|
||||||
|
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
|
||||||
|
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
|
||||||
|
]"#[..];
|
||||||
|
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||||
|
builder.update_format(UpdateFormat::Json);
|
||||||
|
builder.execute(content, |_, _| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// Check that there is 1 documents now.
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
// Search for a sub object value
|
||||||
|
let result = index.search(&rtxn).query(r#""value2""#).execute().unwrap();
|
||||||
|
assert_eq!(result.documents_ids, vec![0]);
|
||||||
|
|
||||||
|
// Search for a sub array value
|
||||||
|
let result = index.search(&rtxn).query(r#""fine""#).execute().unwrap();
|
||||||
|
assert_eq!(result.documents_ids, vec![1]);
|
||||||
|
|
||||||
|
// Search for a sub array sub object key
|
||||||
|
let result = index.search(&rtxn).query(r#""wow""#).execute().unwrap();
|
||||||
|
assert_eq!(result.documents_ids, vec![2]);
|
||||||
|
|
||||||
|
drop(rtxn);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||||
use std::convert::{TryFrom, TryInto};
|
use std::convert::{TryFrom, TryInto};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
@ -17,7 +16,7 @@ use tempfile::tempfile;
|
|||||||
|
|
||||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||||
use crate::tokenizer::{simple_tokenizer, only_token};
|
use crate::tokenizer::{simple_tokenizer, only_token};
|
||||||
use crate::{SmallVec32, Position, DocumentId};
|
use crate::{json_to_string, SmallVec32, Position, DocumentId};
|
||||||
|
|
||||||
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
||||||
use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge};
|
use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge};
|
||||||
@ -317,27 +316,23 @@ impl Store {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (attr, content) in document.iter() {
|
for (attr, content) in document.iter() {
|
||||||
if self.searchable_fields.contains(&attr) {
|
if !self.searchable_fields.contains(&attr) {
|
||||||
use serde_json::Value;
|
continue;
|
||||||
let content: Cow<str> = match serde_json::from_slice(content) {
|
|
||||||
Ok(string) => string,
|
|
||||||
Err(_) => match serde_json::from_slice(content)? {
|
|
||||||
Value::Null => continue,
|
|
||||||
Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
|
|
||||||
Value::Number(number) => Cow::Owned(number.to_string()),
|
|
||||||
Value::String(string) => Cow::Owned(string),
|
|
||||||
Value::Array(_array) => continue,
|
|
||||||
Value::Object(_object) => continue,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let value = serde_json::from_slice(content)?;
|
||||||
|
let content = match json_to_string(value) {
|
||||||
|
Some(content) => content,
|
||||||
|
None => continue,
|
||||||
};
|
};
|
||||||
|
|
||||||
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
|
let tokens = simple_tokenizer(&content).filter_map(only_token);
|
||||||
|
for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
|
||||||
let word = token.to_lowercase();
|
let word = token.to_lowercase();
|
||||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||||
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
|
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// We write the document in the documents store.
|
// We write the document in the documents store.
|
||||||
self.write_document(document_id, &words_positions, value)?;
|
self.write_document(document_id, &words_positions, value)?;
|
||||||
|
Loading…
Reference in New Issue
Block a user