Make sure we index all kinds of JSON types

This commit is contained in:
Clément Renault 2020-11-06 16:15:07 +01:00
parent 640c7d748a
commit 4fb138c42e
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 135 additions and 20 deletions

View File

@ -58,3 +58,86 @@ pub fn obkv_to_json(
}) })
.collect() .collect()
} }
/// Transform a JSON value into a string that can be indexed.
pub fn json_to_string(value: Value) -> Option<String> {
fn inner(value: Value, output: &mut String) -> bool {
use std::fmt::Write;
match value {
Value::Null => false,
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
Value::Number(number) => write!(output, "{}", number).is_ok(),
Value::String(string) => write!(output, "{}", string).is_ok(),
Value::Array(array) => {
let mut count = 0;
for value in array {
if inner(value, output) {
output.push_str(". ");
count += 1;
}
}
// check that at least one value was written
count != 0
},
Value::Object(object) => {
let mut buffer = String::new();
let mut count = 0;
for (key, value) in object {
buffer.clear();
let _ = write!(&mut buffer, "{}: ", key);
if inner(value, &mut buffer) {
buffer.push_str(". ");
// We write the "key: value. " pair only when
// we are sure that the value can be written.
output.push_str(&buffer);
count += 1;
}
}
// check that at least one value was written
count != 0
},
}
}
let mut string = String::new();
if inner(value, &mut string) {
Some(string)
} else {
None
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Object entries are rendered as `key: value. ` and `null` entries are skipped.
    #[test]
    fn json_to_string_object() {
        let document = json!({
            "name": "John Doe",
            "age": 43,
            "not_there": null,
        });

        let rendered = json_to_string(document);
        assert_eq!(rendered.as_deref(), Some("name: John Doe. age: 43. "));
    }

    /// Array elements of every JSON type are flattened in order; `null` is skipped.
    #[test]
    fn json_to_string_array() {
        let document = json!([
            { "name": "John Doe" },
            43,
            "hello",
            [ "I", "am", "fine" ],
            null,
        ]);

        let rendered = json_to_string(document);
        // Nested objects and arrays end with their own separator, so two
        // points (". . ") can follow one another. We don't care, as the
        // distance of hard separators is clamped to 8 anyway.
        assert_eq!(
            rendered.as_deref(),
            Some("name: John Doe. . 43. hello. I. am. fine. . "),
        );
    }
}

View File

@ -901,4 +901,41 @@ mod tests {
assert_eq!(count, 1); assert_eq!(count, 1);
drop(rtxn); drop(rtxn);
} }
#[test]
fn complex_json_documents() {
    let dir = tempfile::tempdir().unwrap();
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(env_options, &dir).unwrap();

    // Send three documents, each with an id and one nested field:
    // an object, an array of strings and an array of objects.
    let mut wtxn = index.write_txn().unwrap();
    let documents = &br#"[
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
]"#[..];
    let mut builder = IndexDocuments::new(&mut wtxn, &index);
    builder.update_format(UpdateFormat::Json);
    builder.execute(documents, |_, _| ()).unwrap();
    wtxn.commit().unwrap();

    // Each nested value must lead back to the document that contains it.
    let rtxn = index.read_txn().unwrap();
    let expectations = [
        // A value inside a sub object.
        (r#""value2""#, 0),
        // A value inside a sub array.
        (r#""fine""#, 1),
        // A key of an object nested inside a sub array.
        (r#""wow""#, 2),
    ];
    for &(query, expected_id) in &expectations {
        let found = index.search(&rtxn).query(query).execute().unwrap();
        assert_eq!(found.documents_ids, vec![expected_id]);
    }
    drop(rtxn);
}
} }

View File

@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap, HashSet}; use std::collections::{BTreeMap, HashMap, HashSet};
use std::convert::{TryFrom, TryInto}; use std::convert::{TryFrom, TryInto};
use std::fs::File; use std::fs::File;
@ -17,7 +16,7 @@ use tempfile::tempfile;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::tokenizer::{simple_tokenizer, only_token}; use crate::tokenizer::{simple_tokenizer, only_token};
use crate::{SmallVec32, Position, DocumentId}; use crate::{json_to_string, SmallVec32, Position, DocumentId};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge}; use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge};
@ -317,25 +316,21 @@ impl Store {
} }
for (attr, content) in document.iter() { for (attr, content) in document.iter() {
if self.searchable_fields.contains(&attr) { if !self.searchable_fields.contains(&attr) {
use serde_json::Value; continue;
let content: Cow<str> = match serde_json::from_slice(content) { }
Ok(string) => string,
Err(_) => match serde_json::from_slice(content)? {
Value::Null => continue,
Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
Value::Number(number) => Cow::Owned(number.to_string()),
Value::String(string) => Cow::Owned(string),
Value::Array(_array) => continue,
Value::Object(_object) => continue,
}
};
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { let value = serde_json::from_slice(content)?;
let word = token.to_lowercase(); let content = match json_to_string(value) {
let position = (attr as usize * MAX_POSITION + pos) as u32; Some(content) => content,
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); None => continue,
} };
let tokens = simple_tokenizer(&content).filter_map(only_token);
for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
} }
} }