mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-05 04:28:55 +01:00
Make sure we index all kinds of JSON types
This commit is contained in:
parent
640c7d748a
commit
4fb138c42e
83
src/lib.rs
83
src/lib.rs
@ -58,3 +58,86 @@ pub fn obkv_to_json(
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
pub fn json_to_string(value: Value) -> Option<String> {
|
||||
|
||||
fn inner(value: Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
Value::Array(array) => {
|
||||
let mut count = 0;
|
||||
for value in array {
|
||||
if inner(value, output) {
|
||||
output.push_str(". ");
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
},
|
||||
Value::Object(object) => {
|
||||
let mut buffer = String::new();
|
||||
let mut count = 0;
|
||||
for (key, value) in object {
|
||||
buffer.clear();
|
||||
let _ = write!(&mut buffer, "{}: ", key);
|
||||
if inner(value, &mut buffer) {
|
||||
buffer.push_str(". ");
|
||||
// We write the "key: value. " pair only when
|
||||
// we are sure that the value can be written.
|
||||
output.push_str(&buffer);
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let mut string = String::new();
|
||||
if inner(value, &mut string) {
|
||||
Some(string)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    /// Object entries are rendered as `key: value. ` and `null` entries are skipped.
    #[test]
    fn json_to_string_object() {
        let document = json!({
            "name": "John Doe",
            "age": 43,
            "not_there": null,
        });
        let expected = "name: John Doe. age: 43. ";

        let flattened = json_to_string(document).unwrap();
        assert_eq!(flattened, expected);
    }

    /// Array elements are each followed by `. `, nested containers included,
    /// and `null` elements are skipped.
    #[test]
    fn json_to_string_array() {
        let document = json!([
            { "name": "John Doe" },
            43,
            "hello",
            [ "I", "am", "fine" ],
            null,
        ]);
        // Two dots in a row (after a nested object or array) are not an issue:
        // the distance of hard separators is clamped to 8 anyway.
        let expected = "name: John Doe. . 43. hello. I. am. fine. . ";

        let flattened = json_to_string(document).unwrap();
        assert_eq!(flattened, expected);
    }
}
|
||||
|
@ -901,4 +901,41 @@ mod tests {
|
||||
assert_eq!(count, 1);
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
fn complex_json_documents() {
    let dir = tempfile::tempdir().unwrap();
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(env_options, &dir).unwrap();

    // Send three documents whose extra field is respectively an object,
    // an array and an array of objects.
    let mut wtxn = index.write_txn().unwrap();
    let payload = &br#"[
        { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
        { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
        { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
    ]"#[..];
    let mut builder = IndexDocuments::new(&mut wtxn, &index);
    builder.update_format(UpdateFormat::Json);
    builder.execute(payload, |_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let rtxn = index.read_txn().unwrap();

    // Each query must match exactly the one document holding the word.
    let cases = [
        // a value nested in a sub object
        (r#""value2""#, 0),
        // a value nested in a sub array
        (r#""fine""#, 1),
        // a key of an object nested in a sub array
        (r#""wow""#, 2),
    ];
    for &(query, expected_id) in cases.iter() {
        let found = index.search(&rtxn).query(query).execute().unwrap();
        assert_eq!(found.documents_ids, vec![expected_id]);
    }

    drop(rtxn);
}
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::File;
|
||||
@ -17,7 +16,7 @@ use tempfile::tempfile;
|
||||
|
||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::tokenizer::{simple_tokenizer, only_token};
|
||||
use crate::{SmallVec32, Position, DocumentId};
|
||||
use crate::{json_to_string, SmallVec32, Position, DocumentId};
|
||||
|
||||
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
||||
use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge};
|
||||
@ -317,25 +316,21 @@ impl Store {
|
||||
}
|
||||
|
||||
for (attr, content) in document.iter() {
|
||||
if self.searchable_fields.contains(&attr) {
|
||||
use serde_json::Value;
|
||||
let content: Cow<str> = match serde_json::from_slice(content) {
|
||||
Ok(string) => string,
|
||||
Err(_) => match serde_json::from_slice(content)? {
|
||||
Value::Null => continue,
|
||||
Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
|
||||
Value::Number(number) => Cow::Owned(number.to_string()),
|
||||
Value::String(string) => Cow::Owned(string),
|
||||
Value::Array(_array) => continue,
|
||||
Value::Object(_object) => continue,
|
||||
}
|
||||
};
|
||||
if !self.searchable_fields.contains(&attr) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
|
||||
let word = token.to_lowercase();
|
||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
|
||||
}
|
||||
let value = serde_json::from_slice(content)?;
|
||||
let content = match json_to_string(value) {
|
||||
Some(content) => content,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let tokens = simple_tokenizer(&content).filter_map(only_token);
|
||||
for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
|
||||
let word = token.to_lowercase();
|
||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user