Make sure we index all kinds of JSON types

2025-05-25 09:03:59 +02:00 · 2020-11-06 16:15:07 +01:00 · 2020-11-06 16:15:07 +01:00 · 4fb138c42e
commit 4fb138c42e
parent 640c7d748a
3 changed files with 135 additions and 20 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@ -58,3 +58,86 @@ pub fn obkv_to_json(
        })
        .collect()
 }
+
+/// Transform a JSON value into a string that can be indexed.
+pub fn json_to_string(value: Value) -> Option<String> {
+
+    fn inner(value: Value, output: &mut String) -> bool {
+        use std::fmt::Write;
+        match value {
+            Value::Null => false,
+            Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
+            Value::Number(number) => write!(output, "{}", number).is_ok(),
+            Value::String(string) => write!(output, "{}", string).is_ok(),
+            Value::Array(array) => {
+                let mut count = 0;
+                for value in array {
+                    if inner(value, output) {
+                        output.push_str(". ");
+                        count += 1;
+                    }
+                }
+                // check that at least one value was written
+                count != 0
+            },
+            Value::Object(object) => {
+                let mut buffer = String::new();
+                let mut count = 0;
+                for (key, value) in object {
+                    buffer.clear();
+                    let _ = write!(&mut buffer, "{}: ", key);
+                    if inner(value, &mut buffer) {
+                        buffer.push_str(". ");
+                        // We write the "key: value. " pair only when
+                        // we are sure that the value can be written.
+                        output.push_str(&buffer);
+                        count += 1;
+                    }
+                }
+                // check that at least one value was written
+                count != 0
+            },
+        }
+    }
+
+    let mut string = String::new();
+    if inner(value, &mut string) {
+        Some(string)
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn json_to_string_object() {
+        let value = json!({
+            "name": "John Doe",
+            "age": 43,
+            "not_there": null,
+        });
+
+        let string = json_to_string(value).unwrap();
+        assert_eq!(string, "name: John Doe. age: 43. ");
+    }
+
+    #[test]
+    fn json_to_string_array() {
+        let value = json!([
+            { "name": "John Doe" },
+            43,
+            "hello",
+            [ "I", "am", "fine" ],
+            null,
+        ]);
+
+        let string = json_to_string(value).unwrap();
+        // We don't care about having two points (.) one after the other, as
+        // the distance between hard separators is clamped to 8 anyway.
+        assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
+    }
+}
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@ -901,4 +901,41 @@ mod tests {
        assert_eq!(count, 1);
        drop(rtxn);
    }
+
+    #[test]
+    fn complex_json_documents() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents, each with an id and a nested JSON value.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &br#"[
+            { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
+            { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
+            { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
+        ]"#[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Open a read transaction to search the nested values indexed above.
+        let rtxn = index.read_txn().unwrap();
+
+        // Search for a sub object value
+        let result = index.search(&rtxn).query(r#""value2""#).execute().unwrap();
+        assert_eq!(result.documents_ids, vec![0]);
+
+        // Search for a sub array value
+        let result = index.search(&rtxn).query(r#""fine""#).execute().unwrap();
+        assert_eq!(result.documents_ids, vec![1]);
+
+        // Search for a sub array sub object key
+        let result = index.search(&rtxn).query(r#""wow""#).execute().unwrap();
+        assert_eq!(result.documents_ids, vec![2]);
+
+        drop(rtxn);
+    }
 }
--- a/src/update/index_documents/store.rs
+++ b/src/update/index_documents/store.rs
@ -1,4 +1,3 @@
-use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap, HashSet};
 use std::convert::{TryFrom, TryInto};
 use std::fs::File;
@ -17,7 +16,7 @@ use tempfile::tempfile;

 use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::tokenizer::{simple_tokenizer, only_token};
-use crate::{SmallVec32, Position, DocumentId};
+use crate::{json_to_string, SmallVec32, Position, DocumentId};

 use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
 use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge};
@ -317,25 +316,21 @@ impl Store {
                }

                for (attr, content) in document.iter() {
-                    if self.searchable_fields.contains(&attr) {
-                        use serde_json::Value;
-                        let content: Cow<str> = match serde_json::from_slice(content) {
-                            Ok(string) => string,
-                            Err(_) => match serde_json::from_slice(content)? {
-                                Value::Null => continue,
-                                Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
-                                Value::Number(number) => Cow::Owned(number.to_string()),
-                                Value::String(string) => Cow::Owned(string),
-                                Value::Array(_array) => continue,
-                                Value::Object(_object) => continue,
-                            }
-                        };
+                    if !self.searchable_fields.contains(&attr) {
+                        continue;
+                    }

-                        for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
-                            let word = token.to_lowercase();
-                            let position = (attr as usize * MAX_POSITION + pos) as u32;
-                            words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
-                        }
+                    let value = serde_json::from_slice(content)?;
+                    let content = match json_to_string(value) {
+                        Some(content) => content,
+                        None => continue,
+                    };
+
+                    let tokens = simple_tokenizer(&content).filter_map(only_token);
+                    for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
+                        let word = token.to_lowercase();
+                        let position = (attr as usize * MAX_POSITION + pos) as u32;
+                        words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
                    }
                }