3571: Introduce two filters to select documents with `null` and empty fields r=irevoire a=Kerollmops

# Pull Request

## Related issue
This PR implements the `X IS NULL`, `X IS NOT NULL`, `X IS EMPTY`, `X IS NOT EMPTY` filters that [this comment](https://github.com/meilisearch/product/discussions/539#discussioncomment-5115884) is describing in a very detailed manner.

## What does this PR do?

### `IS NULL` and `IS NOT NULL`

This PR will be exposed as a prototype for now. Below is the copy/pasted version of a spec that defines this filter.

- `IS NULL` matches fields that `EXISTS` AND `= IS NULL`
- `IS NOT NULL` matches fields that `NOT EXISTS` OR `!= IS NULL`

1. `{"name": "A", "price": null}`
2. `{"name": "A", "price": 10}`
3. `{"name": "A"}`

`price IS NULL` would match 1
`price IS NOT NULL` or `NOT price IS NULL` would match 2,3
`price EXISTS` would match 1, 2
`price NOT EXISTS` or `NOT price EXISTS` would match 3

common query : `(price EXISTS) AND (price IS NOT NULL)` would match 2

### `IS EMPTY` and `IS NOT EMPTY`

- `IS EMPTY` matches Array `[]`, Object `{}`, or String `""` fields that `EXISTS` and are empty
- `IS NOT EMPTY` matches fields that `NOT EXISTS` OR are not empty.

1. `{"name": "A", "tags": null}`
2. `{"name": "A", "tags": [null]}`
3. `{"name": "A", "tags": []}`
4. `{"name": "A", "tags": ["hello","world"]}`
5. `{"name": "A", "tags": [""]}`
6. `{"name": "A"}`
7. `{"name": "A", "tags": {}}`
8. `{"name": "A", "tags": {"t1":"v1"}}`
9. `{"name": "A", "tags": {"t1":""}}`
10. `{"name": "A", "tags": ""}`

`tags IS EMPTY` would match 3,7,10
`tags IS NOT EMPTY` or `NOT tags IS EMPTY` would match 1,2,4,5,6,8,9
`tags IS NULL` would match 1
`tags IS NOT NULL` or `NOT tags IS NULL` would match 2,3,4,5,6,7,8,9,10
`tags EXISTS` would match 1,2,3,4,5,7,8,9,10
`tags NOT EXISTS` or `NOT tags EXISTS` would match 6

common query : `(tags EXISTS) AND (tags IS NOT NULL) AND (tags IS NOT EMPTY)` would match 2,4,5,8,9

## What should the reviewer do?

- Check that I tested the filters
- Check that I deleted the ids of the documents when deleting documents


Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2023-04-27 13:14:00 +00:00 committed by GitHub
commit 414b3fae89
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 730 additions and 118 deletions

View File

@ -20,6 +20,8 @@ pub enum Condition<'a> {
GreaterThanOrEqual(Token<'a>), GreaterThanOrEqual(Token<'a>),
Equal(Token<'a>), Equal(Token<'a>),
NotEqual(Token<'a>), NotEqual(Token<'a>),
Null,
Empty,
Exists, Exists,
LowerThan(Token<'a>), LowerThan(Token<'a>),
LowerThanOrEqual(Token<'a>), LowerThanOrEqual(Token<'a>),
@ -44,6 +46,38 @@ pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
Ok((input, condition)) Ok((input, condition))
} }
/// null = value "IS" WS+ "NULL"
pub fn parse_is_null(input: Span) -> IResult<FilterCondition> {
let (input, key) = parse_value(input)?;
let (input, _) = tuple((tag("IS"), multispace1, tag("NULL")))(input)?;
Ok((input, FilterCondition::Condition { fid: key, op: Null }))
}
/// null = value "IS" WS+ "NOT" WS+ "NULL"
pub fn parse_is_not_null(input: Span) -> IResult<FilterCondition> {
let (input, key) = parse_value(input)?;
let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("NULL")))(input)?;
Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Null }))))
}
/// empty = value "IS" WS+ "EMPTY"
pub fn parse_is_empty(input: Span) -> IResult<FilterCondition> {
let (input, key) = parse_value(input)?;
let (input, _) = tuple((tag("IS"), multispace1, tag("EMPTY")))(input)?;
Ok((input, FilterCondition::Condition { fid: key, op: Empty }))
}
/// empty = value "IS" WS+ "NOT" WS+ "EMPTY"
pub fn parse_is_not_empty(input: Span) -> IResult<FilterCondition> {
let (input, key) = parse_value(input)?;
let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("EMPTY")))(input)?;
Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Empty }))))
}
/// exist = value "EXISTS" /// exist = value "EXISTS"
pub fn parse_exists(input: Span) -> IResult<FilterCondition> { pub fn parse_exists(input: Span) -> IResult<FilterCondition> {
let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?;

View File

@ -143,11 +143,9 @@ impl<'a> Display for Error<'a> {
ErrorKind::MissingClosingDelimiter(c) => { ErrorKind::MissingClosingDelimiter(c) => {
writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)?
} }
ErrorKind::InvalidPrimary if input.trim().is_empty() => {
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.")?
}
ErrorKind::InvalidPrimary => { ErrorKind::InvalidPrimary => {
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `{}`.", escaped_input)? let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
} }
ErrorKind::ExpectedEof => { ErrorKind::ExpectedEof => {
writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?

View File

@ -47,7 +47,10 @@ mod value;
use std::fmt::Debug; use std::fmt::Debug;
pub use condition::{parse_condition, parse_to, Condition}; pub use condition::{parse_condition, parse_to, Condition};
use condition::{parse_exists, parse_not_exists}; use condition::{
parse_exists, parse_is_empty, parse_is_not_empty, parse_is_not_null, parse_is_null,
parse_not_exists,
};
use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; use error::{cut_with_err, ExpectedValueKind, NomErrorExt};
pub use error::{Error, ErrorKind}; pub use error::{Error, ErrorKind};
use nom::branch::alt; use nom::branch::alt;
@ -442,6 +445,10 @@ fn parse_primary(input: Span, depth: usize) -> IResult<FilterCondition> {
parse_in, parse_in,
parse_not_in, parse_not_in,
parse_condition, parse_condition,
parse_is_null,
parse_is_not_null,
parse_is_empty,
parse_is_not_empty,
parse_exists, parse_exists,
parse_not_exists, parse_not_exists,
parse_to, parse_to,
@ -526,14 +533,30 @@ pub mod tests {
insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}"); insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}");
insta::assert_display_snapshot!(p("subscribers 100 TO 1000"), @"{subscribers} {100} TO {1000}"); insta::assert_display_snapshot!(p("subscribers 100 TO 1000"), @"{subscribers} {100} TO {1000}");
// Test NOT + EXISTS // Test NOT
insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS");
insta::assert_display_snapshot!(p("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})"); insta::assert_display_snapshot!(p("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})");
insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})");
// Test NULL + NOT NULL
insta::assert_display_snapshot!(p("subscribers IS NULL"), @"{subscribers} IS NULL");
insta::assert_display_snapshot!(p("NOT subscribers IS NULL"), @"NOT ({subscribers} IS NULL)");
insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)");
insta::assert_display_snapshot!(p("NOT subscribers IS NOT NULL"), @"{subscribers} IS NULL");
insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)");
// Test EMPTY + NOT EMPTY
insta::assert_display_snapshot!(p("subscribers IS EMPTY"), @"{subscribers} IS EMPTY");
insta::assert_display_snapshot!(p("NOT subscribers IS EMPTY"), @"NOT ({subscribers} IS EMPTY)");
insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)");
insta::assert_display_snapshot!(p("NOT subscribers IS NOT EMPTY"), @"{subscribers} IS EMPTY");
insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)");
// Test EXISTS + NOT EXITS
insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS");
insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)");
insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)");
insta::assert_display_snapshot!(p("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS"); insta::assert_display_snapshot!(p("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS");
insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)");
insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})");
// Test nested NOT // Test nested NOT
insta::assert_display_snapshot!(p("NOT NOT NOT NOT x = 5"), @"{x} = {5}"); insta::assert_display_snapshot!(p("NOT NOT NOT NOT x = 5"), @"{x} = {5}");
@ -606,7 +629,7 @@ pub mod tests {
"###); "###);
insta::assert_display_snapshot!(p("'OR'"), @r###" insta::assert_display_snapshot!(p("'OR'"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`.
1:5 'OR' 1:5 'OR'
"###); "###);
@ -616,12 +639,12 @@ pub mod tests {
"###); "###);
insta::assert_display_snapshot!(p("channel Ponce"), @r###" insta::assert_display_snapshot!(p("channel Ponce"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`.
1:14 channel Ponce 1:14 channel Ponce
"###); "###);
insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###" insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.
19:19 channel = Ponce OR 19:19 channel = Ponce OR
"###); "###);
@ -706,12 +729,12 @@ pub mod tests {
"###); "###);
insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###" insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`.
1:17 colour NOT EXIST 1:17 colour NOT EXIST
"###); "###);
insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###" insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`.
1:23 subscribers 100 TO1000 1:23 subscribers 100 TO1000
"###); "###);
@ -772,6 +795,39 @@ pub mod tests {
Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes.
5:7 NOT OR EXISTS AND EXISTS NOT EXISTS 5:7 NOT OR EXISTS AND EXISTS NOT EXISTS
"###); "###);
insta::assert_display_snapshot!(p(r#"value NULL"#), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NULL`.
1:11 value NULL
"###);
insta::assert_display_snapshot!(p(r#"value NOT NULL"#), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NOT NULL`.
1:15 value NOT NULL
"###);
insta::assert_display_snapshot!(p(r#"value EMPTY"#), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value EMPTY`.
1:12 value EMPTY
"###);
insta::assert_display_snapshot!(p(r#"value NOT EMPTY"#), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NOT EMPTY`.
1:16 value NOT EMPTY
"###);
insta::assert_display_snapshot!(p(r#"value IS"#), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS`.
1:9 value IS
"###);
insta::assert_display_snapshot!(p(r#"value IS NOT"#), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT`.
1:13 value IS NOT
"###);
insta::assert_display_snapshot!(p(r#"value IS EXISTS"#), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS EXISTS`.
1:16 value IS EXISTS
"###);
insta::assert_display_snapshot!(p(r#"value IS NOT EXISTS"#), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT EXISTS`.
1:20 value IS NOT EXISTS
"###);
} }
#[test] #[test]
@ -853,6 +909,8 @@ impl<'a> std::fmt::Display for Condition<'a> {
Condition::GreaterThanOrEqual(token) => write!(f, ">= {token}"), Condition::GreaterThanOrEqual(token) => write!(f, ">= {token}"),
Condition::Equal(token) => write!(f, "= {token}"), Condition::Equal(token) => write!(f, "= {token}"),
Condition::NotEqual(token) => write!(f, "!= {token}"), Condition::NotEqual(token) => write!(f, "!= {token}"),
Condition::Null => write!(f, "IS NULL"),
Condition::Empty => write!(f, "IS EMPTY"),
Condition::Exists => write!(f, "EXISTS"), Condition::Exists => write!(f, "EXISTS"),
Condition::LowerThan(token) => write!(f, "< {token}"), Condition::LowerThan(token) => write!(f, "< {token}"),
Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"),

View File

@ -183,7 +183,20 @@ fn is_syntax_component(c: char) -> bool {
} }
fn is_keyword(s: &str) -> bool { fn is_keyword(s: &str) -> bool {
matches!(s, "AND" | "OR" | "IN" | "NOT" | "TO" | "EXISTS" | "_geoRadius" | "_geoBoundingBox") matches!(
s,
"AND"
| "OR"
| "IN"
| "NOT"
| "TO"
| "EXISTS"
| "IS"
| "NULL"
| "EMPTY"
| "_geoRadius"
| "_geoBoundingBox"
)
} }
#[cfg(test)] #[cfg(test)]

View File

@ -4,51 +4,56 @@ use serde_json::{Map, Value};
pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> { pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> {
let mut obj = Map::new(); let mut obj = Map::new();
let mut all_keys = vec![]; let mut all_entries = vec![];
insert_object(&mut obj, None, json, &mut all_keys); insert_object(&mut obj, None, json, &mut all_entries);
for key in all_keys { for (key, old_val) in all_entries {
obj.entry(key).or_insert(Value::Array(vec![])); obj.entry(key).or_insert(old_val.clone());
} }
obj obj
} }
fn insert_object( fn insert_object<'a>(
base_json: &mut Map<String, Value>, base_json: &mut Map<String, Value>,
base_key: Option<&str>, base_key: Option<&str>,
object: &Map<String, Value>, object: &'a Map<String, Value>,
all_keys: &mut Vec<String>, all_entries: &mut Vec<(String, &'a Value)>,
) { ) {
for (key, value) in object { for (key, value) in object {
let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}"));
all_keys.push(new_key.clone()); all_entries.push((new_key.clone(), value));
if let Some(array) = value.as_array() { if let Some(array) = value.as_array() {
insert_array(base_json, &new_key, array, all_keys); insert_array(base_json, &new_key, array, all_entries);
} else if let Some(object) = value.as_object() { } else if let Some(object) = value.as_object() {
insert_object(base_json, Some(&new_key), object, all_keys); insert_object(base_json, Some(&new_key), object, all_entries);
} else { } else {
insert_value(base_json, &new_key, value.clone()); insert_value(base_json, &new_key, value.clone(), false);
} }
} }
} }
fn insert_array( fn insert_array<'a>(
base_json: &mut Map<String, Value>, base_json: &mut Map<String, Value>,
base_key: &str, base_key: &str,
array: &Vec<Value>, array: &'a Vec<Value>,
all_keys: &mut Vec<String>, all_entries: &mut Vec<(String, &'a Value)>,
) { ) {
for value in array { for value in array {
if let Some(object) = value.as_object() { if let Some(object) = value.as_object() {
insert_object(base_json, Some(base_key), object, all_keys); insert_object(base_json, Some(base_key), object, all_entries);
} else if let Some(sub_array) = value.as_array() { } else if let Some(sub_array) = value.as_array() {
insert_array(base_json, base_key, sub_array, all_keys); insert_array(base_json, base_key, sub_array, all_entries);
} else { } else {
insert_value(base_json, base_key, value.clone()); insert_value(base_json, base_key, value.clone(), true);
} }
} }
} }
fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value) { fn insert_value(
base_json: &mut Map<String, Value>,
key: &str,
to_insert: Value,
came_from_array: bool,
) {
debug_assert!(!to_insert.is_object()); debug_assert!(!to_insert.is_object());
debug_assert!(!to_insert.is_array()); debug_assert!(!to_insert.is_array());
@ -63,6 +68,8 @@ fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value)
base_json[key] = Value::Array(vec![value, to_insert]); base_json[key] = Value::Array(vec![value, to_insert]);
} }
// if it does not exist we can push the value untouched // if it does not exist we can push the value untouched
} else if came_from_array {
base_json.insert(key.to_string(), Value::Array(vec![to_insert]));
} else { } else {
base_json.insert(key.to_string(), to_insert); base_json.insert(key.to_string(), to_insert);
} }
@ -113,7 +120,11 @@ mod tests {
assert_eq!( assert_eq!(
&flat, &flat,
json!({ json!({
"a": [], "a": {
"b": "c",
"d": "e",
"f": "g"
},
"a.b": "c", "a.b": "c",
"a.d": "e", "a.d": "e",
"a.f": "g" "a.f": "g"
@ -164,7 +175,7 @@ mod tests {
assert_eq!( assert_eq!(
&flat, &flat,
json!({ json!({
"a": 42, "a": [42],
"a.b": ["c", "d", "e"], "a.b": ["c", "d", "e"],
}) })
.as_object() .as_object()
@ -186,7 +197,7 @@ mod tests {
assert_eq!( assert_eq!(
&flat, &flat,
json!({ json!({
"a": null, "a": [null],
"a.b": ["c", "d", "e"], "a.b": ["c", "d", "e"],
}) })
.as_object() .as_object()
@ -208,7 +219,9 @@ mod tests {
assert_eq!( assert_eq!(
&flat, &flat,
json!({ json!({
"a": [], "a": {
"b": "c"
},
"a.b": ["c", "d"], "a.b": ["c", "d"],
}) })
.as_object() .as_object()
@ -234,7 +247,7 @@ mod tests {
json!({ json!({
"a.b": ["c", "d", "f"], "a.b": ["c", "d", "f"],
"a.c": "e", "a.c": "e",
"a": 35, "a": [35],
}) })
.as_object() .as_object()
.unwrap() .unwrap()
@ -302,4 +315,53 @@ mod tests {
.unwrap() .unwrap()
); );
} }
#[test]
fn flatten_nested_values_keep_original_values() {
let mut base: Value = json!({
"tags": {
"t1": "v1"
},
"prices": {
"p1": [null],
"p1000": {"tamo": {"le": {}}}
},
"kiki": [[]]
});
let json = std::mem::take(base.as_object_mut().unwrap());
let flat = flatten(&json);
println!("{}", serde_json::to_string_pretty(&flat).unwrap());
assert_eq!(
&flat,
json!({
"prices": {
"p1": [null],
"p1000": {
"tamo": {
"le": {}
}
}
},
"prices.p1": [null],
"prices.p1000": {
"tamo": {
"le": {}
}
},
"prices.p1000.tamo": {
"le": {}
},
"prices.p1000.tamo.le": {},
"tags": {
"t1": "v1"
},
"tags.t1": "v1",
"kiki": [[]]
})
.as_object()
.unwrap()
);
}
} }

View File

@ -547,7 +547,7 @@ async fn filter_invalid_syntax_object() {
index.wait_task(1).await; index.wait_task(1).await;
let expected_response = json!({ let expected_response = json!({
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass",
"code": "invalid_search_filter", "code": "invalid_search_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter" "link": "https://docs.meilisearch.com/errors#invalid_search_filter"
@ -572,7 +572,7 @@ async fn filter_invalid_syntax_array() {
index.wait_task(1).await; index.wait_task(1).await;
let expected_response = json!({ let expected_response = json!({
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass",
"code": "invalid_search_filter", "code": "invalid_search_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter" "link": "https://docs.meilisearch.com/errors#invalid_search_filter"

View File

@ -80,6 +80,8 @@ pub mod db_name {
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@ -129,6 +131,10 @@ pub struct Index {
/// Maps the facet field id and the docids for which this field exists /// Maps the facet field id and the docids for which this field exists
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field is set as null
pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field is considered empty
pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and ranges of numbers with the docids that corresponds to them. /// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>, pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
@ -153,7 +159,7 @@ impl Index {
) -> Result<Index> { ) -> Result<Index> {
use db_name::*; use db_name::*;
options.max_dbs(19); options.max_dbs(21);
unsafe { options.flag(Flags::MdbAlwaysFreePages) }; unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?; let env = options.open(path)?;
@ -175,6 +181,8 @@ impl Index {
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?;
let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?;
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings = let field_id_docid_facet_strings =
@ -201,6 +209,8 @@ impl Index {
facet_id_f64_docids, facet_id_f64_docids,
facet_id_string_docids, facet_id_string_docids,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id_docid_facet_strings, field_id_docid_facet_strings,
documents, documents,
@ -833,6 +843,30 @@ impl Index {
} }
} }
/// Retrieve all the documents which contain this field id set as null
pub fn null_faceted_documents_ids(
&self,
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap> {
match self.facet_id_is_null_docids.get(rtxn, &BEU16::new(field_id))? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()),
}
}
/// Retrieve all the documents which contain this field id and that is considered empty
pub fn empty_faceted_documents_ids(
&self,
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap> {
match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()),
}
}
/// Retrieve all the documents which contain this field id /// Retrieve all the documents which contain this field id
pub fn exists_faceted_documents_ids( pub fn exists_faceted_documents_ids(
&self, &self,

View File

@ -211,6 +211,14 @@ impl<'a> Filter<'a> {
Condition::Between { from, to } => { Condition::Between { from, to } => {
(Included(from.parse_finite_float()?), Included(to.parse_finite_float()?)) (Included(from.parse_finite_float()?), Included(to.parse_finite_float()?))
} }
Condition::Null => {
let is_null = index.null_faceted_documents_ids(rtxn, field_id)?;
return Ok(is_null);
}
Condition::Empty => {
let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?;
return Ok(is_empty);
}
Condition::Exists => { Condition::Exists => {
let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; let exist = index.exists_faceted_documents_ids(rtxn, field_id)?;
return Ok(exist); return Ok(exist);

View File

@ -271,6 +271,16 @@ pub fn snap_facet_id_exists_docids(index: &Index) -> String {
&format!("{facet_id:<3} {}", display_bitmap(&docids)) &format!("{facet_id:<3} {}", display_bitmap(&docids))
}) })
} }
pub fn snap_facet_id_is_null_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, facet_id_is_null_docids, |(facet_id, docids)| {
&format!("{facet_id:<3} {}", display_bitmap(&docids))
})
}
pub fn snap_facet_id_is_empty_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, facet_id_is_empty_docids, |(facet_id, docids)| {
&format!("{facet_id:<3} {}", display_bitmap(&docids))
})
}
pub fn snap_facet_id_string_docids(index: &Index) -> String { pub fn snap_facet_id_string_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, facet_id_string_docids, |( make_db_snap_from_iter!(index, facet_id_string_docids, |(
FacetGroupKey { field_id, level, left_bound }, FacetGroupKey { field_id, level, left_bound },
@ -495,6 +505,12 @@ macro_rules! full_snap_of_db {
($index:ident, facet_id_exists_docids) => {{ ($index:ident, facet_id_exists_docids) => {{
$crate::snapshot_tests::snap_facet_id_exists_docids(&$index) $crate::snapshot_tests::snap_facet_id_exists_docids(&$index)
}}; }};
($index:ident, facet_id_is_null_docids) => {{
$crate::snapshot_tests::snap_facet_id_is_null_docids(&$index)
}};
($index:ident, facet_id_is_empty_docids) => {{
$crate::snapshot_tests::snap_facet_id_is_empty_docids(&$index)
}};
($index:ident, documents_ids) => {{ ($index:ident, documents_ids) => {{
$crate::snapshot_tests::snap_documents_ids(&$index) $crate::snapshot_tests::snap_documents_ids(&$index)
}}; }};

View File

@ -34,6 +34,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_f64_docids, facet_id_f64_docids,
facet_id_string_docids, facet_id_string_docids,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id_docid_facet_strings, field_id_docid_facet_strings,
documents, documents,
@ -86,6 +88,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
script_language_docids.clear(self.wtxn)?; script_language_docids.clear(self.wtxn)?;
facet_id_f64_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?;
facet_id_exists_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?;
facet_id_is_null_docids.clear(self.wtxn)?;
facet_id_is_empty_docids.clear(self.wtxn)?;
facet_id_string_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?;

View File

@ -245,6 +245,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
field_id_docid_facet_strings: _, field_id_docid_facet_strings: _,
script_language_docids, script_language_docids,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
documents, documents,
} = self.index; } = self.index;
@ -517,12 +519,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter); drop(iter);
// We delete the documents ids that are under the facet field id values. // We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_id_exists_docids( remove_docids_from_facet_id_docids(
self.wtxn, self.wtxn,
facet_id_exists_docids, facet_id_exists_docids,
&self.to_delete_docids, &self.to_delete_docids,
)?; )?;
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_id_docids(
self.wtxn,
facet_id_is_null_docids,
&self.to_delete_docids,
)?;
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_id_docids(
self.wtxn,
facet_id_is_empty_docids,
&self.to_delete_docids,
)?;
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
Ok(DetailedDocumentDeletionResult { Ok(DetailedDocumentDeletionResult {
@ -625,7 +641,7 @@ fn remove_docids_from_field_id_docid_facet_value(
Ok(all_affected_facet_values) Ok(all_affected_facet_values)
} }
fn remove_docids_from_facet_id_exists_docids<'a, C>( fn remove_docids_from_facet_id_docids<'a, C>(
wtxn: &'a mut heed::RwTxn, wtxn: &'a mut heed::RwTxn,
db: &heed::Database<C, CboRoaringBitmapCodec>, db: &heed::Database<C, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap, to_remove: &RoaringBitmap,

View File

@ -181,7 +181,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
fn inner(value: &Value, output: &mut String) -> bool { fn inner(value: &Value, output: &mut String) -> bool {
use std::fmt::Write; use std::fmt::Write;
match value { match value {
Value::Null => false, Value::Null | Value::Object(_) => false,
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
Value::Number(number) => write!(output, "{}", number).is_ok(), Value::Number(number) => write!(output, "{}", number).is_ok(),
Value::String(string) => write!(output, "{}", string).is_ok(), Value::String(string) => write!(output, "{}", string).is_ok(),
@ -196,23 +196,6 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
// check that at least one value was written // check that at least one value was written
count != 0 count != 0
} }
Value::Object(object) => {
let mut buffer = String::new();
let mut count = 0;
for (key, value) in object {
buffer.clear();
let _ = write!(&mut buffer, "{}: ", key);
if inner(value, &mut buffer) {
buffer.push_str(". ");
// We write the "key: value. " pair only when
// we are sure that the value can be written.
output.push_str(&buffer);
count += 1;
}
}
// check that at least one value was written
count != 0
}
} }
} }

View File

@ -7,7 +7,7 @@ use std::mem::size_of;
use heed::zerocopy::AsBytes; use heed::zerocopy::AsBytes;
use heed::BytesEncode; use heed::BytesEncode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::{from_slice, Value};
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
use crate::error::InternalError; use crate::error::InternalError;
@ -15,6 +15,15 @@ use crate::facet::value_encoding::f64_into_bytes;
use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
/// The extracted facet values stored in grenad files by type.
pub struct ExtractedFacetValues {
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
}
/// Extracts the facet values of each faceted field of each document. /// Extracts the facet values of each faceted field of each document.
/// ///
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key /// Returns the generated grenad reader containing the docid the fid and the orginal value as key
@ -24,7 +33,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
faceted_fields: &HashSet<FieldId>, faceted_fields: &HashSet<FieldId>,
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> { ) -> Result<ExtractedFacetValues> {
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut fid_docid_facet_numbers_sorter = create_sorter( let mut fid_docid_facet_numbers_sorter = create_sorter(
@ -46,6 +55,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
); );
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?; let mut cursor = obkv_documents.into_cursor()?;
@ -69,33 +80,44 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
// For the other extraction tasks, prefix the key with the field_id and the document_id // For the other extraction tasks, prefix the key with the field_id and the document_id
key_buffer.extend_from_slice(docid_bytes); key_buffer.extend_from_slice(docid_bytes);
let value = let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
let (numbers, strings) = extract_facet_values(&value); match extract_facet_values(&value) {
FilterableValues::Null => {
// insert facet numbers in sorter facet_is_null_docids.entry(field_id).or_default().insert(document);
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
} }
} FilterableValues::Empty => {
facet_is_empty_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Values { numbers, strings } => {
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// insert normalized and original facet string in sorter fid_docid_facet_numbers_sorter
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { .insert(&key_buffer, ().as_bytes())?;
let normalised_truncated_value: String = normalized }
.char_indices() }
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); // insert normalized and original facet string in sorter
key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); for (normalized, original) in
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; strings.into_iter().filter(|(n, _)| !n.is_empty())
{
let normalized_truncated_value: String = normalized
.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
fid_docid_facet_strings_sorter
.insert(&key_buffer, original.as_bytes())?;
}
}
} }
} }
} }
@ -112,14 +134,48 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
} }
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
Ok(( let mut facet_is_null_docids_writer = create_writer(
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, indexer.chunk_compression_type,
sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, indexer.chunk_compression_level,
facet_exists_docids_reader, tempfile::tempfile()?,
)) );
for (fid, bitmap) in facet_is_null_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
}
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
let mut facet_is_empty_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
}
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
Ok(ExtractedFacetValues {
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
})
} }
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) { /// Represent what a document field contains.
enum FilterableValues {
/// Corresponds to the JSON `null` value.
Null,
/// Corresponds to either, an empty string `""`, an empty array `[]`, or an empty object `{}`.
Empty,
/// Represents all the numbers and strings values found in this document field.
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
fn extract_facet_values(value: &Value) -> FilterableValues {
fn inner_extract_facet_values( fn inner_extract_facet_values(
value: &Value, value: &Value,
can_recurse: bool, can_recurse: bool,
@ -149,9 +205,16 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
} }
} }
let mut facet_number_values = Vec::new(); match value {
let mut facet_string_values = Vec::new(); Value::Null => FilterableValues::Null,
inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); Value::String(s) if s.is_empty() => FilterableValues::Empty,
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
(facet_number_values, facet_string_values) Value::Object(o) if o.is_empty() => FilterableValues::Empty,
otherwise => {
let mut numbers = Vec::new();
let mut strings = Vec::new();
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
FilterableValues::Values { numbers, strings }
}
}
} }

View File

@ -18,7 +18,7 @@ use rayon::prelude::*;
use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_docid_word_positions::extract_docid_word_positions;
use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_number_docids::extract_facet_number_docids;
use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_facet_string_docids::extract_facet_string_docids;
use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points; use self::extract_geo_points::extract_geo_points;
use self::extract_word_docids::extract_word_docids; use self::extract_word_docids::extract_word_docids;
@ -55,28 +55,35 @@ pub(crate) fn data_from_obkv_documents(
.collect::<Result<()>>()?; .collect::<Result<()>>()?;
#[allow(clippy::type_complexity)] #[allow(clippy::type_complexity)]
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
.par_bridge() flattened_obkv_chunks
.map(|flattened_obkv_chunks| { .par_bridge()
send_and_extract_flattened_documents_data( .map(|flattened_obkv_chunks| {
flattened_obkv_chunks, send_and_extract_flattened_documents_data(
indexer, flattened_obkv_chunks,
lmdb_writer_sx.clone(), indexer,
&searchable_fields, lmdb_writer_sx.clone(),
&faceted_fields, &searchable_fields,
primary_key_id, &faceted_fields,
geo_fields_ids, primary_key_id,
&stop_words, geo_fields_ids,
max_positions_per_attributes, &stop_words,
) max_positions_per_attributes,
}) )
.collect(); })
.collect();
let ( let (
docid_word_positions_chunks, docid_word_positions_chunks,
( (
docid_fid_facet_numbers_chunks, docid_fid_facet_numbers_chunks,
(docid_fid_facet_strings_chunks, facet_exists_docids_chunks), (
docid_fid_facet_strings_chunks,
(
facet_is_null_docids_chunks,
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
),
),
), ),
) = result?; ) = result?;
@ -96,6 +103,38 @@ pub(crate) fn data_from_obkv_documents(
}); });
} }
// merge facet_is_null_docids and send them as a typed chunk
{
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!("merge {} database", "facet-id-is-null-docids");
match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
});
}
// merge facet_is_empty_docids and send them as a typed chunk
{
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!("merge {} database", "facet-id-is-empty-docids");
match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
});
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>( spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(), docid_word_positions_chunks.clone(),
indexer, indexer,
@ -235,7 +274,10 @@ fn send_and_extract_flattened_documents_data(
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
( (
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
(grenad::Reader<CursorClonableMmap>, grenad::Reader<File>), (
grenad::Reader<CursorClonableMmap>,
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
),
), ),
)> { )> {
let flattened_documents_chunk = let flattened_documents_chunk =
@ -281,11 +323,13 @@ fn send_and_extract_flattened_documents_data(
Ok(docid_word_positions_chunk) Ok(docid_word_positions_chunk)
}, },
|| { || {
let ( let ExtractedFacetValues {
docid_fid_facet_numbers_chunk, docid_fid_facet_numbers_chunk,
docid_fid_facet_strings_chunk, docid_fid_facet_strings_chunk,
fid_facet_is_null_docids_chunk,
fid_facet_is_empty_docids_chunk,
fid_facet_exists_docids_chunk, fid_facet_exists_docids_chunk,
) = extract_fid_docid_facet_values( } = extract_fid_docid_facet_values(
flattened_documents_chunk.clone(), flattened_documents_chunk.clone(),
indexer, indexer,
faceted_fields, faceted_fields,
@ -309,7 +353,13 @@ fn send_and_extract_flattened_documents_data(
Ok(( Ok((
docid_fid_facet_numbers_chunk, docid_fid_facet_numbers_chunk,
(docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk), (
docid_fid_facet_strings_chunk,
(
fid_facet_is_null_docids_chunk,
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
),
),
)) ))
}, },
); );

View File

@ -1757,6 +1757,187 @@ mod tests {
check_ok(&index); check_ok(&index);
} }
#[test]
fn index_documents_check_is_null_database() {
let content = || {
documents!([
{
"id": 0,
"colour": null,
},
{
"id": 1,
"colour": [null], // must not be returned
},
{
"id": 6,
"colour": {
"green": null
}
},
{
"id": 7,
"colour": {
"green": {
"blue": null
}
}
},
{
"id": 8,
"colour": 0,
},
{
"id": 9,
"colour": []
},
{
"id": 10,
"colour": {}
},
{
"id": 12,
"colour": [1]
},
{
"id": 13
},
{
"id": 14,
"colour": {
"green": 1
}
},
{
"id": 15,
"colour": {
"green": {
"blue": []
}
}
}
])
};
let check_ok = |index: &Index| {
let rtxn = index.read_txn().unwrap();
let facets = index.faceted_fields(&rtxn).unwrap();
assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));
let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
let colour_blue_id =
index.fields_ids_map(&rtxn).unwrap().id("colour.green.blue").unwrap();
let bitmap_null_colour =
index.facet_id_is_null_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap();
assert_eq!(bitmap_null_colour.into_iter().collect::<Vec<_>>(), vec![0]);
let bitmap_colour_green = index
.facet_id_is_null_docids
.get(&rtxn, &BEU16::new(colour_green_id))
.unwrap()
.unwrap();
assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![2]);
let bitmap_colour_blue = index
.facet_id_is_null_docids
.get(&rtxn, &BEU16::new(colour_blue_id))
.unwrap()
.unwrap();
assert_eq!(bitmap_colour_blue.into_iter().collect::<Vec<_>>(), vec![3]);
};
let faceted_fields = hashset!(S("colour"));
let index = TempIndex::new();
index.add_documents(content()).unwrap();
index
.update_settings(|settings| {
settings.set_filterable_fields(faceted_fields.clone());
})
.unwrap();
check_ok(&index);
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_filterable_fields(faceted_fields.clone());
})
.unwrap();
index.add_documents(content()).unwrap();
check_ok(&index);
}
#[test]
fn index_documents_check_is_empty_database() {
let content = || {
documents!([
{"id": 0, "tags": null },
{"id": 1, "tags": [null] },
{"id": 2, "tags": [] },
{"id": 3, "tags": ["hello","world"] },
{"id": 4, "tags": [""] },
{"id": 5 },
{"id": 6, "tags": {} },
{"id": 7, "tags": {"green": "cool"} },
{"id": 8, "tags": {"green": ""} },
{"id": 9, "tags": "" },
{"id": 10, "tags": { "green": null } },
{"id": 11, "tags": { "green": { "blue": null } } },
{"id": 12, "tags": { "green": { "blue": [] } } }
])
};
let check_ok = |index: &Index| {
let rtxn = index.read_txn().unwrap();
let facets = index.faceted_fields(&rtxn).unwrap();
assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue")));
let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap();
let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap();
let tags_blue_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green.blue").unwrap();
let bitmap_empty_tags =
index.facet_id_is_empty_docids.get(&rtxn, &BEU16::new(tags_id)).unwrap().unwrap();
assert_eq!(bitmap_empty_tags.into_iter().collect::<Vec<_>>(), vec![2, 6, 9]);
let bitmap_tags_green = index
.facet_id_is_empty_docids
.get(&rtxn, &BEU16::new(tags_green_id))
.unwrap()
.unwrap();
assert_eq!(bitmap_tags_green.into_iter().collect::<Vec<_>>(), vec![8]);
let bitmap_tags_blue = index
.facet_id_is_empty_docids
.get(&rtxn, &BEU16::new(tags_blue_id))
.unwrap()
.unwrap();
assert_eq!(bitmap_tags_blue.into_iter().collect::<Vec<_>>(), vec![12]);
};
let faceted_fields = hashset!(S("tags"));
let index = TempIndex::new();
index.add_documents(content()).unwrap();
index
.update_settings(|settings| {
settings.set_filterable_fields(faceted_fields.clone());
})
.unwrap();
check_ok(&index);
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_filterable_fields(faceted_fields.clone());
})
.unwrap();
index.add_documents(content()).unwrap();
check_ok(&index);
}
#[test] #[test]
fn primary_key_must_not_contain_floats() { fn primary_key_must_not_contain_floats() {
let index = TempIndex::new_with_map_size(4096 * 100); let index = TempIndex::new_with_map_size(4096 * 100);

View File

@ -39,6 +39,8 @@ pub(crate) enum TypedChunk {
FieldIdFacetStringDocids(grenad::Reader<File>), FieldIdFacetStringDocids(grenad::Reader<File>),
FieldIdFacetNumberDocids(grenad::Reader<File>), FieldIdFacetNumberDocids(grenad::Reader<File>),
FieldIdFacetExistsDocids(grenad::Reader<File>), FieldIdFacetExistsDocids(grenad::Reader<File>),
FieldIdFacetIsNullDocids(grenad::Reader<File>),
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
GeoPoints(grenad::Reader<File>), GeoPoints(grenad::Reader<File>),
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
} }
@ -161,6 +163,28 @@ pub(crate) fn write_typed_chunk_into_index(
)?; )?;
is_merged_database = true; is_merged_database = true;
} }
TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => {
append_entries_into_database(
facet_id_is_null_docids,
&index.facet_id_is_null_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
append_entries_into_database(
facet_id_is_empty_docids,
&index.facet_id_is_empty_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
is_merged_database = true;
}
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
append_entries_into_database( append_entries_into_database(
word_pair_proximity_docids_iter, word_pair_proximity_docids_iter,

View File

@ -82,10 +82,23 @@ test_filter!(
vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])]
); );
test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]); test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]);
test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]);
test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]);
test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]);
test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]);
test_filter!(null_filter_1, vec![Right("opt1 IS NULL")]);
test_filter!(null_filter_2, vec![Right("opt1.opt2 IS NULL")]);
test_filter!(null_filter_1_not, vec![Right("opt1 IS NOT NULL")]);
test_filter!(null_filter_1_not_alt, vec![Right("NOT opt1 IS NULL")]);
test_filter!(null_filter_1_double_not, vec![Right("NOT opt1 IS NOT NULL")]);
test_filter!(empty_filter_1, vec![Right("opt1 IS EMPTY")]);
test_filter!(empty_filter_2, vec![Right("opt1.opt2 IS EMPTY")]);
test_filter!(empty_filter_1_not, vec![Right("opt1 IS NOT EMPTY")]);
test_filter!(empty_filter_1_not_alt, vec![Right("NOT opt1 IS EMPTY")]);
test_filter!(empty_filter_1_double_not, vec![Right("NOT opt1 IS NOT EMPTY")]);
test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]); test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]);
test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]); test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]);
test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]); test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]);

View File

@ -205,6 +205,30 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> {
} else if let Some(opt1) = &document.opt1 { } else if let Some(opt1) = &document.opt1 {
id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); id = contains_key_rec(opt1, "opt2").then(|| document.id.clone());
} }
} else if matches!(filter, "opt1 IS NULL" | "NOT opt1 IS NOT NULL") {
id = document.opt1.as_ref().map_or(false, |v| v.is_null()).then(|| document.id.clone());
} else if matches!(filter, "NOT opt1 IS NULL" | "opt1 IS NOT NULL") {
id = document.opt1.as_ref().map_or(true, |v| !v.is_null()).then(|| document.id.clone());
} else if matches!(filter, "opt1.opt2 IS NULL") {
if document.opt1opt2.as_ref().map_or(false, |v| v.is_null()) {
id = Some(document.id.clone());
} else if let Some(opt1) = &document.opt1 {
if !opt1.is_null() {
id = contains_null_rec(opt1, "opt2").then(|| document.id.clone());
}
}
} else if matches!(filter, "opt1 IS EMPTY" | "NOT opt1 IS NOT EMPTY") {
id = document.opt1.as_ref().map_or(false, is_empty_value).then(|| document.id.clone());
} else if matches!(filter, "NOT opt1 IS EMPTY" | "opt1 IS NOT EMPTY") {
id = document
.opt1
.as_ref()
.map_or(true, |v| !is_empty_value(v))
.then(|| document.id.clone());
} else if matches!(filter, "opt1.opt2 IS EMPTY") {
if document.opt1opt2.as_ref().map_or(false, is_empty_value) {
id = Some(document.id.clone());
}
} else if matches!( } else if matches!(
filter, filter,
"tag_in IN[1, 2, 3, four, five]" | "NOT tag_in NOT IN[1, 2, 3, four, five]" "tag_in IN[1, 2, 3, four, five]" | "NOT tag_in NOT IN[1, 2, 3, four, five]"
@ -218,6 +242,15 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> {
id id
} }
pub fn is_empty_value(v: &serde_json::Value) -> bool {
match v {
serde_json::Value::String(s) => s.is_empty(),
serde_json::Value::Array(a) => a.is_empty(),
serde_json::Value::Object(o) => o.is_empty(),
_ => false,
}
}
pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool {
match v { match v {
serde_json::Value::Array(v) => { serde_json::Value::Array(v) => {
@ -240,6 +273,28 @@ pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool {
} }
} }
pub fn contains_null_rec(v: &serde_json::Value, key: &str) -> bool {
match v {
serde_json::Value::Object(v) => {
for (k, v) in v.iter() {
if k == key && v.is_null() || contains_null_rec(v, key) {
return true;
}
}
false
}
serde_json::Value::Array(v) => {
for v in v.iter() {
if contains_null_rec(v, key) {
return true;
}
}
false
}
_ => false,
}
}
pub fn expected_filtered_ids(filters: Vec<Either<Vec<&str>, &str>>) -> HashSet<String> { pub fn expected_filtered_ids(filters: Vec<Either<Vec<&str>, &str>>) -> HashSet<String> {
let dataset: Vec<TestDocument> = let dataset: Vec<TestDocument> =
serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();