mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-09 13:04:30 +01:00
Merge branch 'filter/field-exist'
This commit is contained in:
commit
07003704a8
1
filter-parser/fuzz/.gitignore
vendored
1
filter-parser/fuzz/.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
/corpus/
|
||||
/artifacts/
|
||||
/target/
|
@ -7,8 +7,9 @@
|
||||
|
||||
use nom::branch::alt;
|
||||
use nom::bytes::complete::tag;
|
||||
use nom::character::complete::multispace1;
|
||||
use nom::combinator::cut;
|
||||
use nom::sequence::tuple;
|
||||
use nom::sequence::{terminated, tuple};
|
||||
use Condition::*;
|
||||
|
||||
use crate::{parse_value, FilterCondition, IResult, Span, Token};
|
||||
@ -19,6 +20,8 @@ pub enum Condition<'a> {
|
||||
GreaterThanOrEqual(Token<'a>),
|
||||
Equal(Token<'a>),
|
||||
NotEqual(Token<'a>),
|
||||
Exists,
|
||||
NotExists,
|
||||
LowerThan(Token<'a>),
|
||||
LowerThanOrEqual(Token<'a>),
|
||||
Between { from: Token<'a>, to: Token<'a> },
|
||||
@ -33,14 +36,15 @@ impl<'a> Condition<'a> {
|
||||
GreaterThanOrEqual(n) => (LowerThan(n), None),
|
||||
Equal(s) => (NotEqual(s), None),
|
||||
NotEqual(s) => (Equal(s), None),
|
||||
Exists => (NotExists, None),
|
||||
NotExists => (Exists, None),
|
||||
LowerThan(n) => (GreaterThanOrEqual(n), None),
|
||||
LowerThanOrEqual(n) => (GreaterThan(n), None),
|
||||
Between { from, to } => (LowerThan(from), Some(GreaterThan(to))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// condition = value ("==" | ">" ...) value
|
||||
/// condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value
|
||||
pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
|
||||
let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("=")));
|
||||
let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?;
|
||||
@ -58,10 +62,24 @@ pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
|
||||
Ok((input, condition))
|
||||
}
|
||||
|
||||
/// to = value value TO value
|
||||
/// exist = value "EXISTS"
|
||||
pub fn parse_exists(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?;
|
||||
|
||||
Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists }))
|
||||
}
|
||||
/// exist = value "NOT" WS+ "EXISTS"
|
||||
pub fn parse_not_exists(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, key) = parse_value(input)?;
|
||||
|
||||
let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?;
|
||||
Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists }))
|
||||
}
|
||||
|
||||
/// to = value value "TO" WS+ value
|
||||
pub fn parse_to(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, (key, from, _, to)) =
|
||||
tuple((parse_value, parse_value, tag("TO"), cut(parse_value)))(input)?;
|
||||
let (input, (key, from, _, _, to)) =
|
||||
tuple((parse_value, parse_value, tag("TO"), multispace1, cut(parse_value)))(input)?;
|
||||
|
||||
Ok((input, FilterCondition::Condition { fid: key, op: Between { from, to } }))
|
||||
}
|
||||
|
@ -128,10 +128,10 @@ impl<'a> Display for Error<'a> {
|
||||
writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)?
|
||||
}
|
||||
ErrorKind::InvalidPrimary if input.trim().is_empty() => {
|
||||
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing.")?
|
||||
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")?
|
||||
}
|
||||
ErrorKind::InvalidPrimary => {
|
||||
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", escaped_input)?
|
||||
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)?
|
||||
}
|
||||
ErrorKind::ExpectedEof => {
|
||||
writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
|
||||
|
@ -1,24 +1,26 @@
|
||||
//! BNF grammar:
|
||||
//!
|
||||
//! ```text
|
||||
//! filter = expression ~ EOF
|
||||
//! filter = expression EOF
|
||||
//! expression = or
|
||||
//! or = and (~ "OR" ~ and)
|
||||
//! and = not (~ "AND" not)*
|
||||
//! not = ("NOT" ~ not) | primary
|
||||
//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to
|
||||
//! condition = value ("==" | ">" ...) value
|
||||
//! to = value value TO value
|
||||
//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS*
|
||||
//! or = and ("OR" WS+ and)*
|
||||
//! and = not ("AND" WS+ not)*
|
||||
//! not = ("NOT" WS+ not) | primary
|
||||
//! primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to
|
||||
//! condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value
|
||||
//! exists = value "EXISTS"
|
||||
//! not_exists = value "NOT" WS+ "EXISTS"
|
||||
//! to = value value "TO" WS+ value
|
||||
//! value = WS* ( word | singleQuoted | doubleQuoted) WS+
|
||||
//! singleQuoted = "'" .* all but quotes "'"
|
||||
//! doubleQuoted = "\"" .* all but double quotes "\""
|
||||
//! word = (alphanumeric | _ | - | .)+
|
||||
//! geoRadius = WS* ~ "_geoRadius(" ~ WS* ~ float ~ WS* ~ "," ~ WS* ~ float ~ WS* ~ "," float ~ WS* ~ ")"
|
||||
//! geoRadius = "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")"
|
||||
//! ```
|
||||
//!
|
||||
//! Other BNF grammar used to handle some specific errors:
|
||||
//! ```text
|
||||
//! geoPoint = WS* ~ "_geoPoint(" ~ (float ~ ",")* ~ ")"
|
||||
//! geoPoint = WS* "_geoPoint(" (float ",")* ")"
|
||||
//! ```
|
||||
//!
|
||||
//! Specific errors:
|
||||
@ -29,7 +31,7 @@
|
||||
//! field < 12 AND _geoPoint(1, 2)
|
||||
//! ```
|
||||
//!
|
||||
//! - If a user try to use a geoRadius as a value we must throw an error.
|
||||
//! - If a user try to use a geoRadius as a value we must throw an error.
|
||||
//! ```text
|
||||
//! field = _geoRadius(12, 13, 14)
|
||||
//! ```
|
||||
@ -43,11 +45,12 @@ use std::fmt::Debug;
|
||||
use std::str::FromStr;
|
||||
|
||||
pub use condition::{parse_condition, parse_to, Condition};
|
||||
use condition::{parse_exists, parse_not_exists};
|
||||
use error::{cut_with_err, NomErrorExt};
|
||||
pub use error::{Error, ErrorKind};
|
||||
use nom::branch::alt;
|
||||
use nom::bytes::complete::tag;
|
||||
use nom::character::complete::{char, multispace0};
|
||||
use nom::character::complete::{char, multispace0, multispace1};
|
||||
use nom::combinator::{cut, eof, map};
|
||||
use nom::multi::{many0, separated_list1};
|
||||
use nom::number::complete::recognize_float;
|
||||
@ -167,11 +170,11 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult<O>) -> impl FnMut(Span<'a>)
|
||||
delimited(multispace0, inner, multispace0)
|
||||
}
|
||||
|
||||
/// or = and (~ "OR" ~ and)
|
||||
/// or = and ("OR" WS+ and)*
|
||||
fn parse_or(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, lhs) = parse_and(input)?;
|
||||
// if we found a `OR` then we MUST find something next
|
||||
let (input, ors) = many0(preceded(ws(tag("OR")), cut(parse_and)))(input)?;
|
||||
let (input, ors) = many0(preceded(ws(tuple((tag("OR"), multispace1))), cut(parse_and)))(input)?;
|
||||
|
||||
let expr = ors
|
||||
.into_iter()
|
||||
@ -179,28 +182,32 @@ fn parse_or(input: Span) -> IResult<FilterCondition> {
|
||||
Ok((input, expr))
|
||||
}
|
||||
|
||||
/// and = not (~ "AND" not)*
|
||||
/// and = not ("AND" not)*
|
||||
fn parse_and(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, lhs) = parse_not(input)?;
|
||||
// if we found a `AND` then we MUST find something next
|
||||
let (input, ors) = many0(preceded(ws(tag("AND")), cut(parse_not)))(input)?;
|
||||
let (input, ors) =
|
||||
many0(preceded(ws(tuple((tag("AND"), multispace1))), cut(parse_not)))(input)?;
|
||||
let expr = ors
|
||||
.into_iter()
|
||||
.fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch)));
|
||||
Ok((input, expr))
|
||||
}
|
||||
|
||||
/// not = ("NOT" ~ not) | primary
|
||||
/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`.
|
||||
/// not = ("NOT" WS+ not) | primary
|
||||
/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`.
|
||||
/// If we parse a `NOT` we MUST parse something behind.
|
||||
fn parse_not(input: Span) -> IResult<FilterCondition> {
|
||||
alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input)
|
||||
alt((
|
||||
map(preceded(ws(tuple((tag("NOT"), multispace1))), cut(parse_not)), |e| e.negate()),
|
||||
parse_primary,
|
||||
))(input)
|
||||
}
|
||||
|
||||
/// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float)
|
||||
/// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float)
|
||||
/// If we parse `_geoRadius` we MUST parse the rest of the expression.
|
||||
fn parse_geo_radius(input: Span) -> IResult<FilterCondition> {
|
||||
// we want to forbid space BEFORE the _geoRadius but not after
|
||||
// we want to allow space BEFORE the _geoRadius but not after
|
||||
let parsed = preceded(
|
||||
tuple((multispace0, tag("_geoRadius"))),
|
||||
// if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure
|
||||
@ -221,7 +228,7 @@ fn parse_geo_radius(input: Span) -> IResult<FilterCondition> {
|
||||
Ok((input, res))
|
||||
}
|
||||
|
||||
/// geoPoint = WS* ~ "_geoPoint(float ~ "," ~ float ~ "," float)
|
||||
/// geoPoint = WS* "_geoPoint(float WS* "," WS* float WS* "," WS* float)
|
||||
fn parse_geo_point(input: Span) -> IResult<FilterCondition> {
|
||||
// we want to forbid space BEFORE the _geoPoint but not after
|
||||
tuple((
|
||||
@ -235,7 +242,7 @@ fn parse_geo_point(input: Span) -> IResult<FilterCondition> {
|
||||
Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))
|
||||
}
|
||||
|
||||
/// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to
|
||||
/// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to
|
||||
fn parse_primary(input: Span) -> IResult<FilterCondition> {
|
||||
alt((
|
||||
// if we find a first parenthesis, then we must parse an expression and find the closing parenthesis
|
||||
@ -248,6 +255,8 @@ fn parse_primary(input: Span) -> IResult<FilterCondition> {
|
||||
),
|
||||
parse_geo_radius,
|
||||
parse_condition,
|
||||
parse_exists,
|
||||
parse_not_exists,
|
||||
parse_to,
|
||||
// the next lines are only for error handling and are written at the end to have the less possible performance impact
|
||||
parse_geo_point,
|
||||
@ -261,7 +270,7 @@ pub fn parse_expression(input: Span) -> IResult<FilterCondition> {
|
||||
parse_or(input)
|
||||
}
|
||||
|
||||
/// filter = expression ~ EOF
|
||||
/// filter = expression EOF
|
||||
pub fn parse_filter(input: Span) -> IResult<FilterCondition> {
|
||||
terminated(parse_expression, eof)(input)
|
||||
}
|
||||
@ -420,6 +429,41 @@ pub mod tests {
|
||||
op: Condition::LowerThan(rtok("NOT subscribers >= ", "1000")),
|
||||
},
|
||||
),
|
||||
(
|
||||
"subscribers EXISTS",
|
||||
Fc::Condition {
|
||||
fid: rtok("", "subscribers"),
|
||||
op: Condition::Exists,
|
||||
},
|
||||
),
|
||||
(
|
||||
"NOT subscribers EXISTS",
|
||||
Fc::Condition {
|
||||
fid: rtok("NOT ", "subscribers"),
|
||||
op: Condition::NotExists,
|
||||
},
|
||||
),
|
||||
(
|
||||
"subscribers NOT EXISTS",
|
||||
Fc::Condition {
|
||||
fid: rtok("", "subscribers"),
|
||||
op: Condition::NotExists,
|
||||
},
|
||||
),
|
||||
(
|
||||
"NOT subscribers NOT EXISTS",
|
||||
Fc::Condition {
|
||||
fid: rtok("NOT ", "subscribers"),
|
||||
op: Condition::Exists,
|
||||
},
|
||||
),
|
||||
(
|
||||
"subscribers NOT EXISTS",
|
||||
Fc::Condition {
|
||||
fid: rtok("", "subscribers"),
|
||||
op: Condition::NotExists,
|
||||
},
|
||||
),
|
||||
(
|
||||
"subscribers 100 TO 1000",
|
||||
Fc::Condition {
|
||||
@ -577,10 +621,10 @@ pub mod tests {
|
||||
("channel = ", "Was expecting a value but instead got nothing."),
|
||||
("channel = 🐻", "Was expecting a value but instead got `🐻`."),
|
||||
("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."),
|
||||
("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `OR`."),
|
||||
("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `AND`."),
|
||||
("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `channel Ponce`."),
|
||||
("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing."),
|
||||
("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."),
|
||||
("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."),
|
||||
("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."),
|
||||
("channel = Ponce OR", "Found unexpected characters at the end of the filter: `OR`. You probably forgot an `OR` or an `AND` rule."),
|
||||
("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."),
|
||||
("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."),
|
||||
("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."),
|
||||
@ -590,6 +634,10 @@ pub mod tests {
|
||||
("channel = \"ponce", "Expression `\\\"ponce` is missing the following closing delimiter: `\"`."),
|
||||
("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."),
|
||||
("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule."),
|
||||
("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."),
|
||||
("subscribers 100 TO1000", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`."),
|
||||
("channel = ponce ORdog != 'bernese mountain'", "Found unexpected characters at the end of the filter: `ORdog != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."),
|
||||
("channel = ponce AND'dog' != 'bernese mountain'", "Found unexpected characters at the end of the filter: `AND\\'dog\\' != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."),
|
||||
];
|
||||
|
||||
for (input, expected) in test_case {
|
||||
|
@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult<Token> {
|
||||
))
|
||||
}
|
||||
|
||||
/// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS*
|
||||
/// value = WS* ( word | singleQuoted | doubleQuoted) WS+
|
||||
pub fn parse_value<'a>(input: Span<'a>) -> IResult<Token<'a>> {
|
||||
// to get better diagnostic message we are going to strip the left whitespaces from the input right now
|
||||
let (input, _) = take_while(char::is_whitespace)(input)?;
|
||||
|
@ -4,7 +4,11 @@ use serde_json::{Map, Value};
|
||||
|
||||
pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> {
|
||||
let mut obj = Map::new();
|
||||
insert_object(&mut obj, None, json);
|
||||
let mut all_keys = vec![];
|
||||
insert_object(&mut obj, None, json, &mut all_keys);
|
||||
for key in all_keys {
|
||||
obj.entry(key).or_insert(Value::Array(vec![]));
|
||||
}
|
||||
obj
|
||||
}
|
||||
|
||||
@ -12,26 +16,32 @@ fn insert_object(
|
||||
base_json: &mut Map<String, Value>,
|
||||
base_key: Option<&str>,
|
||||
object: &Map<String, Value>,
|
||||
all_keys: &mut Vec<String>,
|
||||
) {
|
||||
for (key, value) in object {
|
||||
let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}"));
|
||||
|
||||
all_keys.push(new_key.clone());
|
||||
if let Some(array) = value.as_array() {
|
||||
insert_array(base_json, &new_key, array);
|
||||
insert_array(base_json, &new_key, array, all_keys);
|
||||
} else if let Some(object) = value.as_object() {
|
||||
insert_object(base_json, Some(&new_key), object);
|
||||
insert_object(base_json, Some(&new_key), object, all_keys);
|
||||
} else {
|
||||
insert_value(base_json, &new_key, value.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_array(base_json: &mut Map<String, Value>, base_key: &str, array: &Vec<Value>) {
|
||||
fn insert_array(
|
||||
base_json: &mut Map<String, Value>,
|
||||
base_key: &str,
|
||||
array: &Vec<Value>,
|
||||
all_keys: &mut Vec<String>,
|
||||
) {
|
||||
for value in array {
|
||||
if let Some(object) = value.as_object() {
|
||||
insert_object(base_json, Some(base_key), object);
|
||||
insert_object(base_json, Some(base_key), object, all_keys);
|
||||
} else if let Some(sub_array) = value.as_array() {
|
||||
insert_array(base_json, base_key, sub_array);
|
||||
insert_array(base_json, base_key, sub_array, all_keys);
|
||||
} else {
|
||||
insert_value(base_json, base_key, value.clone());
|
||||
}
|
||||
@ -103,6 +113,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"a": [],
|
||||
"a.b": "c",
|
||||
"a.d": "e",
|
||||
"a.f": "g"
|
||||
@ -116,6 +127,10 @@ mod tests {
|
||||
fn flatten_array() {
|
||||
let mut base: Value = json!({
|
||||
"a": [
|
||||
1,
|
||||
"b",
|
||||
[],
|
||||
[{}],
|
||||
{ "b": "c" },
|
||||
{ "b": "d" },
|
||||
{ "b": "e" },
|
||||
@ -127,6 +142,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"a": [1, "b"],
|
||||
"a.b": ["c", "d", "e"],
|
||||
})
|
||||
.as_object()
|
||||
@ -154,6 +170,28 @@ mod tests {
|
||||
.as_object()
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
// here we must keep 42 in "a"
|
||||
let mut base: Value = json!({
|
||||
"a": [
|
||||
{ "b": "c" },
|
||||
{ "b": "d" },
|
||||
{ "b": "e" },
|
||||
null,
|
||||
]
|
||||
});
|
||||
let json = std::mem::take(base.as_object_mut().unwrap());
|
||||
let flat = flatten(&json);
|
||||
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"a": null,
|
||||
"a.b": ["c", "d", "e"],
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -170,6 +208,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"a": [],
|
||||
"a.b": ["c", "d"],
|
||||
})
|
||||
.as_object()
|
||||
|
@ -384,6 +384,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
field_id_word_count_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_exists_docids,
|
||||
exact_word_docids,
|
||||
exact_word_prefix_docids,
|
||||
field_id_docid_facet_f64s: _,
|
||||
@ -402,6 +403,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
let field_id_word_count_docids_name = "field_id_word_count_docids";
|
||||
let facet_id_f64_docids_name = "facet_id_f64_docids";
|
||||
let facet_id_string_docids_name = "facet_id_string_docids";
|
||||
let facet_id_exists_docids_name = "facet_id_exists_docids";
|
||||
let documents_name = "documents";
|
||||
|
||||
let mut heap = BinaryHeap::with_capacity(limit + 1);
|
||||
@ -544,6 +546,17 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
// List the docids where the facet exists
|
||||
let db = facet_id_exists_docids.remap_data_type::<ByteSlice>();
|
||||
for result in facet_values_iter(rtxn, db, facet_id)? {
|
||||
let (_fid, value) = result?;
|
||||
let key = facet_name.to_string();
|
||||
heap.push(Reverse((value.len(), key, facet_id_exists_docids_name)));
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for result in index.all_documents(rtxn)? {
|
||||
@ -984,6 +997,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
|
||||
facet_id_string_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
facet_id_exists_docids,
|
||||
exact_word_prefix_docids,
|
||||
exact_word_docids,
|
||||
..
|
||||
@ -1007,6 +1021,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
|
||||
FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(),
|
||||
FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(),
|
||||
FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(),
|
||||
FACET_ID_EXISTS_DOCIDS => facet_id_exists_docids.as_polymorph(),
|
||||
FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(),
|
||||
FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(),
|
||||
EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(),
|
||||
|
@ -6,6 +6,8 @@ mod facet_string_zero_bounds_value_codec;
|
||||
mod field_doc_id_facet_f64_codec;
|
||||
mod field_doc_id_facet_string_codec;
|
||||
|
||||
use heed::types::OwnedType;
|
||||
|
||||
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
|
||||
pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
|
||||
pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
|
||||
@ -15,6 +17,9 @@ pub use self::facet_string_level_zero_value_codec::{
|
||||
pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
|
||||
pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
|
||||
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
|
||||
use crate::BEU16;
|
||||
|
||||
pub type FieldIdCodec = OwnedType<BEU16>;
|
||||
|
||||
/// Tries to split a slice in half at the given middle point,
|
||||
/// `None` if the slice is too short.
|
||||
|
@ -15,13 +15,13 @@ use crate::error::{InternalError, UserError};
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
|
||||
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec,
|
||||
};
|
||||
use crate::{
|
||||
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
||||
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
|
||||
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
|
||||
Search, StrBEU32Codec, StrStrU8Codec, BEU32,
|
||||
Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32,
|
||||
};
|
||||
|
||||
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
||||
@ -75,6 +75,7 @@ pub mod db_name {
|
||||
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
||||
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
||||
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
|
||||
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
|
||||
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||
@ -116,6 +117,9 @@ pub struct Index {
|
||||
/// Maps the position of a word prefix with all the docids where this prefix appears.
|
||||
pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the facet field id and the docids for which this field exists
|
||||
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the facet field id, level and the number with the docids that corresponds to it.
|
||||
pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the string with the original string and docids that corresponds to it.
|
||||
@ -134,7 +138,7 @@ impl Index {
|
||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
||||
use db_name::*;
|
||||
|
||||
options.max_dbs(16);
|
||||
options.max_dbs(17);
|
||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||
|
||||
let env = options.open(path)?;
|
||||
@ -152,6 +156,8 @@ impl Index {
|
||||
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
||||
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
|
||||
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
|
||||
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
|
||||
|
||||
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||
let field_id_docid_facet_strings =
|
||||
env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
||||
@ -174,6 +180,7 @@ impl Index {
|
||||
field_id_word_count_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_exists_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
documents,
|
||||
@ -806,6 +813,18 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve all the documents which contain this field id
|
||||
pub fn exists_faceted_documents_ids(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<RoaringBitmap> {
|
||||
match self.facet_id_exists_docids.get(rtxn, &BEU16::new(field_id))? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/* distinct field */
|
||||
|
||||
pub(crate) fn put_distinct_field(
|
||||
|
@ -44,6 +44,7 @@ pub use self::search::{
|
||||
pub type Result<T> = std::result::Result<T, error::Error>;
|
||||
|
||||
pub type Attribute = u32;
|
||||
pub type BEU16 = heed::zerocopy::U16<heed::byteorder::BE>;
|
||||
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
||||
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
||||
pub type DocumentId = u32;
|
||||
|
@ -280,6 +280,18 @@ impl<'a> Filter<'a> {
|
||||
Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)),
|
||||
Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)),
|
||||
Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)),
|
||||
Condition::Exists => {
|
||||
let exist = index.exists_faceted_documents_ids(rtxn, field_id)?;
|
||||
return Ok(exist);
|
||||
}
|
||||
Condition::NotExists => {
|
||||
let all_ids = index.documents_ids(rtxn)?;
|
||||
|
||||
let exist = Self::evaluate_operator(rtxn, index, field_id, &Condition::Exists)?;
|
||||
|
||||
let notexist = all_ids - exist;
|
||||
return Ok(notexist);
|
||||
}
|
||||
Condition::Equal(val) => {
|
||||
let (_original_value, string_docids) = strings_db
|
||||
.get(rtxn, &(field_id, &val.value().to_lowercase()))?
|
||||
|
@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
word_prefix_position_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_exists_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
documents,
|
||||
@ -69,6 +70,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
field_id_word_count_docids.clear(self.wtxn)?;
|
||||
word_prefix_position_docids.clear(self.wtxn)?;
|
||||
facet_id_f64_docids.clear(self.wtxn)?;
|
||||
facet_id_exists_docids.clear(self.wtxn)?;
|
||||
facet_id_string_docids.clear(self.wtxn)?;
|
||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||
|
@ -170,6 +170,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
word_position_docids,
|
||||
word_prefix_position_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_exists_docids,
|
||||
facet_id_string_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
@ -424,11 +425,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
}
|
||||
|
||||
// We delete the documents ids that are under the facet field id values.
|
||||
remove_docids_from_facet_field_id_number_docids(
|
||||
remove_docids_from_facet_field_id_docids(
|
||||
self.wtxn,
|
||||
facet_id_f64_docids,
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
// We delete the documents ids that are under the facet field id values.
|
||||
remove_docids_from_facet_field_id_docids(
|
||||
self.wtxn,
|
||||
facet_id_exists_docids,
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
|
||||
remove_docids_from_facet_field_id_string_docids(
|
||||
self.wtxn,
|
||||
@ -618,7 +625,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_docids_from_facet_field_id_number_docids<'a, C>(
|
||||
fn remove_docids_from_facet_field_id_docids<'a, C>(
|
||||
wtxn: &'a mut heed::RwTxn,
|
||||
db: &heed::Database<C, CboRoaringBitmapCodec>,
|
||||
to_remove: &RoaringBitmap,
|
||||
|
@ -1,15 +1,19 @@
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::mem::size_of;
|
||||
|
||||
use heed::zerocopy::AsBytes;
|
||||
use heed::BytesEncode;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::{DocumentId, FieldId, Result};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32};
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
@ -20,7 +24,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
|
||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
@ -39,6 +43,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
@ -46,16 +52,26 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if faceted_fields.contains(&field_id) {
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
let (numbers, strings) = extract_facet_values(&value);
|
||||
|
||||
key_buffer.clear();
|
||||
|
||||
// prefix key with the field_id and the document_id
|
||||
// Set key to the field_id
|
||||
// Note: this encoding is consistent with FieldIdCodec
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
// Here, we know already that the document must be added to the “field id exists” database
|
||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||
let document = BEU32::from(document).get();
|
||||
|
||||
facet_exists_docids.entry(field_id).or_default().insert(document);
|
||||
|
||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
key_buffer.extend_from_slice(&docid_bytes);
|
||||
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
let (numbers, strings) = extract_facet_values(&value);
|
||||
|
||||
// insert facet numbers in sorter
|
||||
for number in numbers {
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
@ -77,9 +93,21 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
}
|
||||
}
|
||||
|
||||
let mut facet_exists_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, bitmap) in facet_exists_docids.into_iter() {
|
||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||
}
|
||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?,
|
||||
sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?,
|
||||
facet_exists_docids_reader,
|
||||
))
|
||||
}
|
||||
|
||||
|
@ -53,7 +53,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
})
|
||||
.collect::<Result<()>>()?;
|
||||
|
||||
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks
|
||||
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
@ -72,9 +72,28 @@ pub(crate) fn data_from_obkv_documents(
|
||||
|
||||
let (
|
||||
docid_word_positions_chunks,
|
||||
(docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
|
||||
(
|
||||
docid_fid_facet_numbers_chunks,
|
||||
(docid_fid_facet_strings_chunks, facet_exists_docids_chunks),
|
||||
),
|
||||
) = result?;
|
||||
|
||||
// merge facet_exists_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-exists-docids");
|
||||
match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer.clone(),
|
||||
@ -197,6 +216,7 @@ fn send_original_documents_data(
|
||||
/// - docid_word_positions
|
||||
/// - docid_fid_facet_numbers
|
||||
/// - docid_fid_facet_strings
|
||||
/// - docid_fid_facet_exists
|
||||
fn send_and_extract_flattened_documents_data(
|
||||
flattened_documents_chunk: Result<grenad::Reader<File>>,
|
||||
indexer: GrenadParameters,
|
||||
@ -209,7 +229,10 @@ fn send_and_extract_flattened_documents_data(
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||
(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<File>),
|
||||
),
|
||||
)> {
|
||||
let flattened_documents_chunk =
|
||||
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
@ -250,8 +273,11 @@ fn send_and_extract_flattened_documents_data(
|
||||
Ok(docid_word_positions_chunk)
|
||||
},
|
||||
|| {
|
||||
let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) =
|
||||
extract_fid_docid_facet_values(
|
||||
let (
|
||||
docid_fid_facet_numbers_chunk,
|
||||
docid_fid_facet_strings_chunk,
|
||||
fid_facet_exists_docids_chunk,
|
||||
) = extract_fid_docid_facet_values(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer.clone(),
|
||||
faceted_fields,
|
||||
@ -273,7 +299,10 @@ fn send_and_extract_flattened_documents_data(
|
||||
docid_fid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk))
|
||||
Ok((
|
||||
docid_fid_facet_numbers_chunk,
|
||||
(docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk),
|
||||
))
|
||||
},
|
||||
);
|
||||
|
||||
|
@ -613,6 +613,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::documents::DocumentsBatchBuilder;
|
||||
use crate::update::DeleteDocuments;
|
||||
use crate::BEU16;
|
||||
|
||||
#[test]
|
||||
fn simple_document_replacement() {
|
||||
@ -2040,6 +2041,109 @@ mod tests {
|
||||
assert_eq!(ids.len(), map.len());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn index_documents_check_exists_database() {
|
||||
let config = IndexerConfig::default();
|
||||
let indexing_config = IndexDocumentsConfig::default();
|
||||
|
||||
let faceted_fields = hashset!(S("colour"));
|
||||
let content = || {
|
||||
documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"colour": 0,
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"colour": []
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"colour": {}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"colour": null
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"colour": [1]
|
||||
},
|
||||
{
|
||||
"id": 5
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"colour": {
|
||||
"green": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"colour": {
|
||||
"green": {
|
||||
"blue": []
|
||||
}
|
||||
}
|
||||
}
|
||||
])
|
||||
};
|
||||
let make_index = || {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
Index::new(options, &path).unwrap()
|
||||
};
|
||||
|
||||
let set_filterable_fields = |index: &Index| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||
builder.set_filterable_fields(faceted_fields.clone());
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
};
|
||||
let add_documents = |index: &Index| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let builder =
|
||||
IndexDocuments::new(&mut wtxn, index, &config, indexing_config.clone(), |_| ())
|
||||
.unwrap();
|
||||
let (builder, user_error) = builder.add_documents(content()).unwrap();
|
||||
user_error.unwrap();
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
};
|
||||
|
||||
let check_ok = |index: &Index| {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));
|
||||
|
||||
let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
|
||||
let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
|
||||
|
||||
let bitmap_colour =
|
||||
index.facet_id_exists_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap();
|
||||
assert_eq!(bitmap_colour.into_iter().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 6, 7]);
|
||||
|
||||
let bitmap_colour_green = index
|
||||
.facet_id_exists_docids
|
||||
.get(&rtxn, &BEU16::new(colour_green_id))
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![6, 7]);
|
||||
};
|
||||
|
||||
let index = make_index();
|
||||
add_documents(&index);
|
||||
set_filterable_fields(&index);
|
||||
check_ok(&index);
|
||||
|
||||
let index = make_index();
|
||||
set_filterable_fields(&index);
|
||||
add_documents(&index);
|
||||
check_ok(&index);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn primary_key_must_not_contain_floats() {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
|
@ -35,6 +35,7 @@ pub(crate) enum TypedChunk {
|
||||
WordPairProximityDocids(grenad::Reader<File>),
|
||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||
FieldIdFacetExistsDocids(grenad::Reader<File>),
|
||||
GeoPoints(grenad::Reader<File>),
|
||||
}
|
||||
|
||||
@ -146,6 +147,17 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
|
||||
append_entries_into_database(
|
||||
facet_id_exists_docids,
|
||||
&index.facet_id_exists_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
word_pair_proximity_docids_iter,
|
||||
|
@ -1,17 +1,17 @@
|
||||
{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":""}
|
||||
{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":""}
|
||||
{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":""}
|
||||
{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":""}
|
||||
{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":""}
|
||||
{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":""}
|
||||
{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":""}
|
||||
{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""}
|
||||
{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null]}
|
||||
{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": []}
|
||||
{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null}
|
||||
{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4}
|
||||
{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E"}
|
||||
{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"]}
|
||||
{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]}
|
||||
{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8]}
|
||||
{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""}
|
||||
{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""}
|
||||
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""}
|
||||
{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""}
|
||||
{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""}
|
||||
{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""}
|
||||
{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":""}
|
||||
{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":""}
|
||||
{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}}
|
||||
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] }
|
||||
{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}}
|
||||
{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]}
|
||||
{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}} }
|
||||
{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]] }
|
||||
{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16}
|
||||
{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""}
|
||||
|
@ -80,3 +80,9 @@ test_filter!(
|
||||
lower_complex_filter_2,
|
||||
vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])]
|
||||
);
|
||||
test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]);
|
||||
test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]);
|
||||
test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]);
|
||||
test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]);
|
||||
|
||||
test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]);
|
||||
|
@ -9,8 +9,7 @@ use maplit::{hashmap, hashset};
|
||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||
use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object};
|
||||
use serde::Deserialize;
|
||||
use serde_json::Deserializer;
|
||||
use serde::{Deserialize, Deserializer};
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
mod distinct;
|
||||
@ -44,6 +43,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
S("tag"),
|
||||
S("asc_desc_rank"),
|
||||
S("_geo"),
|
||||
S("opt1"),
|
||||
S("opt1.opt2")
|
||||
});
|
||||
builder.set_sortable_fields(hashset! {
|
||||
S("tag"),
|
||||
@ -65,7 +66,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
let reader = Cursor::new(CONTENT.as_bytes());
|
||||
|
||||
for result in Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||
let object = result.unwrap();
|
||||
documents_builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
@ -194,12 +195,44 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> {
|
||||
id = (document.geo_rank < 100000).then(|| document.id.clone());
|
||||
} else if filter.starts_with("NOT _geoRadius") {
|
||||
id = (document.geo_rank > 1000000).then(|| document.id.clone());
|
||||
} else if matches!(filter, "opt1 EXISTS" | "NOT opt1 NOT EXISTS") {
|
||||
id = document.opt1.is_some().then(|| document.id.clone());
|
||||
} else if matches!(filter, "NOT opt1 EXISTS" | "opt1 NOT EXISTS") {
|
||||
id = document.opt1.is_none().then(|| document.id.clone());
|
||||
} else if matches!(filter, "opt1.opt2 EXISTS") {
|
||||
if document.opt1opt2.is_some() {
|
||||
id = Some(document.id.clone());
|
||||
} else if let Some(opt1) = &document.opt1 {
|
||||
id = contains_key_rec(opt1, "opt2").then(|| document.id.clone());
|
||||
}
|
||||
}
|
||||
id
|
||||
}
|
||||
|
||||
pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool {
|
||||
match v {
|
||||
serde_json::Value::Array(v) => {
|
||||
for v in v.iter() {
|
||||
if contains_key_rec(v, key) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
serde_json::Value::Object(v) => {
|
||||
for (k, v) in v.iter() {
|
||||
if k == key || contains_key_rec(v, key) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn expected_filtered_ids(filters: Vec<Either<Vec<&str>, &str>>) -> HashSet<String> {
|
||||
let dataset: HashSet<TestDocument> =
|
||||
let dataset: Vec<TestDocument> =
|
||||
serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
|
||||
|
||||
let mut filtered_ids: HashSet<_> = dataset.iter().map(|d| d.id.clone()).collect();
|
||||
@ -227,7 +260,7 @@ pub fn expected_filtered_ids(filters: Vec<Either<Vec<&str>, &str>>) -> HashSet<S
|
||||
filtered_ids
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, PartialEq, Eq, Hash)]
|
||||
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
|
||||
pub struct TestDocument {
|
||||
pub id: String,
|
||||
pub word_rank: u32,
|
||||
@ -241,4 +274,16 @@ pub struct TestDocument {
|
||||
pub title: String,
|
||||
pub description: String,
|
||||
pub tag: String,
|
||||
#[serde(default, deserialize_with = "some_option")]
|
||||
pub opt1: Option<serde_json::Value>,
|
||||
#[serde(default, deserialize_with = "some_option", rename = "opt1.opt2")]
|
||||
pub opt1opt2: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
fn some_option<'de, D>(deserializer: D) -> Result<Option<serde_json::Value>, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
let result = serde_json::Value::deserialize(deserializer)?;
|
||||
Ok(Some(result))
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user