From 9287858997b2a0392dd5637a093f190d381b0c02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 8 Mar 2023 16:14:00 +0100 Subject: [PATCH 01/21] Introduce a new facet_id_is_null_docids database in the index --- milli/src/index.rs | 8 +++++++- milli/src/update/clear_documents.rs | 2 ++ milli/src/update/delete_documents.rs | 8 ++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index a4048dfb0..ae7bd211e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -80,6 +80,7 @@ pub mod db_name { pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; + pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; @@ -130,6 +131,9 @@ pub struct Index { /// Maps the facet field id and the docids for which this field exists pub facet_id_exists_docids: Database, + /// Maps the facet field id and the docids for which this field is set as null + pub facet_id_is_null_docids: Database, + /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. pub facet_id_f64_docids: Database, FacetGroupValueCodec>, /// Maps the facet field id and ranges of strings with the docids that corresponds to them. @@ -153,7 +157,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(19); + options.max_dbs(20); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -175,6 +179,7 @@ impl Index { let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = @@ -201,6 +206,7 @@ impl Index { facet_id_f64_docids, facet_id_string_docids, facet_id_exists_docids, + facet_id_is_null_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 0296bc192..7ac09a785 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { facet_id_f64_docids, facet_id_string_docids, facet_id_exists_docids, + facet_id_is_null_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -86,6 +87,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { script_language_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?; + facet_id_is_null_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index eeb67b829..7180d7d42 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -245,6 +245,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { field_id_docid_facet_strings: _, script_language_docids, facet_id_exists_docids, + facet_id_is_null_docids, documents, } = self.index; @@ -523,6 +524,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &self.to_delete_docids, )?; + // We delete the documents ids that are under the facet field id values. + remove_docids_from_facet_id_exists_docids( + self.wtxn, + facet_id_is_null_docids, + &self.to_delete_docids, + )?; + self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; Ok(DetailedDocumentDeletionResult { From 19ab4d1a159d3be2bda3bde05dcef244bdfd9393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 8 Mar 2023 16:46:42 +0100 Subject: [PATCH 02/21] Classify the NULL fields values in the facet extractor --- .../extract/extract_fid_docid_facet_values.rs | 99 +++++++++++++------ .../src/update/index_documents/extract/mod.rs | 15 ++- 2 files changed, 78 insertions(+), 36 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 71ac330e2..5fe6a7606 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -8,7 +8,7 @@ use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer} use heed::zerocopy::AsBytes; use heed::BytesEncode; use roaring::RoaringBitmap; -use serde_json::Value; +use serde_json::{from_slice, Value}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; @@ -25,7 +25,8 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader, grenad::Reader)> +{ let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -47,6 +48,7 @@ pub fn extract_fid_docid_facet_values( ); let mut facet_exists_docids = BTreeMap::::new(); + let mut facet_is_null_docids = BTreeMap::::new(); let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -70,33 +72,40 @@ pub fn extract_fid_docid_facet_values( // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(docid_bytes); - let value = - serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - - let (numbers, strings) = extract_facet_values(&value); - - // insert facet numbers in sorter - for number in numbers { - key_buffer.truncate(size_of::() + size_of::()); - if let Some(value_bytes) = f64_into_bytes(number) { - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); - - fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + match extract_facet_values(&value) { + FilterableValues::Null => { + facet_is_null_docids.entry(field_id).or_default().insert(document); } - } + FilterableValues::Values { numbers, strings } => { + // insert facet numbers in sorter + for number in numbers { + key_buffer.truncate(size_of::() + size_of::()); + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); - // insert normalized and original facet string in sorter - for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { - let normalised_truncated_value: String = normalized - .char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); + fid_docid_facet_numbers_sorter + .insert(&key_buffer, ().as_bytes())?; + } + } - key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); - fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; + // insert normalized and original facet string in sorter + for (normalized, original) in + strings.into_iter().filter(|(n, _)| !n.is_empty()) + { + let normalised_truncated_value: String = normalized + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + + key_buffer.truncate(size_of::() + size_of::()); + key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); + fid_docid_facet_strings_sorter + .insert(&key_buffer, original.as_bytes())?; + } + } } } } @@ -113,14 +122,36 @@ pub fn extract_fid_docid_facet_values( } let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; + let mut facet_is_null_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_is_null_docids.into_iter() { + let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; + Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + facet_is_null_docids_reader, facet_exists_docids_reader, )) } -fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { +/// Represent what a document field contains. +enum FilterableValues { + Null, + /// Represents all the numbers and strings values found in this document field. + Values { + numbers: Vec, + strings: Vec<(String, String)>, + }, +} + +fn extract_facet_values(value: &Value) -> FilterableValues { fn inner_extract_facet_values( value: &Value, can_recurse: bool, @@ -152,9 +183,13 @@ fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { } } - let mut facet_number_values = Vec::new(); - let mut facet_string_values = Vec::new(); - inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); - - (facet_number_values, facet_string_values) + match value { + Value::Null => FilterableValues::Null, + otherwise => { + let mut numbers = Vec::new(); + let mut strings = Vec::new(); + inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings); + FilterableValues::Values { numbers, strings } + } + } } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c0f07cf79..9f9fc8f4f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -55,7 +55,7 @@ pub(crate) fn data_from_obkv_documents( .collect::>()?; #[allow(clippy::type_complexity)] - let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>))))> = flattened_obkv_chunks .par_bridge() .map(|flattened_obkv_chunks| { send_and_extract_flattened_documents_data( @@ -76,7 +76,10 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks, ( docid_fid_facet_numbers_chunks, - (docid_fid_facet_strings_chunks, facet_exists_docids_chunks), + ( + docid_fid_facet_strings_chunks, + (facet_is_null_docids_chunks, facet_exists_docids_chunks), + ), ), ) = result?; @@ -235,7 +238,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, (grenad::Reader, grenad::Reader)), ), )> { let flattened_documents_chunk = @@ -284,6 +287,7 @@ fn send_and_extract_flattened_documents_data( let ( docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, + fid_facet_is_null_docids_chunk, fid_facet_exists_docids_chunk, ) = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), @@ -309,7 +313,10 @@ fn send_and_extract_flattened_documents_data( Ok(( docid_fid_facet_numbers_chunk, - (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk), + ( + docid_fid_facet_strings_chunk, + (fid_facet_is_null_docids_chunk, fid_facet_exists_docids_chunk), + ), )) }, ); From 43ff236df8b7ad693c4cfe26d8a16dc09dc72359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 8 Mar 2023 16:49:53 +0100 Subject: [PATCH 03/21] Write the NULL facet values in the database --- milli/src/update/index_documents/extract/mod.rs | 16 ++++++++++++++++ milli/src/update/index_documents/typed_chunk.rs | 12 ++++++++++++ 2 files changed, 28 insertions(+) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 9f9fc8f4f..c11d08405 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -99,6 +99,22 @@ pub(crate) fn data_from_obkv_documents( }); } + // merge facet_is_null_docids and send them as a typed chunk + { + let lmdb_writer_sx = lmdb_writer_sx.clone(); + rayon::spawn(move || { + debug!("merge {} database", "facet-id-is-null-docids"); + match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + Ok(reader) => { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); + } + } + }); + } + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b9b11cfa8..79f2e2c55 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -39,6 +39,7 @@ pub(crate) enum TypedChunk { FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), FieldIdFacetExistsDocids(grenad::Reader), + FieldIdFacetIsNullDocids(grenad::Reader), GeoPoints(grenad::Reader), ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), } @@ -161,6 +162,17 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => { + append_entries_into_database( + facet_id_is_null_docids, + &index.facet_id_is_null_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( word_pair_proximity_docids_iter, From 7c0cd7172d02664af8f5f59e2520741185d5dca1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 8 Mar 2023 16:57:42 +0100 Subject: [PATCH 04/21] Introduce the NULL and NOT value NULL operator --- filter-parser/src/condition.rs | 16 ++++++++++++++++ filter-parser/src/lib.rs | 20 ++++++++++++++++---- milli/src/index.rs | 12 ++++++++++++ milli/src/search/facet/filter.rs | 4 ++++ 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 735ffec0e..834fac8b8 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -20,6 +20,7 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), + Null, Exists, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), @@ -44,6 +45,21 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } +/// null = value "NULL" +pub fn parse_null(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("NULL"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key, op: Null })) +} + +/// null = value "NOT" WS+ "NULL" +pub fn parse_not_null(input: Span) -> IResult { + let (input, key) = parse_value(input)?; + + let (input, _) = tuple((tag("NOT"), multispace1, tag("NULL")))(input)?; + Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Null })))) +} + /// exist = value "EXISTS" pub fn parse_exists(input: Span) -> IResult { let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 8e21ff6be..36657587f 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -47,7 +47,7 @@ mod value; use std::fmt::Debug; pub use condition::{parse_condition, parse_to, Condition}; -use condition::{parse_exists, parse_not_exists}; +use condition::{parse_exists, parse_not_exists, parse_not_null, parse_null}; use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; @@ -414,6 +414,8 @@ fn parse_primary(input: Span, depth: usize) -> IResult { parse_in, parse_not_in, parse_condition, + parse_null, + parse_not_null, parse_exists, parse_not_exists, parse_to, @@ -496,14 +498,23 @@ pub mod tests { insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}"); insta::assert_display_snapshot!(p("subscribers 100 TO 1000"), @"{subscribers} {100} TO {1000}"); - // Test NOT + EXISTS - insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); + // Test NOT insta::assert_display_snapshot!(p("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})"); + insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); + + // Test NULL + NOT NULL + insta::assert_display_snapshot!(p("subscribers NULL"), @"{subscribers} NULL"); + insta::assert_display_snapshot!(p("NOT subscribers NULL"), @"NOT ({subscribers} NULL)"); + insta::assert_display_snapshot!(p("subscribers NOT NULL"), @"NOT ({subscribers} NULL)"); + insta::assert_display_snapshot!(p("NOT subscribers NOT NULL"), @"{subscribers} NULL"); + insta::assert_display_snapshot!(p("subscribers NOT NULL"), @"NOT ({subscribers} NULL)"); + + // Test EXISTS + NOT EXITS + insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS"); insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); - insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); // Test nested NOT insta::assert_display_snapshot!(p("NOT NOT NOT NOT x = 5"), @"{x} = {5}"); @@ -800,6 +811,7 @@ impl<'a> std::fmt::Display for Condition<'a> { Condition::GreaterThanOrEqual(token) => write!(f, ">= {token}"), Condition::Equal(token) => write!(f, "= {token}"), Condition::NotEqual(token) => write!(f, "!= {token}"), + Condition::Null => write!(f, "NULL"), Condition::Exists => write!(f, "EXISTS"), Condition::LowerThan(token) => write!(f, "< {token}"), Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), diff --git a/milli/src/index.rs b/milli/src/index.rs index ae7bd211e..3316028df 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -839,6 +839,18 @@ impl Index { } } + /// Retrieve all the documents which contain this field id set as null + pub fn null_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result { + match self.facet_id_is_null_docids.get(rtxn, &BEU16::new(field_id))? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /// Retrieve all the documents which contain this field id pub fn exists_faceted_documents_ids( &self, diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index a4ac53950..df42725c5 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -219,6 +219,10 @@ impl<'a> Filter<'a> { Condition::Between { from, to } => { (Included(from.parse_finite_float()?), Included(to.parse_finite_float()?)) } + Condition::Null => { + let is_null = index.null_faceted_documents_ids(rtxn, field_id)?; + return Ok(is_null); + } Condition::Exists => { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); From 7dc04747fd94e8b43d15584e351ecb1489ab2fed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 8 Mar 2023 17:37:08 +0100 Subject: [PATCH 05/21] Make clippy happy --- .../index_documents/extract/extract_fid_docid_facet_values.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 5fe6a7606..0589dc773 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -21,6 +21,7 @@ use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET /// Returns the generated grenad reader containing the docid the fid and the orginal value as key /// and the normalized value as value extracted from the given chunk of documents. #[logging_timer::time] +#[allow(clippy::type_complexity)] pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, From b1d61f5a02f50cdb9f34d23f192f953e155d30ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Mar 2023 10:04:27 +0100 Subject: [PATCH 06/21] Add more tests for the NULL filter --- milli/tests/search/filters.rs | 7 +++++++ milli/tests/search/mod.rs | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 18de24ac3..0b5296b82 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -82,10 +82,17 @@ test_filter!( vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] ); test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]); +test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]); test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); +test_filter!(null_filter_1, vec![Right("opt1 NULL")]); +test_filter!(null_filter_2, vec![Right("opt1.opt2 NULL")]); +test_filter!(null_filter_1_not, vec![Right("opt1 NOT NULL")]); +test_filter!(null_filter_1_not_alt, vec![Right("NOT opt1 NULL")]); +test_filter!(null_filter_1_double_not, vec![Right("NOT opt1 NOT NULL")]); + test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]); test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]); test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 18c74e344..e67c1bc64 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -205,6 +205,18 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { } else if let Some(opt1) = &document.opt1 { id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); } + } else if matches!(filter, "opt1 NULL" | "NOT opt1 NOT NULL") { + id = document.opt1.as_ref().map_or(false, |v| v.is_null()).then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 NULL" | "opt1 NOT NULL") { + id = document.opt1.as_ref().map_or(true, |v| !v.is_null()).then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 NULL") { + if document.opt1opt2.as_ref().map_or(false, |v| v.is_null()) { + id = Some(document.id.clone()); + } else if let Some(opt1) = &document.opt1 { + if !opt1.is_null() { + id = contains_null_rec(opt1, "opt2").then(|| document.id.clone()); + } + } } else if matches!( filter, "tag_in IN[1, 2, 3, four, five]" | "NOT tag_in NOT IN[1, 2, 3, four, five]" @@ -240,6 +252,28 @@ pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { } } +pub fn contains_null_rec(v: &serde_json::Value, key: &str) -> bool { + match v { + serde_json::Value::Object(v) => { + for (k, v) in v.iter() { + if k == key && v.is_null() || contains_null_rec(v, key) { + return true; + } + } + false + } + serde_json::Value::Array(v) => { + for v in v.iter() { + if contains_null_rec(v, key) { + return true; + } + } + false + } + _ => false, + } +} + pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { let dataset: Vec = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); From e106b16148ba7231af3045be5e8384813373b509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Mar 2023 13:01:43 +0100 Subject: [PATCH 07/21] Fix a typo in a variable Co-authored-by: Louis Dureuil aaa --- .../index_documents/extract/extract_fid_docid_facet_values.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 0589dc773..be7b44eee 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -95,14 +95,14 @@ pub fn extract_fid_docid_facet_values( for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { - let normalised_truncated_value: String = normalized + let normalized_truncated_value: String = normalized .char_indices() .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) .map(|(_, c)| c) .collect(); key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); + key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); fid_docid_facet_strings_sorter .insert(&key_buffer, original.as_bytes())?; } From e064c52544e97dbad7bc43732fc5301db6ea1988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Mar 2023 13:05:54 +0100 Subject: [PATCH 08/21] Rename an internal facet deletion method --- milli/src/update/delete_documents.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 7180d7d42..bb232d7cc 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -518,14 +518,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_exists_docids( + remove_docids_from_facet_id_docids( self.wtxn, facet_id_exists_docids, &self.to_delete_docids, )?; // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_exists_docids( + remove_docids_from_facet_id_docids( self.wtxn, facet_id_is_null_docids, &self.to_delete_docids, @@ -633,7 +633,7 @@ fn remove_docids_from_field_id_docid_facet_value( Ok(all_affected_facet_values) } -fn remove_docids_from_facet_id_exists_docids<'a, C>( +fn remove_docids_from_facet_id_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, From 0ad53784e73b6c2da743d9cafee999002a191f1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Mar 2023 13:21:21 +0100 Subject: [PATCH 09/21] Create a new struct to reduce the type complexity --- .../extract/extract_fid_docid_facet_values.rs | 24 ++++++++++++------- .../src/update/index_documents/extract/mod.rs | 6 ++--- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index be7b44eee..6460af812 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -16,18 +16,24 @@ use crate::facet::value_encoding::f64_into_bytes; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; +/// The extracted facet values stored in grenad files by type. +pub struct ExtractedFacetValues { + pub docid_fid_facet_numbers_chunk: grenad::Reader, + pub docid_fid_facet_strings_chunk: grenad::Reader, + pub fid_facet_is_null_docids_chunk: grenad::Reader, + pub fid_facet_exists_docids_chunk: grenad::Reader, +} + /// Extracts the facet values of each faceted field of each document. /// /// Returns the generated grenad reader containing the docid the fid and the orginal value as key /// and the normalized value as value extracted from the given chunk of documents. #[logging_timer::time] -#[allow(clippy::type_complexity)] pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader, grenad::Reader)> -{ +) -> Result { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -134,12 +140,12 @@ pub fn extract_fid_docid_facet_values( } let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; - Ok(( - sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, - sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, - facet_is_null_docids_reader, - facet_exists_docids_reader, - )) + Ok(ExtractedFacetValues { + docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, + docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, + fid_facet_exists_docids_chunk: facet_exists_docids_reader, + }) } /// Represent what a document field contains. diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c11d08405..4a5c9b64c 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -18,7 +18,7 @@ use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; -use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; +use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues}; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; @@ -300,12 +300,12 @@ fn send_and_extract_flattened_documents_data( Ok(docid_word_positions_chunk) }, || { - let ( + let ExtractedFacetValues { docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, fid_facet_is_null_docids_chunk, fid_facet_exists_docids_chunk, - ) = extract_fid_docid_facet_values( + } = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer, faceted_fields, From ff86073288ce407e25710af8d0c9cad4ae80bacf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Mar 2023 13:32:27 +0100 Subject: [PATCH 10/21] Add a snapshot for the NULL facet database --- milli/src/snapshot_tests.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f7f1a97e6..c6ea8f3dd 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -271,6 +271,11 @@ pub fn snap_facet_id_exists_docids(index: &Index) -> String { &format!("{facet_id:<3} {}", display_bitmap(&docids)) }) } +pub fn snap_facet_id_is_null_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, facet_id_is_null_docids, |(facet_id, docids)| { + &format!("{facet_id:<3} {}", display_bitmap(&docids)) + }) +} pub fn snap_facet_id_string_docids(index: &Index) -> String { make_db_snap_from_iter!(index, facet_id_string_docids, |( FacetGroupKey { field_id, level, left_bound }, @@ -495,6 +500,9 @@ macro_rules! full_snap_of_db { ($index:ident, facet_id_exists_docids) => {{ $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) }}; + ($index:ident, facet_id_is_null_docids) => {{ + $crate::snapshot_tests::snap_facet_id_is_null_docids(&$index) + }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) }}; From df48ac8803b908c9f080b5481930983103f63a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Mar 2023 13:53:37 +0100 Subject: [PATCH 11/21] Add one more test for the NULL operator --- milli/src/update/index_documents/mod.rs | 108 ++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2a7930f84..7b9bd7834 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1758,6 +1758,114 @@ mod tests { check_ok(&index); } + #[test] + fn index_documents_check_is_null_database() { + let content = || { + documents!([ + { + "id": 0, + "colour": null, + }, + { + "id": 6, + "colour": { + "green": null + } + }, + { + "id": 7, + "colour": { + "green": { + "blue": null + } + } + }, + { + "id": 8, + "colour": 0, + }, + { + "id": 9, + "colour": [] + }, + { + "id": 10, + "colour": {} + }, + { + "id": 12, + "colour": [1] + }, + { + "id": 13 + }, + { + "id": 14, + "colour": { + "green": 1 + } + }, + { + "id": 15, + "colour": { + "green": { + "blue": [] + } + } + } + ]) + }; + + let check_ok = |index: &Index| { + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + let colour_blue_id = + index.fields_ids_map(&rtxn).unwrap().id("colour.green.blue").unwrap(); + + let bitmap_null_colour = + index.facet_id_is_null_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); + assert_eq!(bitmap_null_colour.into_iter().collect::>(), vec![0]); + + let bitmap_colour_green = index + .facet_id_is_null_docids + .get(&rtxn, &BEU16::new(colour_green_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![1]); + + let bitmap_colour_blue = index + .facet_id_is_null_docids + .get(&rtxn, &BEU16::new(colour_blue_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![2]); + }; + + let faceted_fields = hashset!(S("colour")); + + let index = TempIndex::new(); + index.add_documents(content()).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + check_ok(&index); + + let index = TempIndex::new(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + index.add_documents(content()).unwrap(); + check_ok(&index); + } + #[test] fn primary_key_must_not_contain_floats() { let index = TempIndex::new_with_map_size(4096 * 100); From c25779afba1e7fadd9365422db87f2e0b5ec3635 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 13 Mar 2023 17:40:34 +0100 Subject: [PATCH 12/21] Specify that the NULL keyword is a keyword too --- filter-parser/src/value.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 2296c0769..9a0a5e710 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -178,7 +178,10 @@ fn is_syntax_component(c: char) -> bool { } fn is_keyword(s: &str) -> bool { - matches!(s, "AND" | "OR" | "IN" | "NOT" | "TO" | "EXISTS" | "_geoRadius" | "_geoBoundingBox") + matches!( + s, + "AND" | "OR" | "IN" | "NOT" | "TO" | "EXISTS" | "NULL" | "_geoRadius" | "_geoBoundingBox" + ) } #[cfg(test)] From 030263caa36c73e4d9a178621483231718f0e2e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 14 Mar 2023 10:31:04 +0100 Subject: [PATCH 13/21] Change the IS NULL filter syntax to use the IS keyword --- filter-parser/src/condition.rs | 13 +++++++------ filter-parser/src/lib.rs | 8 ++++---- filter-parser/src/value.rs | 11 ++++++++++- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 834fac8b8..fe424539f 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -45,18 +45,19 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// null = value "NULL" -pub fn parse_null(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("NULL"))(input)?; +/// null = value "IS" WS+ "NULL" +pub fn parse_is_null(input: Span) -> IResult { + let (input, key) = parse_value(input)?; + let (input, _) = tuple((tag("IS"), multispace1, tag("NULL")))(input)?; Ok((input, FilterCondition::Condition { fid: key, op: Null })) } -/// null = value "NOT" WS+ "NULL" -pub fn parse_not_null(input: Span) -> IResult { +/// null = value "IS" WS+ "NOT" WS+ "NULL" +pub fn parse_is_not_null(input: Span) -> IResult { let (input, key) = parse_value(input)?; - let (input, _) = tuple((tag("NOT"), multispace1, tag("NULL")))(input)?; + let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("NULL")))(input)?; Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Null })))) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 36657587f..513da07c5 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -47,7 +47,7 @@ mod value; use std::fmt::Debug; pub use condition::{parse_condition, parse_to, Condition}; -use condition::{parse_exists, parse_not_exists, parse_not_null, parse_null}; +use condition::{parse_exists, parse_is_not_null, parse_is_null, parse_not_exists}; use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; @@ -414,8 +414,8 @@ fn parse_primary(input: Span, depth: usize) -> IResult { parse_in, parse_not_in, parse_condition, - parse_null, - parse_not_null, + parse_is_null, + parse_is_not_null, parse_exists, parse_not_exists, parse_to, @@ -811,7 +811,7 @@ impl<'a> std::fmt::Display for Condition<'a> { Condition::GreaterThanOrEqual(token) => write!(f, ">= {token}"), Condition::Equal(token) => write!(f, "= {token}"), Condition::NotEqual(token) => write!(f, "!= {token}"), - Condition::Null => write!(f, "NULL"), + Condition::Null => write!(f, "IS NULL"), Condition::Exists => write!(f, "EXISTS"), Condition::LowerThan(token) => write!(f, "< {token}"), Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 9a0a5e710..f8f1c43bc 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -180,7 +180,16 @@ fn is_syntax_component(c: char) -> bool { fn is_keyword(s: &str) -> bool { matches!( s, - "AND" | "OR" | "IN" | "NOT" | "TO" | "EXISTS" | "NULL" | "_geoRadius" | "_geoBoundingBox" + "AND" + | "OR" + | "IN" + | "NOT" + | "TO" + | "EXISTS" + | "IS" + | "NULL" + | "_geoRadius" + | "_geoBoundingBox" ) } From fa2ea4a3793ce1cd78d4d4f1b38f94f7273fb5c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 14 Mar 2023 10:31:27 +0100 Subject: [PATCH 14/21] Update the test to accept the new IS syntax --- filter-parser/src/error.rs | 4 ++-- filter-parser/src/lib.rs | 20 ++++++++++---------- meilisearch/tests/search/errors.rs | 4 ++-- milli/tests/search/filters.rs | 10 +++++----- milli/tests/search/mod.rs | 6 +++--- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 4d9d89859..fc6ad8f6d 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -144,10 +144,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 513da07c5..c75ada205 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -503,11 +503,11 @@ pub mod tests { insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); // Test NULL + NOT NULL - insta::assert_display_snapshot!(p("subscribers NULL"), @"{subscribers} NULL"); - insta::assert_display_snapshot!(p("NOT subscribers NULL"), @"NOT ({subscribers} NULL)"); - insta::assert_display_snapshot!(p("subscribers NOT NULL"), @"NOT ({subscribers} NULL)"); - insta::assert_display_snapshot!(p("NOT subscribers NOT NULL"), @"{subscribers} NULL"); - insta::assert_display_snapshot!(p("subscribers NOT NULL"), @"NOT ({subscribers} NULL)"); + insta::assert_display_snapshot!(p("subscribers IS NULL"), @"{subscribers} IS NULL"); + insta::assert_display_snapshot!(p("NOT subscribers IS NULL"), @"NOT ({subscribers} IS NULL)"); + insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)"); + insta::assert_display_snapshot!(p("NOT subscribers IS NOT NULL"), @"{subscribers} IS NULL"); + insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)"); // Test EXISTS + NOT EXITS insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); @@ -587,7 +587,7 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("'OR'"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`. 1:5 'OR' "###); @@ -597,12 +597,12 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("channel Ponce"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`. 1:14 channel Ponce "###); insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing. 19:19 channel = Ponce OR "###); @@ -667,12 +667,12 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. 1:17 colour NOT EXIST "###); insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. 1:23 subscribers 100 TO1000 "###); diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 9e4dbdcf5..ab42700f3 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -547,7 +547,7 @@ async fn filter_invalid_syntax_object() { index.wait_task(1).await; let expected_response = json!({ - "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", + "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -572,7 +572,7 @@ async fn filter_invalid_syntax_array() { index.wait_task(1).await; let expected_response = json!({ - "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", + "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 0b5296b82..57ad6a40b 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -87,11 +87,11 @@ test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); -test_filter!(null_filter_1, vec![Right("opt1 NULL")]); -test_filter!(null_filter_2, vec![Right("opt1.opt2 NULL")]); -test_filter!(null_filter_1_not, vec![Right("opt1 NOT NULL")]); -test_filter!(null_filter_1_not_alt, vec![Right("NOT opt1 NULL")]); -test_filter!(null_filter_1_double_not, vec![Right("NOT opt1 NOT NULL")]); +test_filter!(null_filter_1, vec![Right("opt1 IS NULL")]); +test_filter!(null_filter_2, vec![Right("opt1.opt2 IS NULL")]); +test_filter!(null_filter_1_not, vec![Right("opt1 IS NOT NULL")]); +test_filter!(null_filter_1_not_alt, vec![Right("NOT opt1 IS NULL")]); +test_filter!(null_filter_1_double_not, vec![Right("NOT opt1 IS NOT NULL")]); test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]); test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index e67c1bc64..51852cced 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -205,11 +205,11 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { } else if let Some(opt1) = &document.opt1 { id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); } - } else if matches!(filter, "opt1 NULL" | "NOT opt1 NOT NULL") { + } else if matches!(filter, "opt1 IS NULL" | "NOT opt1 IS NOT NULL") { id = document.opt1.as_ref().map_or(false, |v| v.is_null()).then(|| document.id.clone()); - } else if matches!(filter, "NOT opt1 NULL" | "opt1 NOT NULL") { + } else if matches!(filter, "NOT opt1 IS NULL" | "opt1 IS NOT NULL") { id = document.opt1.as_ref().map_or(true, |v| !v.is_null()).then(|| document.id.clone()); - } else if matches!(filter, "opt1.opt2 NULL") { + } else if matches!(filter, "opt1.opt2 IS NULL") { if document.opt1opt2.as_ref().map_or(false, |v| v.is_null()) { id = Some(document.id.clone()); } else if let Some(opt1) = &document.opt1 { From ea016d97afd2dfdae2fe15a12a7bfd6554d3a097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 14 Mar 2023 18:08:12 +0100 Subject: [PATCH 15/21] Implementing an IS EMPTY filter --- filter-parser/src/condition.rs | 17 +++++ filter-parser/src/error.rs | 6 +- filter-parser/src/lib.rs | 25 +++++-- filter-parser/src/value.rs | 1 + meilisearch/tests/search/errors.rs | 4 +- milli/src/index.rs | 20 +++++- milli/src/search/facet/filter.rs | 4 ++ milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 8 +++ .../extract/extract_fid_docid_facet_values.rs | 29 +++++++-- .../src/update/index_documents/extract/mod.rs | 65 +++++++++++++------ .../src/update/index_documents/typed_chunk.rs | 12 ++++ 12 files changed, 156 insertions(+), 37 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index fe424539f..9abe1c6ea 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -21,6 +21,7 @@ pub enum Condition<'a> { Equal(Token<'a>), NotEqual(Token<'a>), Null, + Empty, Exists, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), @@ -61,6 +62,22 @@ pub fn parse_is_not_null(input: Span) -> IResult { Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Null })))) } +/// empty = value "IS" WS+ "EMPTY" +pub fn parse_is_empty(input: Span) -> IResult { + let (input, key) = parse_value(input)?; + + let (input, _) = tuple((tag("IS"), multispace1, tag("EMPTY")))(input)?; + Ok((input, FilterCondition::Condition { fid: key, op: Empty })) +} + +/// empty = value "IS" WS+ "NOT" WS+ "EMPTY" +pub fn parse_is_not_empty(input: Span) -> IResult { + let (input, key) = parse_value(input)?; + + let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("EMPTY")))(input)?; + Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Empty })))) +} + /// exist = value "EXISTS" pub fn parse_exists(input: Span) -> IResult { let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index fc6ad8f6d..cbb83c841 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -143,11 +143,9 @@ impl<'a> Display for Error<'a> { ErrorKind::MissingClosingDelimiter(c) => { writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? } - ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.")? - } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `{}`.", escaped_input)? + let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) }; + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index c75ada205..69eb6700f 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -47,7 +47,10 @@ mod value; use std::fmt::Debug; pub use condition::{parse_condition, parse_to, Condition}; -use condition::{parse_exists, parse_is_not_null, parse_is_null, parse_not_exists}; +use condition::{ + parse_exists, parse_is_empty, parse_is_not_empty, parse_is_not_null, parse_is_null, + parse_not_exists, +}; use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; @@ -416,6 +419,8 @@ fn parse_primary(input: Span, depth: usize) -> IResult { parse_condition, parse_is_null, parse_is_not_null, + parse_is_empty, + parse_is_not_empty, parse_exists, parse_not_exists, parse_to, @@ -509,6 +514,13 @@ pub mod tests { insta::assert_display_snapshot!(p("NOT subscribers IS NOT NULL"), @"{subscribers} IS NULL"); insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)"); + // Test EMPTY + NOT EMPTY + insta::assert_display_snapshot!(p("subscribers IS EMPTY"), @"{subscribers} IS EMPTY"); + insta::assert_display_snapshot!(p("NOT subscribers IS EMPTY"), @"NOT ({subscribers} IS EMPTY)"); + insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)"); + insta::assert_display_snapshot!(p("NOT subscribers IS NOT EMPTY"), @"{subscribers} IS EMPTY"); + insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)"); + // Test EXISTS + NOT EXITS insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); @@ -587,7 +599,7 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("'OR'"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`. 1:5 'OR' "###); @@ -597,12 +609,12 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("channel Ponce"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`. 1:14 channel Ponce "###); insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing. 19:19 channel = Ponce OR "###); @@ -667,12 +679,12 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. 1:17 colour NOT EXIST "###); insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. 1:23 subscribers 100 TO1000 "###); @@ -812,6 +824,7 @@ impl<'a> std::fmt::Display for Condition<'a> { Condition::Equal(token) => write!(f, "= {token}"), Condition::NotEqual(token) => write!(f, "!= {token}"), Condition::Null => write!(f, "IS NULL"), + Condition::Empty => write!(f, "IS EMPTY"), Condition::Exists => write!(f, "EXISTS"), Condition::LowerThan(token) => write!(f, "< {token}"), Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index f8f1c43bc..518c0a25a 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -188,6 +188,7 @@ fn is_keyword(s: &str) -> bool { | "EXISTS" | "IS" | "NULL" + | "EMPTY" | "_geoRadius" | "_geoBoundingBox" ) diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index ab42700f3..2a0e4a39d 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -547,7 +547,7 @@ async fn filter_invalid_syntax_object() { index.wait_task(1).await; let expected_response = json!({ - "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", + "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -572,7 +572,7 @@ async fn filter_invalid_syntax_array() { index.wait_task(1).await; let expected_response = json!({ - "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", + "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" diff --git a/milli/src/index.rs b/milli/src/index.rs index 3316028df..c60857bd0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -81,6 +81,7 @@ pub mod db_name { pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids"; + pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; @@ -130,9 +131,10 @@ pub struct Index { /// Maps the facet field id and the docids for which this field exists pub facet_id_exists_docids: Database, - /// Maps the facet field id and the docids for which this field is set as null pub facet_id_is_null_docids: Database, + /// Maps the facet field id and the docids for which this field is considered empty + pub facet_id_is_empty_docids: Database, /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. pub facet_id_f64_docids: Database, FacetGroupValueCodec>, @@ -157,7 +159,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(20); + options.max_dbs(21); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -180,6 +182,7 @@ impl Index { let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?; + let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = @@ -207,6 +210,7 @@ impl Index { facet_id_string_docids, facet_id_exists_docids, facet_id_is_null_docids, + facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -851,6 +855,18 @@ impl Index { } } + /// Retrieve all the documents which contain this field id and that is considered empty + pub fn empty_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result { + match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /// Retrieve all the documents which contain this field id pub fn exists_faceted_documents_ids( &self, diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index df42725c5..0c11b737e 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -223,6 +223,10 @@ impl<'a> Filter<'a> { let is_null = index.null_faceted_documents_ids(rtxn, field_id)?; return Ok(is_null); } + Condition::Empty => { + let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?; + return Ok(is_empty); + } Condition::Exists => { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 7ac09a785..326e0825d 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -35,6 +35,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { facet_id_string_docids, facet_id_exists_docids, facet_id_is_null_docids, + facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -88,6 +89,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { facet_id_f64_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?; facet_id_is_null_docids.clear(self.wtxn)?; + facet_id_is_empty_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index bb232d7cc..6f2fa5e5a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -246,6 +246,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { script_language_docids, facet_id_exists_docids, facet_id_is_null_docids, + facet_id_is_empty_docids, documents, } = self.index; @@ -531,6 +532,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &self.to_delete_docids, )?; + // We delete the documents ids that are under the facet field id values. + remove_docids_from_facet_id_docids( + self.wtxn, + facet_id_is_empty_docids, + &self.to_delete_docids, + )?; + self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; Ok(DetailedDocumentDeletionResult { diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 6460af812..8f3d9408d 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -21,6 +21,7 @@ pub struct ExtractedFacetValues { pub docid_fid_facet_numbers_chunk: grenad::Reader, pub docid_fid_facet_strings_chunk: grenad::Reader, pub fid_facet_is_null_docids_chunk: grenad::Reader, + pub fid_facet_is_empty_docids_chunk: grenad::Reader, pub fid_facet_exists_docids_chunk: grenad::Reader, } @@ -56,6 +57,7 @@ pub fn extract_fid_docid_facet_values( let mut facet_exists_docids = BTreeMap::::new(); let mut facet_is_null_docids = BTreeMap::::new(); + let mut facet_is_empty_docids = BTreeMap::::new(); let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -80,10 +82,14 @@ pub fn extract_fid_docid_facet_values( key_buffer.extend_from_slice(docid_bytes); let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + match extract_facet_values(&value) { FilterableValues::Null => { facet_is_null_docids.entry(field_id).or_default().insert(document); } + FilterableValues::Empty => { + facet_is_empty_docids.entry(field_id).or_default().insert(document); + } FilterableValues::Values { numbers, strings } => { // insert facet numbers in sorter for number in numbers { @@ -140,22 +146,34 @@ pub fn extract_fid_docid_facet_values( } let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; + let mut facet_is_empty_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_is_empty_docids.into_iter() { + let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; + Ok(ExtractedFacetValues { docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, + fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader, fid_facet_exists_docids_chunk: facet_exists_docids_reader, }) } /// Represent what a document field contains. enum FilterableValues { + /// Corresponds to the JSON `null` value. Null, + /// Corresponds to either, an empty string `""`, an empty array `[]`, or an empty object `{}`. + Empty, /// Represents all the numbers and strings values found in this document field. - Values { - numbers: Vec, - strings: Vec<(String, String)>, - }, + Values { numbers: Vec, strings: Vec<(String, String)> }, } fn extract_facet_values(value: &Value) -> FilterableValues { @@ -192,6 +210,9 @@ fn extract_facet_values(value: &Value) -> FilterableValues { match value { Value::Null => FilterableValues::Null, + Value::String(s) if s.is_empty() => FilterableValues::Empty, + Value::Array(a) if a.is_empty() => FilterableValues::Empty, + Value::Object(o) if o.is_empty() => FilterableValues::Empty, otherwise => { let mut numbers = Vec::new(); let mut strings = Vec::new(); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4a5c9b64c..641a8a210 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -55,22 +55,23 @@ pub(crate) fn data_from_obkv_documents( .collect::>()?; #[allow(clippy::type_complexity)] - let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>))))> = flattened_obkv_chunks - .par_bridge() - .map(|flattened_obkv_chunks| { - send_and_extract_flattened_documents_data( - flattened_obkv_chunks, - indexer, - lmdb_writer_sx.clone(), - &searchable_fields, - &faceted_fields, - primary_key_id, - geo_fields_ids, - &stop_words, - max_positions_per_attributes, - ) - }) - .collect(); + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> = + flattened_obkv_chunks + .par_bridge() + .map(|flattened_obkv_chunks| { + send_and_extract_flattened_documents_data( + flattened_obkv_chunks, + indexer, + lmdb_writer_sx.clone(), + &searchable_fields, + &faceted_fields, + primary_key_id, + geo_fields_ids, + &stop_words, + max_positions_per_attributes, + ) + }) + .collect(); let ( docid_word_positions_chunks, @@ -78,7 +79,10 @@ pub(crate) fn data_from_obkv_documents( docid_fid_facet_numbers_chunks, ( docid_fid_facet_strings_chunks, - (facet_is_null_docids_chunks, facet_exists_docids_chunks), + ( + facet_is_null_docids_chunks, + (facet_is_empty_docids_chunks, facet_exists_docids_chunks), + ), ), ), ) = result?; @@ -115,6 +119,22 @@ pub(crate) fn data_from_obkv_documents( }); } + // merge facet_is_empty_docids and send them as a typed chunk + { + let lmdb_writer_sx = lmdb_writer_sx.clone(); + rayon::spawn(move || { + debug!("merge {} database", "facet-id-is-empty-docids"); + match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + Ok(reader) => { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); + } + } + }); + } + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer, @@ -254,7 +274,10 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, (grenad::Reader, grenad::Reader)), + ( + grenad::Reader, + (grenad::Reader, (grenad::Reader, grenad::Reader)), + ), ), )> { let flattened_documents_chunk = @@ -304,6 +327,7 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, fid_facet_is_null_docids_chunk, + fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk, } = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), @@ -331,7 +355,10 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_numbers_chunk, ( docid_fid_facet_strings_chunk, - (fid_facet_is_null_docids_chunk, fid_facet_exists_docids_chunk), + ( + fid_facet_is_null_docids_chunk, + (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), + ), ), )) }, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 79f2e2c55..e1fc01ca9 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -40,6 +40,7 @@ pub(crate) enum TypedChunk { FieldIdFacetNumberDocids(grenad::Reader), FieldIdFacetExistsDocids(grenad::Reader), FieldIdFacetIsNullDocids(grenad::Reader), + FieldIdFacetIsEmptyDocids(grenad::Reader), GeoPoints(grenad::Reader), ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), } @@ -173,6 +174,17 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => { + append_entries_into_database( + facet_id_is_empty_docids, + &index.facet_id_is_empty_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( word_pair_proximity_docids_iter, From d5881519cb5b364b42f2b2780cea48752c1ec79c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 Mar 2023 11:01:51 +0100 Subject: [PATCH 16/21] Make the json flattener return the original values --- flatten-serde-json/src/lib.rs | 64 ++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs index e1b2b20c7..b2e36d5b3 100644 --- a/flatten-serde-json/src/lib.rs +++ b/flatten-serde-json/src/lib.rs @@ -3,45 +3,45 @@ use serde_json::{Map, Value}; pub fn flatten(json: &Map) -> Map { - let mut obj = Map::new(); - let mut all_keys = vec![]; - insert_object(&mut obj, None, json, &mut all_keys); - for key in all_keys { - obj.entry(key).or_insert(Value::Array(vec![])); + let mut obj = json.clone(); + let mut all_entries = vec![]; + insert_object(&mut obj, None, json, &mut all_entries); + for (key, old_val) in all_entries { + obj.entry(key).or_insert(old_val.clone()); } obj } -fn insert_object( +fn insert_object<'a>( base_json: &mut Map, base_key: Option<&str>, - object: &Map, - all_keys: &mut Vec, + object: &'a Map, + all_entries: &mut Vec<(String, &'a Value)>, ) { for (key, value) in object { let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); - all_keys.push(new_key.clone()); + all_entries.push((new_key.clone(), value)); if let Some(array) = value.as_array() { - insert_array(base_json, &new_key, array, all_keys); + insert_array(base_json, &new_key, array, all_entries); } else if let Some(object) = value.as_object() { - insert_object(base_json, Some(&new_key), object, all_keys); + insert_object(base_json, Some(&new_key), object, all_entries); } else { insert_value(base_json, &new_key, value.clone()); } } } -fn insert_array( +fn insert_array<'a>( base_json: &mut Map, base_key: &str, - array: &Vec, - all_keys: &mut Vec, + array: &'a Vec, + all_entries: &mut Vec<(String, &'a Value)>, ) { for value in array { if let Some(object) = value.as_object() { - insert_object(base_json, Some(base_key), object, all_keys); + insert_object(base_json, Some(base_key), object, all_entries); } else if let Some(sub_array) = value.as_array() { - insert_array(base_json, base_key, sub_array, all_keys); + insert_array(base_json, base_key, sub_array, all_entries); } else { insert_value(base_json, base_key, value.clone()); } @@ -302,4 +302,36 @@ mod tests { .unwrap() ); } + + #[test] + fn flatten_nested_values_keep_original_values() { + let mut base: Value = json!({ + "tags": { + "t1": "v1" + }, + "prices": { + "p1": [null] + } + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + println!("{}", serde_json::to_string_pretty(&flat).unwrap()); + + assert_eq!( + &flat, + json!({ + "tags": { + "t1": "v1" + }, + "tags.t1": "v1", + "prices": { + "p1": [null] + }, + "prices.p1": [null] + }) + .as_object() + .unwrap() + ); + } } From 72123c458b91fcf349d9b186bedfa05fb165934a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 Mar 2023 11:20:34 +0100 Subject: [PATCH 17/21] Fix the tests to make flattening work --- flatten-serde-json/src/lib.rs | 68 +++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs index b2e36d5b3..c3346e15e 100644 --- a/flatten-serde-json/src/lib.rs +++ b/flatten-serde-json/src/lib.rs @@ -3,7 +3,7 @@ use serde_json::{Map, Value}; pub fn flatten(json: &Map) -> Map { - let mut obj = json.clone(); + let mut obj = Map::new(); let mut all_entries = vec![]; insert_object(&mut obj, None, json, &mut all_entries); for (key, old_val) in all_entries { @@ -26,7 +26,7 @@ fn insert_object<'a>( } else if let Some(object) = value.as_object() { insert_object(base_json, Some(&new_key), object, all_entries); } else { - insert_value(base_json, &new_key, value.clone()); + insert_value(base_json, &new_key, value.clone(), false); } } } @@ -43,12 +43,17 @@ fn insert_array<'a>( } else if let Some(sub_array) = value.as_array() { insert_array(base_json, base_key, sub_array, all_entries); } else { - insert_value(base_json, base_key, value.clone()); + insert_value(base_json, base_key, value.clone(), true); } } } -fn insert_value(base_json: &mut Map, key: &str, to_insert: Value) { +fn insert_value( + base_json: &mut Map, + key: &str, + to_insert: Value, + came_from_array: bool, +) { debug_assert!(!to_insert.is_object()); debug_assert!(!to_insert.is_array()); @@ -63,6 +68,8 @@ fn insert_value(base_json: &mut Map, key: &str, to_insert: Value) base_json[key] = Value::Array(vec![value, to_insert]); } // if it does not exist we can push the value untouched + } else if came_from_array { + base_json.insert(key.to_string(), Value::Array(vec![to_insert])); } else { base_json.insert(key.to_string(), to_insert); } @@ -113,7 +120,11 @@ mod tests { assert_eq!( &flat, json!({ - "a": [], + "a": { + "b": "c", + "d": "e", + "f": "g" + }, "a.b": "c", "a.d": "e", "a.f": "g" @@ -164,7 +175,7 @@ mod tests { assert_eq!( &flat, json!({ - "a": 42, + "a": [42], "a.b": ["c", "d", "e"], }) .as_object() @@ -186,7 +197,7 @@ mod tests { assert_eq!( &flat, json!({ - "a": null, + "a": [null], "a.b": ["c", "d", "e"], }) .as_object() @@ -208,7 +219,9 @@ mod tests { assert_eq!( &flat, json!({ - "a": [], + "a": { + "b": "c" + }, "a.b": ["c", "d"], }) .as_object() @@ -234,7 +247,7 @@ mod tests { json!({ "a.b": ["c", "d", "f"], "a.c": "e", - "a": 35, + "a": [35], }) .as_object() .unwrap() @@ -310,8 +323,10 @@ mod tests { "t1": "v1" }, "prices": { - "p1": [null] - } + "p1": [null], + "p1000": {"tamo": {"le": {}}} + }, + "kiki": [[]] }); let json = std::mem::take(base.as_object_mut().unwrap()); let flat = flatten(&json); @@ -321,14 +336,29 @@ mod tests { assert_eq!( &flat, json!({ - "tags": { - "t1": "v1" - }, - "tags.t1": "v1", - "prices": { - "p1": [null] - }, - "prices.p1": [null] + "prices": { + "p1": [null], + "p1000": { + "tamo": { + "le": {} + } + } + }, + "prices.p1": [null], + "prices.p1000": { + "tamo": { + "le": {} + } + }, + "prices.p1000.tamo": { + "le": {} + }, + "prices.p1000.tamo.le": {}, + "tags": { + "t1": "v1" + }, + "tags.t1": "v1", + "kiki": [[]] }) .as_object() .unwrap() From 64571c8288b358096fff5bd992d2ee168516e4dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 Mar 2023 14:57:17 +0100 Subject: [PATCH 18/21] Improve the testing of the filters --- milli/src/snapshot_tests.rs | 8 +++ milli/src/update/index_documents/mod.rs | 77 ++++++++++++++++++++++++- milli/tests/search/filters.rs | 6 ++ milli/tests/search/mod.rs | 51 ++++++++-------- 4 files changed, 114 insertions(+), 28 deletions(-) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index c6ea8f3dd..85d1bc626 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -276,6 +276,11 @@ pub fn snap_facet_id_is_null_docids(index: &Index) -> String { &format!("{facet_id:<3} {}", display_bitmap(&docids)) }) } +pub fn snap_facet_id_is_empty_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, facet_id_is_empty_docids, |(facet_id, docids)| { + &format!("{facet_id:<3} {}", display_bitmap(&docids)) + }) +} pub fn snap_facet_id_string_docids(index: &Index) -> String { make_db_snap_from_iter!(index, facet_id_string_docids, |( FacetGroupKey { field_id, level, left_bound }, @@ -503,6 +508,9 @@ macro_rules! full_snap_of_db { ($index:ident, facet_id_is_null_docids) => {{ $crate::snapshot_tests::snap_facet_id_is_null_docids(&$index) }}; + ($index:ident, facet_id_is_empty_docids) => {{ + $crate::snapshot_tests::snap_facet_id_is_empty_docids(&$index) + }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) }}; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7b9bd7834..a0bf1400d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1766,6 +1766,10 @@ mod tests { "id": 0, "colour": null, }, + { + "id": 1, + "colour": [null], // must not be returned + }, { "id": 6, "colour": { @@ -1835,14 +1839,14 @@ mod tests { .get(&rtxn, &BEU16::new(colour_green_id)) .unwrap() .unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![1]); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![2]); let bitmap_colour_blue = index .facet_id_is_null_docids .get(&rtxn, &BEU16::new(colour_blue_id)) .unwrap() .unwrap(); - assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![2]); + assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![3]); }; let faceted_fields = hashset!(S("colour")); @@ -1866,6 +1870,75 @@ mod tests { check_ok(&index); } + #[test] + fn index_documents_check_is_empty_database() { + let content = || { + documents!([ + {"id": 0, "tags": null }, + {"id": 1, "tags": [null] }, + {"id": 2, "tags": [] }, + {"id": 3, "tags": ["hello","world"] }, + {"id": 4, "tags": [""] }, + {"id": 5 }, + {"id": 6, "tags": {} }, + {"id": 7, "tags": {"green": "cool"} }, + {"id": 8, "tags": {"green": ""} }, + {"id": 9, "tags": "" }, + {"id": 10, "tags": { "green": null } }, + {"id": 11, "tags": { "green": { "blue": null } } }, + {"id": 12, "tags": { "green": { "blue": [] } } } + ]) + }; + + let check_ok = |index: &Index| { + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue"))); + + let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap(); + let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap(); + let tags_blue_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green.blue").unwrap(); + + let bitmap_empty_tags = + index.facet_id_is_empty_docids.get(&rtxn, &BEU16::new(tags_id)).unwrap().unwrap(); + assert_eq!(bitmap_empty_tags.into_iter().collect::>(), vec![2, 6, 9]); + + let bitmap_tags_green = index + .facet_id_is_empty_docids + .get(&rtxn, &BEU16::new(tags_green_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_tags_green.into_iter().collect::>(), vec![8]); + + let bitmap_tags_blue = index + .facet_id_is_empty_docids + .get(&rtxn, &BEU16::new(tags_blue_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_tags_blue.into_iter().collect::>(), vec![12]); + }; + + let faceted_fields = hashset!(S("tags")); + + let index = TempIndex::new(); + index.add_documents(content()).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + check_ok(&index); + + let index = TempIndex::new(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + index.add_documents(content()).unwrap(); + check_ok(&index); + } + #[test] fn primary_key_must_not_contain_floats() { let index = TempIndex::new_with_map_size(4096 * 100); diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 57ad6a40b..db5a004e0 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -93,6 +93,12 @@ test_filter!(null_filter_1_not, vec![Right("opt1 IS NOT NULL")]); test_filter!(null_filter_1_not_alt, vec![Right("NOT opt1 IS NULL")]); test_filter!(null_filter_1_double_not, vec![Right("NOT opt1 IS NOT NULL")]); +test_filter!(empty_filter_1, vec![Right("opt1 IS EMPTY")]); +test_filter!(empty_filter_2, vec![Right("opt1.opt2 IS EMPTY")]); +test_filter!(empty_filter_1_not, vec![Right("opt1 IS NOT EMPTY")]); +test_filter!(empty_filter_1_not_alt, vec![Right("NOT opt1 IS EMPTY")]); +test_filter!(empty_filter_1_double_not, vec![Right("NOT opt1 IS NOT EMPTY")]); + test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]); test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]); test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 51852cced..23744c005 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -212,10 +212,22 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { } else if matches!(filter, "opt1.opt2 IS NULL") { if document.opt1opt2.as_ref().map_or(false, |v| v.is_null()) { id = Some(document.id.clone()); - } else if let Some(opt1) = &document.opt1 { - if !opt1.is_null() { - id = contains_null_rec(opt1, "opt2").then(|| document.id.clone()); - } + } + } else if matches!(filter, "opt1 IS EMPTY" | "NOT opt1 IS NOT EMPTY") { + id = document + .opt1 + .as_ref() + .map_or(false, |v| is_empty_value(v)) + .then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 IS EMPTY" | "opt1 IS NOT EMPTY") { + id = document + .opt1 + .as_ref() + .map_or(true, |v| !is_empty_value(v)) + .then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 IS EMPTY") { + if document.opt1opt2.as_ref().map_or(false, |v| is_empty_value(v)) { + id = Some(document.id.clone()); } } else if matches!( filter, @@ -230,6 +242,15 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { id } +pub fn is_empty_value(v: &serde_json::Value) -> bool { + match v { + serde_json::Value::String(s) => s.is_empty(), + serde_json::Value::Array(a) => a.is_empty(), + serde_json::Value::Object(o) => o.is_empty(), + _ => false, + } +} + pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { match v { serde_json::Value::Array(v) => { @@ -252,28 +273,6 @@ pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { } } -pub fn contains_null_rec(v: &serde_json::Value, key: &str) -> bool { - match v { - serde_json::Value::Object(v) => { - for (k, v) in v.iter() { - if k == key && v.is_null() || contains_null_rec(v, key) { - return true; - } - } - false - } - serde_json::Value::Array(v) => { - for v in v.iter() { - if contains_null_rec(v, key) { - return true; - } - } - false - } - _ => false, - } -} - pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { let dataset: Vec = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); From 1a9c58a7abe9820a716b49d5f869e9bd4162b8db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 Mar 2023 16:56:44 +0100 Subject: [PATCH 19/21] Fix a bug with the new flattening rules --- .../extract/extract_docid_word_positions.rs | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 2d51fcc1a..ac5148363 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -107,7 +107,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st fn inner(value: &Value, output: &mut String) -> bool { use std::fmt::Write; match value { - Value::Null => false, + Value::Null | Value::Object(_) => false, Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), Value::Number(number) => write!(output, "{}", number).is_ok(), Value::String(string) => write!(output, "{}", string).is_ok(), @@ -122,23 +122,6 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st // check that at least one value was written count != 0 } - Value::Object(object) => { - let mut buffer = String::new(); - let mut count = 0; - for (key, value) in object { - buffer.clear(); - let _ = write!(&mut buffer, "{}: ", key); - if inner(value, &mut buffer) { - buffer.push_str(". "); - // We write the "key: value. " pair only when - // we are sure that the value can be written. - output.push_str(&buffer); - count += 1; - } - } - // check that at least one value was written - count != 0 - } } } From cf34d1c95f33d7ec6adeb1180901698e4d6ea916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 15 Mar 2023 16:57:09 +0100 Subject: [PATCH 20/21] Fix a test that forget to match a Null value --- milli/tests/search/mod.rs | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 23744c005..7f072ef95 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -212,13 +212,13 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { } else if matches!(filter, "opt1.opt2 IS NULL") { if document.opt1opt2.as_ref().map_or(false, |v| v.is_null()) { id = Some(document.id.clone()); + } else if let Some(opt1) = &document.opt1 { + if !opt1.is_null() { + id = contains_null_rec(opt1, "opt2").then(|| document.id.clone()); + } } } else if matches!(filter, "opt1 IS EMPTY" | "NOT opt1 IS NOT EMPTY") { - id = document - .opt1 - .as_ref() - .map_or(false, |v| is_empty_value(v)) - .then(|| document.id.clone()); + id = document.opt1.as_ref().map_or(false, is_empty_value).then(|| document.id.clone()); } else if matches!(filter, "NOT opt1 IS EMPTY" | "opt1 IS NOT EMPTY") { id = document .opt1 @@ -226,7 +226,7 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { .map_or(true, |v| !is_empty_value(v)) .then(|| document.id.clone()); } else if matches!(filter, "opt1.opt2 IS EMPTY") { - if document.opt1opt2.as_ref().map_or(false, |v| is_empty_value(v)) { + if document.opt1opt2.as_ref().map_or(false, is_empty_value) { id = Some(document.id.clone()); } } else if matches!( @@ -273,6 +273,28 @@ pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { } } +pub fn contains_null_rec(v: &serde_json::Value, key: &str) -> bool { + match v { + serde_json::Value::Object(v) => { + for (k, v) in v.iter() { + if k == key && v.is_null() || contains_null_rec(v, key) { + return true; + } + } + false + } + serde_json::Value::Array(v) => { + for v in v.iter() { + if contains_null_rec(v, key) { + return true; + } + } + false + } + _ => false, + } +} + pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { let dataset: Vec = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); From a8531053a0083ba9673d8470a935a70407fafccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 16 Mar 2023 11:09:20 +0100 Subject: [PATCH 21/21] Make sure the parser reject invalid syntax --- filter-parser/src/lib.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 69eb6700f..640009983 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -745,6 +745,39 @@ pub mod tests { Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. 5:7 NOT OR EXISTS AND EXISTS NOT EXISTS "###); + + insta::assert_display_snapshot!(p(r#"value NULL"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NULL`. + 1:11 value NULL + "###); + insta::assert_display_snapshot!(p(r#"value NOT NULL"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NOT NULL`. + 1:15 value NOT NULL + "###); + insta::assert_display_snapshot!(p(r#"value EMPTY"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value EMPTY`. + 1:12 value EMPTY + "###); + insta::assert_display_snapshot!(p(r#"value NOT EMPTY"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NOT EMPTY`. + 1:16 value NOT EMPTY + "###); + insta::assert_display_snapshot!(p(r#"value IS"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS`. + 1:9 value IS + "###); + insta::assert_display_snapshot!(p(r#"value IS NOT"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT`. + 1:13 value IS NOT + "###); + insta::assert_display_snapshot!(p(r#"value IS EXISTS"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS EXISTS`. + 1:16 value IS EXISTS + "###); + insta::assert_display_snapshot!(p(r#"value IS NOT EXISTS"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT EXISTS`. + 1:20 value IS NOT EXISTS + "###); } #[test]