mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Merge #3571
3571: Introduce two filters to select documents with `null` and empty fields r=irevoire a=Kerollmops # Pull Request ## Related issue This PR implements the `X IS NULL`, `X IS NOT NULL`, `X IS EMPTY`, `X IS NOT EMPTY` filters that [this comment](https://github.com/meilisearch/product/discussions/539#discussioncomment-5115884) is describing in a very detailed manner. ## What does this PR do? ### `IS NULL` and `IS NOT NULL` This PR will be exposed as a prototype for now. Below is the copy/pasted version of a spec that defines this filter. - `IS NULL` matches fields that `EXISTS` AND `= IS NULL` - `IS NOT NULL` matches fields that `NOT EXISTS` OR `!= IS NULL` 1. `{"name": "A", "price": null}` 2. `{"name": "A", "price": 10}` 3. `{"name": "A"}` `price IS NULL` would match 1 `price IS NOT NULL` or `NOT price IS NULL` would match 2,3 `price EXISTS` would match 1, 2 `price NOT EXISTS` or `NOT price EXISTS` would match 3 common query : `(price EXISTS) AND (price IS NOT NULL)` would match 2 ### `IS EMPTY` and `IS NOT EMPTY` - `IS EMPTY` matches Array `[]`, Object `{}`, or String `""` fields that `EXISTS` and are empty - `IS NOT EMPTY` matches fields that `NOT EXISTS` OR are not empty. 1. `{"name": "A", "tags": null}` 2. `{"name": "A", "tags": [null]}` 3. `{"name": "A", "tags": []}` 4. `{"name": "A", "tags": ["hello","world"]}` 5. `{"name": "A", "tags": [""]}` 6. `{"name": "A"}` 7. `{"name": "A", "tags": {}}` 8. `{"name": "A", "tags": {"t1":"v1"}}` 9. `{"name": "A", "tags": {"t1":""}}` 10. `{"name": "A", "tags": ""}` `tags IS EMPTY` would match 3,7,10 `tags IS NOT EMPTY` or `NOT tags IS EMPTY` would match 1,2,4,5,6,8,9 `tags IS NULL` would match 1 `tags IS NOT NULL` or `NOT tags IS NULL` would match 2,3,4,5,6,7,8,9,10 `tags EXISTS` would match 1,2,3,4,5,7,8,9,10 `tags NOT EXISTS` or `NOT tags EXISTS` would match 6 common query : `(tags EXISTS) AND (tags IS NOT NULL) AND (tags IS NOT EMPTY)` would match 2,4,5,8,9 ## What should the reviewer do? - Check that I tested the filters - Check that I deleted the ids of the documents when deleting documents Co-authored-by: Clément Renault <clement@meilisearch.com> Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
commit
414b3fae89
18 changed files with 730 additions and 118 deletions
|
@ -7,7 +7,7 @@ use std::mem::size_of;
|
|||
use heed::zerocopy::AsBytes;
|
||||
use heed::BytesEncode;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
use serde_json::{from_slice, Value};
|
||||
|
||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::InternalError;
|
||||
|
@ -15,6 +15,15 @@ use crate::facet::value_encoding::f64_into_bytes;
|
|||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The extracted facet values stored in grenad files by type.
|
||||
pub struct ExtractedFacetValues {
|
||||
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
|
||||
pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
|
||||
}
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
|
||||
|
@ -24,7 +33,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
|
@ -46,6 +55,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
);
|
||||
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
|
@ -69,33 +80,44 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
let (numbers, strings) = extract_facet_values(&value);
|
||||
|
||||
// insert facet numbers in sorter
|
||||
for number in numbers {
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
|
||||
match extract_facet_values(&value) {
|
||||
FilterableValues::Null => {
|
||||
facet_is_null_docids.entry(field_id).or_default().insert(document);
|
||||
}
|
||||
}
|
||||
FilterableValues::Empty => {
|
||||
facet_is_empty_docids.entry(field_id).or_default().insert(document);
|
||||
}
|
||||
FilterableValues::Values { numbers, strings } => {
|
||||
// insert facet numbers in sorter
|
||||
for number in numbers {
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
|
||||
let normalised_truncated_value: String = normalized
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
fid_docid_facet_numbers_sorter
|
||||
.insert(&key_buffer, ().as_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
key_buffer.extend_from_slice(normalised_truncated_value.as_bytes());
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;
|
||||
// insert normalized and original facet string in sorter
|
||||
for (normalized, original) in
|
||||
strings.into_iter().filter(|(n, _)| !n.is_empty())
|
||||
{
|
||||
let normalized_truncated_value: String = normalized
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
|
||||
fid_docid_facet_strings_sorter
|
||||
.insert(&key_buffer, original.as_bytes())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -112,14 +134,48 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
}
|
||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
facet_exists_docids_reader,
|
||||
))
|
||||
let mut facet_is_null_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, bitmap) in facet_is_null_docids.into_iter() {
|
||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||
}
|
||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||
|
||||
let mut facet_is_empty_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
|
||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||
}
|
||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||
|
||||
Ok(ExtractedFacetValues {
|
||||
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||
})
|
||||
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
|
||||
/// Represent what a document field contains.
|
||||
enum FilterableValues {
|
||||
/// Corresponds to the JSON `null` value.
|
||||
Null,
|
||||
/// Corresponds to either, an empty string `""`, an empty array `[]`, or an empty object `{}`.
|
||||
Empty,
|
||||
/// Represents all the numbers and strings values found in this document field.
|
||||
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
|
||||
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> FilterableValues {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
|
@ -149,9 +205,16 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
|
|||
}
|
||||
}
|
||||
|
||||
let mut facet_number_values = Vec::new();
|
||||
let mut facet_string_values = Vec::new();
|
||||
inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
|
||||
|
||||
(facet_number_values, facet_string_values)
|
||||
match value {
|
||||
Value::Null => FilterableValues::Null,
|
||||
Value::String(s) if s.is_empty() => FilterableValues::Empty,
|
||||
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
|
||||
Value::Object(o) if o.is_empty() => FilterableValues::Empty,
|
||||
otherwise => {
|
||||
let mut numbers = Vec::new();
|
||||
let mut strings = Vec::new();
|
||||
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
|
||||
FilterableValues::Values { numbers, strings }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue