3571: Introduce two filters to select documents with `null` and empty fields r=irevoire a=Kerollmops

# Pull Request

## Related issue
This PR implements the `X IS NULL`, `X IS NOT NULL`, `X IS EMPTY`, `X IS NOT EMPTY` filters that [this comment](https://github.com/meilisearch/product/discussions/539#discussioncomment-5115884) describes in detail.

## What does this PR do?

### `IS NULL` and `IS NOT NULL`

This PR will be exposed as a prototype for now. Below is the copy/pasted version of a spec that defines this filter.

- `IS NULL` matches fields that `EXISTS` AND are `null`
- `IS NOT NULL` matches fields that `NOT EXISTS` OR are not `null`

1. `{"name": "A", "price": null}`
2. `{"name": "A", "price": 10}`
3. `{"name": "A"}`

`price IS NULL` would match 1
`price IS NOT NULL` or `NOT price IS NULL` would match 2,3
`price EXISTS` would match 1, 2
`price NOT EXISTS` or `NOT price EXISTS` would match 3

common query : `(price EXISTS) AND (price IS NOT NULL)` would match 2

### `IS EMPTY` and `IS NOT EMPTY`

- `IS EMPTY` matches Array `[]`, Object `{}`, or String `""` fields that `EXISTS` and are empty
- `IS NOT EMPTY` matches fields that `NOT EXISTS` OR are not empty.

1. `{"name": "A", "tags": null}`
2. `{"name": "A", "tags": [null]}`
3. `{"name": "A", "tags": []}`
4. `{"name": "A", "tags": ["hello","world"]}`
5. `{"name": "A", "tags": [""]}`
6. `{"name": "A"}`
7. `{"name": "A", "tags": {}}`
8. `{"name": "A", "tags": {"t1":"v1"}}`
9. `{"name": "A", "tags": {"t1":""}}`
10. `{"name": "A", "tags": ""}`

`tags IS EMPTY` would match 3,7,10
`tags IS NOT EMPTY` or `NOT tags IS EMPTY` would match 1,2,4,5,6,8,9
`tags IS NULL` would match 1
`tags IS NOT NULL` or `NOT tags IS NULL` would match 2,3,4,5,6,7,8,9,10
`tags EXISTS` would match 1,2,3,4,5,7,8,9,10
`tags NOT EXISTS` or `NOT tags EXISTS` would match 6

common query : `(tags EXISTS) AND (tags IS NOT NULL) AND (tags IS NOT EMPTY)` would match 2,4,5,8,9

## What should the reviewer do?

- Check that I tested the filters
- Check that I deleted the ids of the documents when deleting documents


Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2023-04-27 13:14:00 +00:00 committed by GitHub
commit 414b3fae89
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 730 additions and 118 deletions

View file

@ -80,6 +80,8 @@ pub mod db_name {
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@ -129,6 +131,10 @@ pub struct Index {
/// Maps the facet field id and the docids for which this field exists
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field is set as null
pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field is considered empty
pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
@ -153,7 +159,7 @@ impl Index {
) -> Result<Index> {
use db_name::*;
options.max_dbs(19);
options.max_dbs(21);
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?;
@ -175,6 +181,8 @@ impl Index {
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?;
let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?;
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings =
@ -201,6 +209,8 @@ impl Index {
facet_id_f64_docids,
facet_id_string_docids,
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents,
@ -833,6 +843,30 @@ impl Index {
}
}
/// Returns the ids of the documents in which the field `field_id` is set to `null`.
///
/// An empty bitmap is returned when the database holds no entry for this field id.
pub fn null_faceted_documents_ids(
    &self,
    rtxn: &RoTxn,
    field_id: FieldId,
) -> heed::Result<RoaringBitmap> {
    let key = BEU16::new(field_id);
    let stored = self.facet_id_is_null_docids.get(rtxn, &key)?;
    // A missing entry simply means that no document has this field set to `null`.
    Ok(stored.unwrap_or_else(RoaringBitmap::new))
}
/// Returns the ids of the documents in which the field `field_id` is considered empty.
///
/// An empty bitmap is returned when the database holds no entry for this field id.
pub fn empty_faceted_documents_ids(
    &self,
    rtxn: &RoTxn,
    field_id: FieldId,
) -> heed::Result<RoaringBitmap> {
    let key = BEU16::new(field_id);
    let stored = self.facet_id_is_empty_docids.get(rtxn, &key)?;
    // A missing entry simply means that no document has this field considered empty.
    Ok(stored.unwrap_or_else(RoaringBitmap::new))
}
/// Retrieve all the documents which contain this field id
pub fn exists_faceted_documents_ids(
&self,

View file

@ -211,6 +211,14 @@ impl<'a> Filter<'a> {
Condition::Between { from, to } => {
(Included(from.parse_finite_float()?), Included(to.parse_finite_float()?))
}
Condition::Null => {
let is_null = index.null_faceted_documents_ids(rtxn, field_id)?;
return Ok(is_null);
}
Condition::Empty => {
let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?;
return Ok(is_empty);
}
Condition::Exists => {
let exist = index.exists_faceted_documents_ids(rtxn, field_id)?;
return Ok(exist);

View file

@ -271,6 +271,16 @@ pub fn snap_facet_id_exists_docids(index: &Index) -> String {
&format!("{facet_id:<3} {}", display_bitmap(&docids))
})
}
/// Builds the textual snapshot of the `facet_id_is_null_docids` database of `index`.
///
/// The entries are handed to `make_db_snap_from_iter!`; each `(facet_id, docids)` entry is
/// rendered as the facet id left-aligned and padded to 3 characters, a space, and the
/// output of `display_bitmap` for the document ids.
pub fn snap_facet_id_is_null_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, facet_id_is_null_docids, |(facet_id, docids)| {
&format!("{facet_id:<3} {}", display_bitmap(&docids))
})
}
/// Builds the textual snapshot of the `facet_id_is_empty_docids` database of `index`.
///
/// The entries are handed to `make_db_snap_from_iter!`; each `(facet_id, docids)` entry is
/// rendered as the facet id left-aligned and padded to 3 characters, a space, and the
/// output of `display_bitmap` for the document ids.
pub fn snap_facet_id_is_empty_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, facet_id_is_empty_docids, |(facet_id, docids)| {
&format!("{facet_id:<3} {}", display_bitmap(&docids))
})
}
pub fn snap_facet_id_string_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, facet_id_string_docids, |(
FacetGroupKey { field_id, level, left_bound },
@ -495,6 +505,12 @@ macro_rules! full_snap_of_db {
($index:ident, facet_id_exists_docids) => {{
$crate::snapshot_tests::snap_facet_id_exists_docids(&$index)
}};
($index:ident, facet_id_is_null_docids) => {{
$crate::snapshot_tests::snap_facet_id_is_null_docids(&$index)
}};
($index:ident, facet_id_is_empty_docids) => {{
$crate::snapshot_tests::snap_facet_id_is_empty_docids(&$index)
}};
($index:ident, documents_ids) => {{
$crate::snapshot_tests::snap_documents_ids(&$index)
}};

View file

@ -34,6 +34,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_f64_docids,
facet_id_string_docids,
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents,
@ -86,6 +88,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
script_language_docids.clear(self.wtxn)?;
facet_id_f64_docids.clear(self.wtxn)?;
facet_id_exists_docids.clear(self.wtxn)?;
facet_id_is_null_docids.clear(self.wtxn)?;
facet_id_is_empty_docids.clear(self.wtxn)?;
facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?;

View file

@ -245,6 +245,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
field_id_docid_facet_strings: _,
script_language_docids,
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
documents,
} = self.index;
@ -517,12 +519,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_id_exists_docids(
remove_docids_from_facet_id_docids(
self.wtxn,
facet_id_exists_docids,
&self.to_delete_docids,
)?;
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_id_docids(
self.wtxn,
facet_id_is_null_docids,
&self.to_delete_docids,
)?;
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_id_docids(
self.wtxn,
facet_id_is_empty_docids,
&self.to_delete_docids,
)?;
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
Ok(DetailedDocumentDeletionResult {
@ -625,7 +641,7 @@ fn remove_docids_from_field_id_docid_facet_value(
Ok(all_affected_facet_values)
}
fn remove_docids_from_facet_id_exists_docids<'a, C>(
fn remove_docids_from_facet_id_docids<'a, C>(
wtxn: &'a mut heed::RwTxn,
db: &heed::Database<C, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,

View file

@ -181,7 +181,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
fn inner(value: &Value, output: &mut String) -> bool {
use std::fmt::Write;
match value {
Value::Null => false,
Value::Null | Value::Object(_) => false,
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
Value::Number(number) => write!(output, "{}", number).is_ok(),
Value::String(string) => write!(output, "{}", string).is_ok(),
@ -196,23 +196,6 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
// check that at least one value was written
count != 0
}
Value::Object(object) => {
let mut buffer = String::new();
let mut count = 0;
for (key, value) in object {
buffer.clear();
let _ = write!(&mut buffer, "{}: ", key);
if inner(value, &mut buffer) {
buffer.push_str(". ");
// We write the "key: value. " pair only when
// we are sure that the value can be written.
output.push_str(&buffer);
count += 1;
}
}
// check that at least one value was written
count != 0
}
}
}

View file

@ -7,7 +7,7 @@ use std::mem::size_of;
use heed::zerocopy::AsBytes;
use heed::BytesEncode;
use roaring::RoaringBitmap;
use serde_json::Value;
use serde_json::{from_slice, Value};
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
use crate::error::InternalError;
@ -15,6 +15,15 @@ use crate::facet::value_encoding::f64_into_bytes;
use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
/// The extracted facet values stored in grenad files by type.
///
/// This is what `extract_fid_docid_facet_values` returns: one grenad reader per kind of
/// facet data extracted from a chunk of documents.
pub struct ExtractedFacetValues {
/// Numbers found in the faceted fields. Each key starts with the field id and document id
/// prefix, followed by the bytes from `f64_into_bytes` and the big-endian bytes of the
/// number; the value is empty.
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
/// Strings found in the faceted fields. Each key starts with the field id and document id
/// prefix, followed by the normalized (and truncated) string; the value is the original string.
pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
/// For each field id (big-endian key), the `CboRoaringBitmapCodec`-encoded bitmap of the
/// documents in which this field is the JSON `null` value.
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
/// For each field id (big-endian key), the `CboRoaringBitmapCodec`-encoded bitmap of the
/// documents in which this field is an empty string, array or object.
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
/// The documents in which each faceted field exists.
// NOTE(review): presumably keyed and encoded like the two chunks above — the code that
// writes this chunk is not visible here, confirm.
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
}
/// Extracts the facet values of each faceted field of each document.
///
/// Returns the generated grenad reader containing the docid, the fid and the original value as key
@ -24,7 +33,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
faceted_fields: &HashSet<FieldId>,
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
) -> Result<ExtractedFacetValues> {
let max_memory = indexer.max_memory_by_thread();
let mut fid_docid_facet_numbers_sorter = create_sorter(
@ -46,6 +55,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
);
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
@ -69,33 +80,44 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
// For the other extraction tasks, prefix the key with the field_id and the document_id
key_buffer.extend_from_slice(docid_bytes);
let value =
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
let (numbers, strings) = extract_facet_values(&value);
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
match extract_facet_values(&value) {
FilterableValues::Null => {
facet_is_null_docids.entry(field_id).or_default().insert(document);
}
}
FilterableValues::Empty => {
facet_is_empty_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Values { numbers, strings } => {
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// insert normalized and original facet string in sorter
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
let normalised_truncated_value: String = normalized
.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
fid_docid_facet_numbers_sorter
.insert(&key_buffer, ().as_bytes())?;
}
}
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalised_truncated_value.as_bytes());
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;
// insert normalized and original facet string in sorter
for (normalized, original) in
strings.into_iter().filter(|(n, _)| !n.is_empty())
{
let normalized_truncated_value: String = normalized
.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
fid_docid_facet_strings_sorter
.insert(&key_buffer, original.as_bytes())?;
}
}
}
}
}
@ -112,14 +134,48 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
}
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
Ok((
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
facet_exists_docids_reader,
))
let mut facet_is_null_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_null_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
}
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
let mut facet_is_empty_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
}
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
Ok(ExtractedFacetValues {
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
})
}
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
/// Represents what a document field contains, from the point of view of the filters.
enum FilterableValues {
/// Corresponds to the JSON `null` value.
Null,
/// Corresponds to either an empty string `""`, an empty array `[]`, or an empty object `{}`.
Empty,
/// Represents all the number and string values found in this document field.
///
/// Each entry of `strings` is a `(normalized, original)` pair.
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
fn extract_facet_values(value: &Value) -> FilterableValues {
fn inner_extract_facet_values(
value: &Value,
can_recurse: bool,
@ -149,9 +205,16 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
}
}
let mut facet_number_values = Vec::new();
let mut facet_string_values = Vec::new();
inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
(facet_number_values, facet_string_values)
match value {
Value::Null => FilterableValues::Null,
Value::String(s) if s.is_empty() => FilterableValues::Empty,
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
Value::Object(o) if o.is_empty() => FilterableValues::Empty,
otherwise => {
let mut numbers = Vec::new();
let mut strings = Vec::new();
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
FilterableValues::Values { numbers, strings }
}
}
}

View file

@ -18,7 +18,7 @@ use rayon::prelude::*;
use self::extract_docid_word_positions::extract_docid_word_positions;
use self::extract_facet_number_docids::extract_facet_number_docids;
use self::extract_facet_string_docids::extract_facet_string_docids;
use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points;
use self::extract_word_docids::extract_word_docids;
@ -55,28 +55,35 @@ pub(crate) fn data_from_obkv_documents(
.collect::<Result<()>>()?;
#[allow(clippy::type_complexity)]
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks
.par_bridge()
.map(|flattened_obkv_chunks| {
send_and_extract_flattened_documents_data(
flattened_obkv_chunks,
indexer,
lmdb_writer_sx.clone(),
&searchable_fields,
&faceted_fields,
primary_key_id,
geo_fields_ids,
&stop_words,
max_positions_per_attributes,
)
})
.collect();
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
flattened_obkv_chunks
.par_bridge()
.map(|flattened_obkv_chunks| {
send_and_extract_flattened_documents_data(
flattened_obkv_chunks,
indexer,
lmdb_writer_sx.clone(),
&searchable_fields,
&faceted_fields,
primary_key_id,
geo_fields_ids,
&stop_words,
max_positions_per_attributes,
)
})
.collect();
let (
docid_word_positions_chunks,
(
docid_fid_facet_numbers_chunks,
(docid_fid_facet_strings_chunks, facet_exists_docids_chunks),
(
docid_fid_facet_strings_chunks,
(
facet_is_null_docids_chunks,
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
),
),
),
) = result?;
@ -96,6 +103,38 @@ pub(crate) fn data_from_obkv_documents(
});
}
// merge facet_is_null_docids and send them as a typed chunk
{
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!("merge {} database", "facet-id-is-null-docids");
match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
});
}
// merge facet_is_empty_docids and send them as a typed chunk
{
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!("merge {} database", "facet-id-is-empty-docids");
match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
});
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(),
indexer,
@ -235,7 +274,10 @@ fn send_and_extract_flattened_documents_data(
grenad::Reader<CursorClonableMmap>,
(
grenad::Reader<CursorClonableMmap>,
(grenad::Reader<CursorClonableMmap>, grenad::Reader<File>),
(
grenad::Reader<CursorClonableMmap>,
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
),
),
)> {
let flattened_documents_chunk =
@ -281,11 +323,13 @@ fn send_and_extract_flattened_documents_data(
Ok(docid_word_positions_chunk)
},
|| {
let (
let ExtractedFacetValues {
docid_fid_facet_numbers_chunk,
docid_fid_facet_strings_chunk,
fid_facet_is_null_docids_chunk,
fid_facet_is_empty_docids_chunk,
fid_facet_exists_docids_chunk,
) = extract_fid_docid_facet_values(
} = extract_fid_docid_facet_values(
flattened_documents_chunk.clone(),
indexer,
faceted_fields,
@ -309,7 +353,13 @@ fn send_and_extract_flattened_documents_data(
Ok((
docid_fid_facet_numbers_chunk,
(docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk),
(
docid_fid_facet_strings_chunk,
(
fid_facet_is_null_docids_chunk,
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
),
),
))
},
);

View file

@ -1757,6 +1757,187 @@ mod tests {
check_ok(&index);
}
/// Checks the content of the `facet_id_is_null_docids` database: only the documents whose
/// field is exactly `null` must be registered, whether the filterable fields are declared
/// before or after the documents are added.
#[test]
fn index_documents_check_is_null_database() {
    let content = || {
        documents!([
            { "id": 0, "colour": null },
            { "id": 1, "colour": [null] }, // must not be returned
            { "id": 6, "colour": { "green": null } },
            { "id": 7, "colour": { "green": { "blue": null } } },
            { "id": 8, "colour": 0 },
            { "id": 9, "colour": [] },
            { "id": 10, "colour": {} },
            { "id": 12, "colour": [1] },
            { "id": 13 },
            { "id": 14, "colour": { "green": 1 } },
            { "id": 15, "colour": { "green": { "blue": [] } } }
        ])
    };

    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();

        let facets = index.faceted_fields(&rtxn).unwrap();
        assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));

        // Resolve every field id first, then read the matching bitmaps.
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let [colour_id, colour_green_id, colour_blue_id] =
            ["colour", "colour.green", "colour.green.blue"]
                .map(|name| fields_ids_map.id(name).unwrap());

        let null_docids = |fid| -> Vec<u32> {
            let bitmap =
                index.facet_id_is_null_docids.get(&rtxn, &BEU16::new(fid)).unwrap().unwrap();
            bitmap.into_iter().collect()
        };

        // The expected values are internal document ids (insertion order).
        assert_eq!(null_docids(colour_id), vec![0]);
        assert_eq!(null_docids(colour_green_id), vec![2]);
        assert_eq!(null_docids(colour_blue_id), vec![3]);
    };

    let faceted_fields = hashset!(S("colour"));

    // First scenario: the documents are indexed before the settings are applied.
    let index = TempIndex::new();
    index.add_documents(content()).unwrap();
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    check_ok(&index);

    // Second scenario: the settings are applied before the documents are indexed.
    let index = TempIndex::new();
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    index.add_documents(content()).unwrap();
    check_ok(&index);
}
/// Checks the content of the `facet_id_is_empty_docids` database: only the documents whose
/// field is `""`, `[]` or `{}` must be registered, whether the filterable fields are declared
/// before or after the documents are added.
#[test]
fn index_documents_check_is_empty_database() {
    let content = || {
        documents!([
            {"id": 0, "tags": null },
            {"id": 1, "tags": [null] },
            {"id": 2, "tags": [] },
            {"id": 3, "tags": ["hello","world"] },
            {"id": 4, "tags": [""] },
            {"id": 5 },
            {"id": 6, "tags": {} },
            {"id": 7, "tags": {"green": "cool"} },
            {"id": 8, "tags": {"green": ""} },
            {"id": 9, "tags": "" },
            {"id": 10, "tags": { "green": null } },
            {"id": 11, "tags": { "green": { "blue": null } } },
            {"id": 12, "tags": { "green": { "blue": [] } } }
        ])
    };

    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();

        let facets = index.faceted_fields(&rtxn).unwrap();
        assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue")));

        // Resolve every field id first, then read the matching bitmaps.
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let [tags_id, tags_green_id, tags_blue_id] =
            ["tags", "tags.green", "tags.green.blue"].map(|name| fields_ids_map.id(name).unwrap());

        let empty_docids = |fid| -> Vec<u32> {
            let bitmap =
                index.facet_id_is_empty_docids.get(&rtxn, &BEU16::new(fid)).unwrap().unwrap();
            bitmap.into_iter().collect()
        };

        assert_eq!(empty_docids(tags_id), vec![2, 6, 9]);
        assert_eq!(empty_docids(tags_green_id), vec![8]);
        assert_eq!(empty_docids(tags_blue_id), vec![12]);
    };

    let faceted_fields = hashset!(S("tags"));

    // First scenario: the documents are indexed before the settings are applied.
    let index = TempIndex::new();
    index.add_documents(content()).unwrap();
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    check_ok(&index);

    // Second scenario: the settings are applied before the documents are indexed.
    let index = TempIndex::new();
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(faceted_fields.clone());
        })
        .unwrap();
    index.add_documents(content()).unwrap();
    check_ok(&index);
}
#[test]
fn primary_key_must_not_contain_floats() {
let index = TempIndex::new_with_map_size(4096 * 100);

View file

@ -39,6 +39,8 @@ pub(crate) enum TypedChunk {
FieldIdFacetStringDocids(grenad::Reader<File>),
FieldIdFacetNumberDocids(grenad::Reader<File>),
FieldIdFacetExistsDocids(grenad::Reader<File>),
FieldIdFacetIsNullDocids(grenad::Reader<File>),
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
GeoPoints(grenad::Reader<File>),
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
}
@ -161,6 +163,28 @@ pub(crate) fn write_typed_chunk_into_index(
)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => {
append_entries_into_database(
facet_id_is_null_docids,
&index.facet_id_is_null_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
append_entries_into_database(
facet_id_is_empty_docids,
&index.facet_id_is_empty_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
is_merged_database = true;
}
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
append_entries_into_database(
word_pair_proximity_docids_iter,