mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 20:07:09 +02:00
Merge #3571
3571: Introduce two filters to select documents with `null` and empty fields r=irevoire a=Kerollmops

# Pull Request

## Related issue

This PR implements the `X IS NULL`, `X IS NOT NULL`, `X IS EMPTY`, and `X IS NOT EMPTY` filters that [this comment](https://github.com/meilisearch/product/discussions/539#discussioncomment-5115884) describes in detail.

## What does this PR do?

### `IS NULL` and `IS NOT NULL`

This PR will be exposed as a prototype for now. Below is a copy of the spec that defines this filter.

- `IS NULL` matches fields that `EXISTS` AND `= IS NULL`
- `IS NOT NULL` matches fields that `NOT EXISTS` OR `!= IS NULL`

1. `{"name": "A", "price": null}`
2. `{"name": "A", "price": 10}`
3. `{"name": "A"}`

- `price IS NULL` would match 1
- `price IS NOT NULL` or `NOT price IS NULL` would match 2, 3
- `price EXISTS` would match 1, 2
- `price NOT EXISTS` or `NOT price EXISTS` would match 3

Common query: `(price EXISTS) AND (price IS NOT NULL)` would match 2.

### `IS EMPTY` and `IS NOT EMPTY`

- `IS EMPTY` matches array `[]`, object `{}`, or string `""` fields that `EXISTS` and are empty
- `IS NOT EMPTY` matches fields that `NOT EXISTS` OR are not empty

1. `{"name": "A", "tags": null}`
2. `{"name": "A", "tags": [null]}`
3. `{"name": "A", "tags": []}`
4. `{"name": "A", "tags": ["hello","world"]}`
5. `{"name": "A", "tags": [""]}`
6. `{"name": "A"}`
7. `{"name": "A", "tags": {}}`
8. `{"name": "A", "tags": {"t1":"v1"}}`
9. `{"name": "A", "tags": {"t1":""}}`
10. `{"name": "A", "tags": ""}`

- `tags IS EMPTY` would match 3, 7, 10
- `tags IS NOT EMPTY` or `NOT tags IS EMPTY` would match 1, 2, 4, 5, 6, 8, 9
- `tags IS NULL` would match 1
- `tags IS NOT NULL` or `NOT tags IS NULL` would match 2, 3, 4, 5, 6, 7, 8, 9, 10
- `tags EXISTS` would match 1, 2, 3, 4, 5, 7, 8, 9, 10
- `tags NOT EXISTS` or `NOT tags EXISTS` would match 6

Common query: `(tags EXISTS) AND (tags IS NOT NULL) AND (tags IS NOT EMPTY)` would match 2, 4, 5, 8, 9.

## What should the reviewer do?

- Check that I tested the filters
- Check that I deleted the ids of the documents when deleting documents

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
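As a quick illustration of the last example, the combined filter can be evaluated directly through milli's `Filter` API. This is a minimal sketch rather than code from this PR, and it assumes the `Filter::from_str`/`evaluate` signatures used by the existing test suite:

```rust
use milli::{Filter, Index, Result};
use roaring::RoaringBitmap;

/// Ids of documents whose `tags` field exists, is not `null`, and is not empty,
/// i.e. the "common query" from the description above.
fn tags_with_real_values(index: &Index, rtxn: &milli::heed::RoTxn) -> Result<RoaringBitmap> {
    let filter = Filter::from_str("tags EXISTS AND tags IS NOT NULL AND tags IS NOT EMPTY")?
        .expect("the filter expression is not empty");
    filter.evaluate(rtxn, index)
}
```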
commit 414b3fae89
18 changed files with 730 additions and 118 deletions
```diff
@@ -80,6 +80,8 @@ pub mod db_name {
     pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
     pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
     pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
+    pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
+    pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
     pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@@ -129,6 +131,10 @@ pub struct Index {
 
     /// Maps the facet field id and the docids for which this field exists
     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
+    /// Maps the facet field id and the docids for which this field is set as null
+    pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
+    /// Maps the facet field id and the docids for which this field is considered empty
+    pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
 
     /// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
     pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
@@ -153,7 +159,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(19);
+        options.max_dbs(21);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
@@ -175,6 +181,8 @@ impl Index {
         let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
         let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
         let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
+        let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?;
+        let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?;
 
         let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
         let field_id_docid_facet_strings =
@@ -201,6 +209,8 @@ impl Index {
             facet_id_f64_docids,
             facet_id_string_docids,
             facet_id_exists_docids,
+            facet_id_is_null_docids,
+            facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
             documents,
@@ -833,6 +843,30 @@ impl Index {
         }
     }
 
+    /// Retrieve all the documents which contain this field id set as null
+    pub fn null_faceted_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        field_id: FieldId,
+    ) -> heed::Result<RoaringBitmap> {
+        match self.facet_id_is_null_docids.get(rtxn, &BEU16::new(field_id))? {
+            Some(docids) => Ok(docids),
+            None => Ok(RoaringBitmap::new()),
+        }
+    }
+
+    /// Retrieve all the documents which contain this field id and that is considered empty
+    pub fn empty_faceted_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        field_id: FieldId,
+    ) -> heed::Result<RoaringBitmap> {
+        match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? {
+            Some(docids) => Ok(docids),
+            None => Ok(RoaringBitmap::new()),
+        }
+    }
+
     /// Retrieve all the documents which contain this field id
     pub fn exists_faceted_documents_ids(
         &self,
```
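As a side note, the two accessors added to `Index` above can be combined by callers. A minimal sketch (the `null_or_empty` helper is hypothetical, not part of the diff), assuming the `milli` crate and its `heed` re-export:

```rust
use milli::heed::RoTxn;
use milli::{FieldId, Index};
use roaring::RoaringBitmap;

/// All documents whose faceted field `field_id` is either set to `null` or empty.
fn null_or_empty(
    index: &Index,
    rtxn: &RoTxn,
    field_id: FieldId,
) -> milli::heed::Result<RoaringBitmap> {
    let is_null = index.null_faceted_documents_ids(rtxn, field_id)?;
    let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?;
    Ok(is_null | is_empty)
}
```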
```diff
@@ -211,6 +211,14 @@ impl<'a> Filter<'a> {
             Condition::Between { from, to } => {
                 (Included(from.parse_finite_float()?), Included(to.parse_finite_float()?))
             }
+            Condition::Null => {
+                let is_null = index.null_faceted_documents_ids(rtxn, field_id)?;
+                return Ok(is_null);
+            }
+            Condition::Empty => {
+                let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?;
+                return Ok(is_empty);
+            }
             Condition::Exists => {
                 let exist = index.exists_faceted_documents_ids(rtxn, field_id)?;
                 return Ok(exist);
```
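As the examples in the description point out, the negated forms also match documents that do not have the field at all: `IS NOT NULL` behaves as the complement of the `IS NULL` bitmap over all documents. A milli-independent sketch of that bitmap arithmetic:

```rust
use roaring::RoaringBitmap;

/// `price IS NOT NULL` as the complement of the `price IS NULL` bitmap:
/// documents without a `price` field at all are part of the result,
/// exactly as in the examples from the description.
fn is_not_null(all_documents: &RoaringBitmap, is_null: &RoaringBitmap) -> RoaringBitmap {
    all_documents - is_null
}
```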
```diff
@@ -271,6 +271,16 @@ pub fn snap_facet_id_exists_docids(index: &Index) -> String {
         &format!("{facet_id:<3} {}", display_bitmap(&docids))
     })
 }
+pub fn snap_facet_id_is_null_docids(index: &Index) -> String {
+    make_db_snap_from_iter!(index, facet_id_is_null_docids, |(facet_id, docids)| {
+        &format!("{facet_id:<3} {}", display_bitmap(&docids))
+    })
+}
+pub fn snap_facet_id_is_empty_docids(index: &Index) -> String {
+    make_db_snap_from_iter!(index, facet_id_is_empty_docids, |(facet_id, docids)| {
+        &format!("{facet_id:<3} {}", display_bitmap(&docids))
+    })
+}
 pub fn snap_facet_id_string_docids(index: &Index) -> String {
     make_db_snap_from_iter!(index, facet_id_string_docids, |(
         FacetGroupKey { field_id, level, left_bound },
@@ -495,6 +505,12 @@ macro_rules! full_snap_of_db {
     ($index:ident, facet_id_exists_docids) => {{
         $crate::snapshot_tests::snap_facet_id_exists_docids(&$index)
     }};
+    ($index:ident, facet_id_is_null_docids) => {{
+        $crate::snapshot_tests::snap_facet_id_is_null_docids(&$index)
+    }};
+    ($index:ident, facet_id_is_empty_docids) => {{
+        $crate::snapshot_tests::snap_facet_id_is_empty_docids(&$index)
+    }};
     ($index:ident, documents_ids) => {{
         $crate::snapshot_tests::snap_documents_ids(&$index)
     }};
```
```diff
@@ -34,6 +34,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             facet_id_f64_docids,
             facet_id_string_docids,
             facet_id_exists_docids,
+            facet_id_is_null_docids,
+            facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
             documents,
@@ -86,6 +88,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         script_language_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
         facet_id_exists_docids.clear(self.wtxn)?;
+        facet_id_is_null_docids.clear(self.wtxn)?;
+        facet_id_is_empty_docids.clear(self.wtxn)?;
         facet_id_string_docids.clear(self.wtxn)?;
         field_id_docid_facet_f64s.clear(self.wtxn)?;
         field_id_docid_facet_strings.clear(self.wtxn)?;
```
```diff
@@ -245,6 +245,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             field_id_docid_facet_strings: _,
             script_language_docids,
             facet_id_exists_docids,
+            facet_id_is_null_docids,
+            facet_id_is_empty_docids,
             documents,
         } = self.index;
 
@@ -517,12 +519,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 
         drop(iter);
         // We delete the documents ids that are under the facet field id values.
-        remove_docids_from_facet_id_exists_docids(
+        remove_docids_from_facet_id_docids(
             self.wtxn,
             facet_id_exists_docids,
             &self.to_delete_docids,
         )?;
 
+        // We delete the documents ids that are under the facet field id values.
+        remove_docids_from_facet_id_docids(
+            self.wtxn,
+            facet_id_is_null_docids,
+            &self.to_delete_docids,
+        )?;
+
+        // We delete the documents ids that are under the facet field id values.
+        remove_docids_from_facet_id_docids(
+            self.wtxn,
+            facet_id_is_empty_docids,
+            &self.to_delete_docids,
+        )?;
+
         self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
 
         Ok(DetailedDocumentDeletionResult {
@@ -625,7 +641,7 @@ fn remove_docids_from_field_id_docid_facet_value(
     Ok(all_affected_facet_values)
 }
 
-fn remove_docids_from_facet_id_exists_docids<'a, C>(
+fn remove_docids_from_facet_id_docids<'a, C>(
     wtxn: &'a mut heed::RwTxn,
     db: &heed::Database<C, CboRoaringBitmapCodec>,
     to_remove: &RoaringBitmap,
```
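Stripped of the LMDB plumbing, what the renamed `remove_docids_from_facet_id_docids` helper does to each of these three databases amounts to the following in-memory sketch (a `BTreeMap` stands in for the heed database):

```rust
use std::collections::BTreeMap;

use roaring::RoaringBitmap;

/// Subtract the deleted document ids from every field's bitmap and
/// drop entries whose bitmap becomes empty.
fn remove_docids(db: &mut BTreeMap<u16, RoaringBitmap>, to_delete: &RoaringBitmap) {
    db.retain(|_field_id, docids| {
        *docids -= to_delete;
        !docids.is_empty()
    });
}
```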
```diff
@@ -181,7 +181,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
     fn inner(value: &Value, output: &mut String) -> bool {
         use std::fmt::Write;
         match value {
-            Value::Null => false,
+            Value::Null | Value::Object(_) => false,
             Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
             Value::Number(number) => write!(output, "{}", number).is_ok(),
             Value::String(string) => write!(output, "{}", string).is_ok(),
@@ -196,23 +196,6 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
                 // check that at least one value was written
                 count != 0
             }
-            Value::Object(object) => {
-                let mut buffer = String::new();
-                let mut count = 0;
-                for (key, value) in object {
-                    buffer.clear();
-                    let _ = write!(&mut buffer, "{}: ", key);
-                    if inner(value, &mut buffer) {
-                        buffer.push_str(". ");
-                        // We write the "key: value. " pair only when
-                        // we are sure that the value can be written.
-                        output.push_str(&buffer);
-                        count += 1;
-                    }
-                }
-                // check that at least one value was written
-                count != 0
-            }
         }
     }
 
```
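The new `Value::Null | Value::Object(_)` arm means a nested object no longer contributes a `key: value. ` pair to the searchable text built by this helper. A tiny illustrative stand-in (not the crate's code) for that arm:

```rust
use serde_json::{json, Value};

/// Mirrors the new `Value::Null | Value::Object(_) => false` arm:
/// these values write nothing to the searchable text buffer.
fn writes_searchable_text(value: &Value) -> bool {
    !matches!(value, Value::Null | Value::Object(_))
}

fn main() {
    assert!(!writes_searchable_text(&json!({ "green": "cool" }))); // objects no longer written
    assert!(writes_searchable_text(&json!("cool"))); // plain strings still are
}
```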
```diff
@@ -7,7 +7,7 @@ use std::mem::size_of;
 use heed::zerocopy::AsBytes;
 use heed::BytesEncode;
 use roaring::RoaringBitmap;
-use serde_json::Value;
+use serde_json::{from_slice, Value};
 
 use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 use crate::error::InternalError;
@@ -15,6 +15,15 @@ use crate::facet::value_encoding::f64_into_bytes;
 use crate::update::index_documents::{create_writer, writer_into_reader};
 use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
 
+/// The extracted facet values stored in grenad files by type.
+pub struct ExtractedFacetValues {
+    pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
+    pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
+    pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
+    pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
+    pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
+}
+
 /// Extracts the facet values of each faceted field of each document.
 ///
 /// Returns the generated grenad reader containing the docid the fid and the orginal value as key
@@ -24,7 +33,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     faceted_fields: &HashSet<FieldId>,
-) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
+) -> Result<ExtractedFacetValues> {
     let max_memory = indexer.max_memory_by_thread();
 
     let mut fid_docid_facet_numbers_sorter = create_sorter(
@@ -46,6 +55,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     );
 
     let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
+    let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
+    let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
 
     let mut key_buffer = Vec::new();
     let mut cursor = obkv_documents.into_cursor()?;
@@ -69,33 +80,44 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                 // For the other extraction tasks, prefix the key with the field_id and the document_id
                 key_buffer.extend_from_slice(docid_bytes);
 
-                let value =
-                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
-
-                let (numbers, strings) = extract_facet_values(&value);
-
-                // insert facet numbers in sorter
-                for number in numbers {
-                    key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
-                    if let Some(value_bytes) = f64_into_bytes(number) {
-                        key_buffer.extend_from_slice(&value_bytes);
-                        key_buffer.extend_from_slice(&number.to_be_bytes());
-
-                        fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
-                    }
-                }
-
-                // insert normalized and original facet string in sorter
-                for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
-                    let normalised_truncated_value: String = normalized
-                        .char_indices()
-                        .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
-                        .map(|(_, c)| c)
-                        .collect();
-
-                    key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
-                    key_buffer.extend_from_slice(normalised_truncated_value.as_bytes());
-                    fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;
-                }
+                let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+
+                match extract_facet_values(&value) {
+                    FilterableValues::Null => {
+                        facet_is_null_docids.entry(field_id).or_default().insert(document);
+                    }
+                    FilterableValues::Empty => {
+                        facet_is_empty_docids.entry(field_id).or_default().insert(document);
+                    }
+                    FilterableValues::Values { numbers, strings } => {
+                        // insert facet numbers in sorter
+                        for number in numbers {
+                            key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
+                            if let Some(value_bytes) = f64_into_bytes(number) {
+                                key_buffer.extend_from_slice(&value_bytes);
+                                key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                                fid_docid_facet_numbers_sorter
+                                    .insert(&key_buffer, ().as_bytes())?;
+                            }
+                        }
+
+                        // insert normalized and original facet string in sorter
+                        for (normalized, original) in
+                            strings.into_iter().filter(|(n, _)| !n.is_empty())
+                        {
+                            let normalized_truncated_value: String = normalized
+                                .char_indices()
+                                .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
+                                .map(|(_, c)| c)
+                                .collect();
+
+                            key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
+                            key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
+                            fid_docid_facet_strings_sorter
+                                .insert(&key_buffer, original.as_bytes())?;
+                        }
+                    }
+                }
             }
         }
@@ -112,14 +134,48 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     }
     let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
 
-    Ok((
-        sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
-        sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
-        facet_exists_docids_reader,
-    ))
+    let mut facet_is_null_docids_writer = create_writer(
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        tempfile::tempfile()?,
+    );
+    for (fid, bitmap) in facet_is_null_docids.into_iter() {
+        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
+        facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    }
+    let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
+
+    let mut facet_is_empty_docids_writer = create_writer(
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        tempfile::tempfile()?,
+    );
+    for (fid, bitmap) in facet_is_empty_docids.into_iter() {
+        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
+        facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    }
+    let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
+
+    Ok(ExtractedFacetValues {
+        docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
+        docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
+        fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
+        fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
+        fid_facet_exists_docids_chunk: facet_exists_docids_reader,
+    })
 }
 
-fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
+/// Represent what a document field contains.
+enum FilterableValues {
+    /// Corresponds to the JSON `null` value.
+    Null,
+    /// Corresponds to either, an empty string `""`, an empty array `[]`, or an empty object `{}`.
+    Empty,
+    /// Represents all the numbers and strings values found in this document field.
+    Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
+}
+
+fn extract_facet_values(value: &Value) -> FilterableValues {
     fn inner_extract_facet_values(
         value: &Value,
         can_recurse: bool,
@@ -149,9 +205,16 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
         }
     }
 
-    let mut facet_number_values = Vec::new();
-    let mut facet_string_values = Vec::new();
-    inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
-
-    (facet_number_values, facet_string_values)
+    match value {
+        Value::Null => FilterableValues::Null,
+        Value::String(s) if s.is_empty() => FilterableValues::Empty,
+        Value::Array(a) if a.is_empty() => FilterableValues::Empty,
+        Value::Object(o) if o.is_empty() => FilterableValues::Empty,
+        otherwise => {
+            let mut numbers = Vec::new();
+            let mut strings = Vec::new();
+            inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
+            FilterableValues::Values { numbers, strings }
+        }
+    }
 }
```
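The classification done by `extract_facet_values` can be illustrated in isolation. A self-contained sketch (the `classify`/`Classified` names are illustrative, not the crate's API) mirroring the final `match`:

```rust
use serde_json::{json, Value};

#[derive(Debug, PartialEq)]
enum Classified {
    Null,
    Empty,
    Other,
}

/// `null` and empty composites are reported as such; everything else keeps its values.
fn classify(value: &Value) -> Classified {
    match value {
        Value::Null => Classified::Null,
        Value::String(s) if s.is_empty() => Classified::Empty,
        Value::Array(a) if a.is_empty() => Classified::Empty,
        Value::Object(o) if o.is_empty() => Classified::Empty,
        _ => Classified::Other,
    }
}

fn main() {
    assert_eq!(classify(&json!(null)), Classified::Null);
    assert_eq!(classify(&json!([])), Classified::Empty);
    assert_eq!(classify(&json!([null])), Classified::Other); // `[null]` is *not* empty
    assert_eq!(classify(&json!({ "green": "" })), Classified::Other);
}
```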
```diff
@@ -18,7 +18,7 @@ use rayon::prelude::*;
 use self::extract_docid_word_positions::extract_docid_word_positions;
 use self::extract_facet_number_docids::extract_facet_number_docids;
 use self::extract_facet_string_docids::extract_facet_string_docids;
-use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
+use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
 use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_word_docids::extract_word_docids;
@@ -55,28 +55,35 @@ pub(crate) fn data_from_obkv_documents(
         .collect::<Result<()>>()?;
 
     #[allow(clippy::type_complexity)]
-    let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks
-        .par_bridge()
-        .map(|flattened_obkv_chunks| {
-            send_and_extract_flattened_documents_data(
-                flattened_obkv_chunks,
-                indexer,
-                lmdb_writer_sx.clone(),
-                &searchable_fields,
-                &faceted_fields,
-                primary_key_id,
-                geo_fields_ids,
-                &stop_words,
-                max_positions_per_attributes,
-            )
-        })
-        .collect();
+    let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
+        flattened_obkv_chunks
+            .par_bridge()
+            .map(|flattened_obkv_chunks| {
+                send_and_extract_flattened_documents_data(
+                    flattened_obkv_chunks,
+                    indexer,
+                    lmdb_writer_sx.clone(),
+                    &searchable_fields,
+                    &faceted_fields,
+                    primary_key_id,
+                    geo_fields_ids,
+                    &stop_words,
+                    max_positions_per_attributes,
+                )
+            })
+            .collect();
 
     let (
         docid_word_positions_chunks,
         (
             docid_fid_facet_numbers_chunks,
-            (docid_fid_facet_strings_chunks, facet_exists_docids_chunks),
+            (
+                docid_fid_facet_strings_chunks,
+                (
+                    facet_is_null_docids_chunks,
+                    (facet_is_empty_docids_chunks, facet_exists_docids_chunks),
+                ),
+            ),
         ),
     ) = result?;
 
@@ -96,6 +103,38 @@ pub(crate) fn data_from_obkv_documents(
         });
     }
 
+    // merge facet_is_null_docids and send them as a typed chunk
+    {
+        let lmdb_writer_sx = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            debug!("merge {} database", "facet-id-is-null-docids");
+            match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
+                Ok(reader) => {
+                    let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
+                }
+                Err(e) => {
+                    let _ = lmdb_writer_sx.send(Err(e));
+                }
+            }
+        });
+    }
+
+    // merge facet_is_empty_docids and send them as a typed chunk
+    {
+        let lmdb_writer_sx = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            debug!("merge {} database", "facet-id-is-empty-docids");
+            match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
+                Ok(reader) => {
+                    let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
+                }
+                Err(e) => {
+                    let _ = lmdb_writer_sx.send(Err(e));
+                }
+            }
+        });
+    }
+
     spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_word_positions_chunks.clone(),
         indexer,
@@ -235,7 +274,10 @@ fn send_and_extract_flattened_documents_data(
     grenad::Reader<CursorClonableMmap>,
     (
         grenad::Reader<CursorClonableMmap>,
-        (grenad::Reader<CursorClonableMmap>, grenad::Reader<File>),
+        (
+            grenad::Reader<CursorClonableMmap>,
+            (grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
+        ),
     ),
 )> {
     let flattened_documents_chunk =
@@ -281,11 +323,13 @@ fn send_and_extract_flattened_documents_data(
             Ok(docid_word_positions_chunk)
         },
         || {
-            let (
+            let ExtractedFacetValues {
                 docid_fid_facet_numbers_chunk,
                 docid_fid_facet_strings_chunk,
+                fid_facet_is_null_docids_chunk,
+                fid_facet_is_empty_docids_chunk,
                 fid_facet_exists_docids_chunk,
-            ) = extract_fid_docid_facet_values(
+            } = extract_fid_docid_facet_values(
                 flattened_documents_chunk.clone(),
                 indexer,
                 faceted_fields,
@@ -309,7 +353,13 @@ fn send_and_extract_flattened_documents_data(
 
             Ok((
                 docid_fid_facet_numbers_chunk,
-                (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk),
+                (
+                    docid_fid_facet_strings_chunk,
+                    (
+                        fid_facet_is_null_docids_chunk,
+                        (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
+                    ),
+                ),
             ))
         },
     );
```
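The two new merge blocks follow the same shape as the existing `facet_exists_docids` one: a `rayon` task merges the extracted chunks and forwards the result (or the error) on the LMDB writer channel. A minimal stand-in for that pattern, using `std::sync::mpsc` and plain `RoaringBitmap`s instead of the crate's channel and grenad readers:

```rust
use std::sync::mpsc;

use roaring::RoaringBitmap;

/// Merge the per-chunk bitmaps in a background task and send the merged
/// result to the writer side, mirroring the blocks above.
fn spawn_merge(
    chunks: Vec<RoaringBitmap>,
    sender: mpsc::Sender<Result<RoaringBitmap, String>>,
) {
    rayon::spawn(move || {
        let merged = chunks.into_iter().fold(RoaringBitmap::new(), |acc, bitmap| acc | bitmap);
        // The receiving end may already be gone; ignore the send error like the original code does.
        let _ = sender.send(Ok(merged));
    });
}
```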
```diff
@@ -1757,6 +1757,187 @@ mod tests {
         check_ok(&index);
     }
 
+    #[test]
+    fn index_documents_check_is_null_database() {
+        let content = || {
+            documents!([
+                {
+                    "id": 0,
+                    "colour": null,
+                },
+                {
+                    "id": 1,
+                    "colour": [null], // must not be returned
+                },
+                {
+                    "id": 6,
+                    "colour": {
+                        "green": null
+                    }
+                },
+                {
+                    "id": 7,
+                    "colour": {
+                        "green": {
+                            "blue": null
+                        }
+                    }
+                },
+                {
+                    "id": 8,
+                    "colour": 0,
+                },
+                {
+                    "id": 9,
+                    "colour": []
+                },
+                {
+                    "id": 10,
+                    "colour": {}
+                },
+                {
+                    "id": 12,
+                    "colour": [1]
+                },
+                {
+                    "id": 13
+                },
+                {
+                    "id": 14,
+                    "colour": {
+                        "green": 1
+                    }
+                },
+                {
+                    "id": 15,
+                    "colour": {
+                        "green": {
+                            "blue": []
+                        }
+                    }
+                }
+            ])
+        };
+
+        let check_ok = |index: &Index| {
+            let rtxn = index.read_txn().unwrap();
+            let facets = index.faceted_fields(&rtxn).unwrap();
+            assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));
+
+            let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
+            let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
+            let colour_blue_id =
+                index.fields_ids_map(&rtxn).unwrap().id("colour.green.blue").unwrap();
+
+            let bitmap_null_colour =
+                index.facet_id_is_null_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap();
+            assert_eq!(bitmap_null_colour.into_iter().collect::<Vec<_>>(), vec![0]);
+
+            let bitmap_colour_green = index
+                .facet_id_is_null_docids
+                .get(&rtxn, &BEU16::new(colour_green_id))
+                .unwrap()
+                .unwrap();
+            assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![2]);
+
+            let bitmap_colour_blue = index
+                .facet_id_is_null_docids
+                .get(&rtxn, &BEU16::new(colour_blue_id))
+                .unwrap()
+                .unwrap();
+            assert_eq!(bitmap_colour_blue.into_iter().collect::<Vec<_>>(), vec![3]);
+        };
+
+        let faceted_fields = hashset!(S("colour"));
+
+        let index = TempIndex::new();
+        index.add_documents(content()).unwrap();
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(faceted_fields.clone());
+            })
+            .unwrap();
+        check_ok(&index);
+
+        let index = TempIndex::new();
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(faceted_fields.clone());
+            })
+            .unwrap();
+        index.add_documents(content()).unwrap();
+        check_ok(&index);
+    }
+
+    #[test]
+    fn index_documents_check_is_empty_database() {
+        let content = || {
+            documents!([
+                {"id": 0, "tags": null },
+                {"id": 1, "tags": [null] },
+                {"id": 2, "tags": [] },
+                {"id": 3, "tags": ["hello","world"] },
+                {"id": 4, "tags": [""] },
+                {"id": 5 },
+                {"id": 6, "tags": {} },
+                {"id": 7, "tags": {"green": "cool"} },
+                {"id": 8, "tags": {"green": ""} },
+                {"id": 9, "tags": "" },
+                {"id": 10, "tags": { "green": null } },
+                {"id": 11, "tags": { "green": { "blue": null } } },
+                {"id": 12, "tags": { "green": { "blue": [] } } }
+            ])
+        };
+
+        let check_ok = |index: &Index| {
+            let rtxn = index.read_txn().unwrap();
+            let facets = index.faceted_fields(&rtxn).unwrap();
+            assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue")));
+
+            let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap();
+            let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap();
+            let tags_blue_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green.blue").unwrap();
+
+            let bitmap_empty_tags =
+                index.facet_id_is_empty_docids.get(&rtxn, &BEU16::new(tags_id)).unwrap().unwrap();
+            assert_eq!(bitmap_empty_tags.into_iter().collect::<Vec<_>>(), vec![2, 6, 9]);
+
+            let bitmap_tags_green = index
+                .facet_id_is_empty_docids
+                .get(&rtxn, &BEU16::new(tags_green_id))
+                .unwrap()
+                .unwrap();
+            assert_eq!(bitmap_tags_green.into_iter().collect::<Vec<_>>(), vec![8]);
+
+            let bitmap_tags_blue = index
+                .facet_id_is_empty_docids
+                .get(&rtxn, &BEU16::new(tags_blue_id))
+                .unwrap()
+                .unwrap();
+            assert_eq!(bitmap_tags_blue.into_iter().collect::<Vec<_>>(), vec![12]);
+        };
+
+        let faceted_fields = hashset!(S("tags"));
+
+        let index = TempIndex::new();
+        index.add_documents(content()).unwrap();
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(faceted_fields.clone());
+            })
+            .unwrap();
+        check_ok(&index);
+
+        let index = TempIndex::new();
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(faceted_fields.clone());
+            })
+            .unwrap();
+        index.add_documents(content()).unwrap();
+        check_ok(&index);
+    }
+
     #[test]
     fn primary_key_must_not_contain_floats() {
         let index = TempIndex::new_with_map_size(4096 * 100);
```
```diff
@@ -39,6 +39,8 @@ pub(crate) enum TypedChunk {
     FieldIdFacetStringDocids(grenad::Reader<File>),
     FieldIdFacetNumberDocids(grenad::Reader<File>),
     FieldIdFacetExistsDocids(grenad::Reader<File>),
+    FieldIdFacetIsNullDocids(grenad::Reader<File>),
+    FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
     GeoPoints(grenad::Reader<File>),
     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
 }
@@ -161,6 +163,28 @@ pub(crate) fn write_typed_chunk_into_index(
             )?;
             is_merged_database = true;
         }
+        TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => {
+            append_entries_into_database(
+                facet_id_is_null_docids,
+                &index.facet_id_is_null_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+            is_merged_database = true;
+        }
+        TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
+            append_entries_into_database(
+                facet_id_is_empty_docids,
+                &index.facet_id_is_empty_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+            is_merged_database = true;
+        }
         TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
             append_entries_into_database(
                 word_pair_proximity_docids_iter,
```
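For these two new chunks the entries pass through unchanged (`|value, _buffer| Ok(value)`) and, when a field id already has an entry, the old and new bitmaps are reconciled by `merge_cbo_roaring_bitmaps`, which amounts to a union. An in-memory sketch of that write step (a `BTreeMap` standing in for the heed database):

```rust
use std::collections::BTreeMap;

use roaring::RoaringBitmap;

/// Copy a chunk's entries into the target; colliding field ids are merged by union.
fn write_chunk(
    target: &mut BTreeMap<u16, RoaringBitmap>,
    chunk: impl IntoIterator<Item = (u16, RoaringBitmap)>,
) {
    for (field_id, docids) in chunk {
        *target.entry(field_id).or_default() |= docids;
    }
}
```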