mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 20:07:09 +02:00
Merge remote-tracking branch 'origin/main' into search-refactor
| Conflicts | resolution |
|-----------|------------|
| Cargo.lock | added mimalloc |
| Cargo.toml | took origin/main version |
| milli/src/search/criteria/exactness.rs | deleted after checking it was only clippy changes |
| milli/src/search/query_tree.rs | deleted after checking it was only clippy changes |
This commit is contained in:
commit
90bc230820
63 changed files with 1984 additions and 1523 deletions
|
@ -82,6 +82,8 @@ pub mod db_name {
|
|||
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
||||
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
|
||||
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
|
||||
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
|
||||
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
|
||||
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||
|
@ -136,6 +138,10 @@ pub struct Index {
|
|||
|
||||
/// Maps the facet field id and the docids for which this field exists
|
||||
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the docids for which this field is set as null
|
||||
pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the docids for which this field is considered empty
|
||||
pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the facet field id and ranges of numbers with the docids that correspond to them.
|
||||
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
||||
|
@ -184,6 +190,8 @@ impl Index {
|
|||
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
|
||||
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
|
||||
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
|
||||
let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?;
|
||||
let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?;
|
||||
|
||||
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||
let field_id_docid_facet_strings =
|
||||
|
@ -212,6 +220,8 @@ impl Index {
|
|||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_exists_docids,
|
||||
facet_id_is_null_docids,
|
||||
facet_id_is_empty_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
documents,
|
||||
|
@ -844,6 +854,30 @@ impl Index {
|
|||
}
|
||||
}
|
||||
|
||||
/// Retrieve all the documents which contain this field id set as null
|
||||
pub fn null_faceted_documents_ids(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<RoaringBitmap> {
|
||||
match self.facet_id_is_null_docids.get(rtxn, &BEU16::new(field_id))? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve all the documents which contain this field id and that is considered empty
|
||||
pub fn empty_faceted_documents_ids(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<RoaringBitmap> {
|
||||
match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve all the documents which contain this field id
|
||||
pub fn exists_faceted_documents_ids(
|
||||
&self,
|
||||
|
|
|
@ -211,6 +211,14 @@ impl<'a> Filter<'a> {
|
|||
Condition::Between { from, to } => {
|
||||
(Included(from.parse_finite_float()?), Included(to.parse_finite_float()?))
|
||||
}
|
||||
Condition::Null => {
|
||||
let is_null = index.null_faceted_documents_ids(rtxn, field_id)?;
|
||||
return Ok(is_null);
|
||||
}
|
||||
Condition::Empty => {
|
||||
let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?;
|
||||
return Ok(is_empty);
|
||||
}
|
||||
Condition::Exists => {
|
||||
let exist = index.exists_faceted_documents_ids(rtxn, field_id)?;
|
||||
return Ok(exist);
|
||||
|
|
|
@ -276,6 +276,16 @@ pub fn snap_facet_id_exists_docids(index: &Index) -> String {
|
|||
&format!("{facet_id:<3} {}", display_bitmap(&docids))
|
||||
})
|
||||
}
|
||||
pub fn snap_facet_id_is_null_docids(index: &Index) -> String {
|
||||
make_db_snap_from_iter!(index, facet_id_is_null_docids, |(facet_id, docids)| {
|
||||
&format!("{facet_id:<3} {}", display_bitmap(&docids))
|
||||
})
|
||||
}
|
||||
pub fn snap_facet_id_is_empty_docids(index: &Index) -> String {
|
||||
make_db_snap_from_iter!(index, facet_id_is_empty_docids, |(facet_id, docids)| {
|
||||
&format!("{facet_id:<3} {}", display_bitmap(&docids))
|
||||
})
|
||||
}
|
||||
pub fn snap_facet_id_string_docids(index: &Index) -> String {
|
||||
make_db_snap_from_iter!(index, facet_id_string_docids, |(
|
||||
FacetGroupKey { field_id, level, left_bound },
|
||||
|
@ -503,6 +513,12 @@ macro_rules! full_snap_of_db {
|
|||
($index:ident, facet_id_exists_docids) => {{
|
||||
$crate::snapshot_tests::snap_facet_id_exists_docids(&$index)
|
||||
}};
|
||||
($index:ident, facet_id_is_null_docids) => {{
|
||||
$crate::snapshot_tests::snap_facet_id_is_null_docids(&$index)
|
||||
}};
|
||||
($index:ident, facet_id_is_empty_docids) => {{
|
||||
$crate::snapshot_tests::snap_facet_id_is_empty_docids(&$index)
|
||||
}};
|
||||
($index:ident, documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_documents_ids(&$index)
|
||||
}};
|
||||
|
|
|
@ -36,6 +36,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_exists_docids,
|
||||
facet_id_is_null_docids,
|
||||
facet_id_is_empty_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
documents,
|
||||
|
@ -90,6 +92,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||
script_language_docids.clear(self.wtxn)?;
|
||||
facet_id_f64_docids.clear(self.wtxn)?;
|
||||
facet_id_exists_docids.clear(self.wtxn)?;
|
||||
facet_id_is_null_docids.clear(self.wtxn)?;
|
||||
facet_id_is_empty_docids.clear(self.wtxn)?;
|
||||
facet_id_string_docids.clear(self.wtxn)?;
|
||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||
|
|
|
@ -247,6 +247,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||
field_id_docid_facet_strings: _,
|
||||
script_language_docids,
|
||||
facet_id_exists_docids,
|
||||
facet_id_is_null_docids,
|
||||
facet_id_is_empty_docids,
|
||||
documents,
|
||||
} = self.index;
|
||||
|
||||
|
@ -445,12 +447,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||
&self.to_delete_docids,
|
||||
)?;
|
||||
// We delete the documents ids that are under the facet field id values.
|
||||
remove_docids_from_facet_id_exists_docids(
|
||||
remove_docids_from_facet_id_docids(
|
||||
self.wtxn,
|
||||
facet_id_exists_docids,
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
|
||||
// We delete the documents ids that are under the facet field id values.
|
||||
remove_docids_from_facet_id_docids(
|
||||
self.wtxn,
|
||||
facet_id_is_null_docids,
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
|
||||
// We delete the documents ids that are under the facet field id values.
|
||||
remove_docids_from_facet_id_docids(
|
||||
self.wtxn,
|
||||
facet_id_is_empty_docids,
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
|
||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
|
||||
|
||||
Ok(DetailedDocumentDeletionResult {
|
||||
|
@ -577,7 +593,7 @@ fn remove_docids_from_field_id_docid_facet_value(
|
|||
Ok(all_affected_facet_values)
|
||||
}
|
||||
|
||||
fn remove_docids_from_facet_id_exists_docids<'a, C>(
|
||||
fn remove_docids_from_facet_id_docids<'a, C>(
|
||||
wtxn: &'a mut heed::RwTxn,
|
||||
db: &heed::Database<C, CboRoaringBitmapCodec>,
|
||||
to_remove: &RoaringBitmap,
|
||||
|
|
|
@ -181,7 +181,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
|
|||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null => false,
|
||||
Value::Null | Value::Object(_) => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
|
@ -196,23 +196,6 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
|
|||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
Value::Object(object) => {
|
||||
let mut buffer = String::new();
|
||||
let mut count = 0;
|
||||
for (key, value) in object {
|
||||
buffer.clear();
|
||||
let _ = write!(&mut buffer, "{}: ", key);
|
||||
if inner(value, &mut buffer) {
|
||||
buffer.push_str(". ");
|
||||
// We write the "key: value. " pair only when
|
||||
// we are sure that the value can be written.
|
||||
output.push_str(&buffer);
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ use std::mem::size_of;
|
|||
use heed::zerocopy::AsBytes;
|
||||
use heed::BytesEncode;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
use serde_json::{from_slice, Value};
|
||||
|
||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::InternalError;
|
||||
|
@ -15,6 +15,15 @@ use crate::facet::value_encoding::f64_into_bytes;
|
|||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The extracted facet values stored in grenad files by type.
|
||||
pub struct ExtractedFacetValues {
|
||||
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
|
||||
pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
|
||||
}
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid, the fid and the original value as key
|
||||
|
@ -24,7 +33,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
|
@ -46,6 +55,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
);
|
||||
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
|
@ -69,33 +80,44 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
let (numbers, strings) = extract_facet_values(&value);
|
||||
|
||||
// insert facet numbers in sorter
|
||||
for number in numbers {
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
|
||||
match extract_facet_values(&value) {
|
||||
FilterableValues::Null => {
|
||||
facet_is_null_docids.entry(field_id).or_default().insert(document);
|
||||
}
|
||||
}
|
||||
FilterableValues::Empty => {
|
||||
facet_is_empty_docids.entry(field_id).or_default().insert(document);
|
||||
}
|
||||
FilterableValues::Values { numbers, strings } => {
|
||||
// insert facet numbers in sorter
|
||||
for number in numbers {
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
|
||||
let normalised_truncated_value: String = normalized
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
fid_docid_facet_numbers_sorter
|
||||
.insert(&key_buffer, ().as_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
key_buffer.extend_from_slice(normalised_truncated_value.as_bytes());
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;
|
||||
// insert normalized and original facet string in sorter
|
||||
for (normalized, original) in
|
||||
strings.into_iter().filter(|(n, _)| !n.is_empty())
|
||||
{
|
||||
let normalized_truncated_value: String = normalized
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
|
||||
fid_docid_facet_strings_sorter
|
||||
.insert(&key_buffer, original.as_bytes())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -112,14 +134,48 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||
}
|
||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
facet_exists_docids_reader,
|
||||
))
|
||||
let mut facet_is_null_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, bitmap) in facet_is_null_docids.into_iter() {
|
||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||
}
|
||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||
|
||||
let mut facet_is_empty_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
|
||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||
}
|
||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||
|
||||
Ok(ExtractedFacetValues {
|
||||
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||
})
|
||||
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
|
||||
/// Describes the content of a single document field, as seen by the facet extraction.
enum FilterableValues {
    /// The field holds the JSON `null` value.
    Null,
    /// The field holds an empty string `""`, an empty array `[]`, or an empty object `{}`.
    Empty,
    /// The field holds actual values: every number found in it, and every string found
    /// in it as a `(normalized, original)` pair.
    Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> FilterableValues {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
|
@ -149,9 +205,16 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
|
|||
}
|
||||
}
|
||||
|
||||
let mut facet_number_values = Vec::new();
|
||||
let mut facet_string_values = Vec::new();
|
||||
inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
|
||||
|
||||
(facet_number_values, facet_string_values)
|
||||
match value {
|
||||
Value::Null => FilterableValues::Null,
|
||||
Value::String(s) if s.is_empty() => FilterableValues::Empty,
|
||||
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
|
||||
Value::Object(o) if o.is_empty() => FilterableValues::Empty,
|
||||
otherwise => {
|
||||
let mut numbers = Vec::new();
|
||||
let mut strings = Vec::new();
|
||||
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
|
||||
FilterableValues::Values { numbers, strings }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@ use rayon::prelude::*;
|
|||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||
use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||
use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||
use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
|
||||
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
|
@ -57,28 +57,35 @@ pub(crate) fn data_from_obkv_documents(
|
|||
.collect::<Result<()>>()?;
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
&stop_words,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
&stop_words,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (
|
||||
docid_word_positions_chunks,
|
||||
(
|
||||
docid_fid_facet_numbers_chunks,
|
||||
(docid_fid_facet_strings_chunks, facet_exists_docids_chunks),
|
||||
(
|
||||
docid_fid_facet_strings_chunks,
|
||||
(
|
||||
facet_is_null_docids_chunks,
|
||||
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
||||
),
|
||||
),
|
||||
),
|
||||
) = result?;
|
||||
|
||||
|
@ -98,6 +105,38 @@ pub(crate) fn data_from_obkv_documents(
|
|||
});
|
||||
}
|
||||
|
||||
// merge facet_is_null_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-is-null-docids");
|
||||
match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// merge facet_is_empty_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-is-empty-docids");
|
||||
match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
|
@ -246,7 +285,10 @@ fn send_and_extract_flattened_documents_data(
|
|||
grenad::Reader<CursorClonableMmap>,
|
||||
(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<File>),
|
||||
(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
|
||||
),
|
||||
),
|
||||
)> {
|
||||
let flattened_documents_chunk =
|
||||
|
@ -292,11 +334,13 @@ fn send_and_extract_flattened_documents_data(
|
|||
Ok(docid_word_positions_chunk)
|
||||
},
|
||||
|| {
|
||||
let (
|
||||
let ExtractedFacetValues {
|
||||
docid_fid_facet_numbers_chunk,
|
||||
docid_fid_facet_strings_chunk,
|
||||
fid_facet_is_null_docids_chunk,
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
fid_facet_exists_docids_chunk,
|
||||
) = extract_fid_docid_facet_values(
|
||||
} = extract_fid_docid_facet_values(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
faceted_fields,
|
||||
|
@ -320,7 +364,13 @@ fn send_and_extract_flattened_documents_data(
|
|||
|
||||
Ok((
|
||||
docid_fid_facet_numbers_chunk,
|
||||
(docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk),
|
||||
(
|
||||
docid_fid_facet_strings_chunk,
|
||||
(
|
||||
fid_facet_is_null_docids_chunk,
|
||||
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
||||
),
|
||||
),
|
||||
))
|
||||
},
|
||||
);
|
||||
|
|
|
@ -1779,6 +1779,187 @@ mod tests {
|
|||
check_ok(&index);
|
||||
}
|
||||
|
||||
/// Checks that the `facet_id_is_null_docids` database only registers the documents whose
/// filterable field (or nested field) holds exactly the JSON `null` value, whether the
/// documents are added before or after the filterable fields are configured.
#[test]
fn index_documents_check_is_null_database() {
    let content = || {
        documents!([
            { "id": 0, "colour": null },
            { "id": 1, "colour": [null] }, // must not be returned
            { "id": 6, "colour": { "green": null } },
            { "id": 7, "colour": { "green": { "blue": null } } },
            { "id": 8, "colour": 0 },
            { "id": 9, "colour": [] },
            { "id": 10, "colour": {} },
            { "id": 12, "colour": [1] },
            { "id": 13 },
            { "id": 14, "colour": { "green": 1 } },
            { "id": 15, "colour": { "green": { "blue": [] } } }
        ])
    };

    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();
        assert_eq!(
            index.faceted_fields(&rtxn).unwrap(),
            hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))
        );

        // Resolve every field id up front, then read the matching bitmaps.
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let [colour_id, colour_green_id, colour_blue_id] =
            ["colour", "colour.green", "colour.green.blue"]
                .map(|name| fields_ids_map.id(name).unwrap());

        let null_docids = |field_id| {
            index
                .facet_id_is_null_docids
                .get(&rtxn, &BEU16::new(field_id))
                .unwrap()
                .unwrap()
                .into_iter()
                .collect::<Vec<_>>()
        };

        assert_eq!(null_docids(colour_id), vec![0]);
        assert_eq!(null_docids(colour_green_id), vec![2]);
        assert_eq!(null_docids(colour_blue_id), vec![3]);
    };

    let faceted_fields = hashset!(S("colour"));

    // First pass: documents are indexed before the settings are applied.
    // Second pass: the settings are applied before the documents are indexed.
    for documents_first in [true, false] {
        let index = TempIndex::new();
        if documents_first {
            index.add_documents(content()).unwrap();
        }
        index
            .update_settings(|settings| {
                settings.set_filterable_fields(faceted_fields.clone());
            })
            .unwrap();
        if !documents_first {
            index.add_documents(content()).unwrap();
        }
        check_ok(&index);
    }
}
|
||||
|
||||
/// Checks that the `facet_id_is_empty_docids` database only registers the documents whose
/// filterable field (or nested field) is an empty string, array or object, whether the
/// documents are added before or after the filterable fields are configured.
#[test]
fn index_documents_check_is_empty_database() {
    let content = || {
        documents!([
            {"id": 0, "tags": null },
            {"id": 1, "tags": [null] },
            {"id": 2, "tags": [] },
            {"id": 3, "tags": ["hello","world"] },
            {"id": 4, "tags": [""] },
            {"id": 5 },
            {"id": 6, "tags": {} },
            {"id": 7, "tags": {"green": "cool"} },
            {"id": 8, "tags": {"green": ""} },
            {"id": 9, "tags": "" },
            {"id": 10, "tags": { "green": null } },
            {"id": 11, "tags": { "green": { "blue": null } } },
            {"id": 12, "tags": { "green": { "blue": [] } } }
        ])
    };

    let check_ok = |index: &Index| {
        let rtxn = index.read_txn().unwrap();
        assert_eq!(
            index.faceted_fields(&rtxn).unwrap(),
            hashset!(S("tags"), S("tags.green"), S("tags.green.blue"))
        );

        // Resolve every field id up front, then read the matching bitmaps.
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let [tags_id, tags_green_id, tags_blue_id] =
            ["tags", "tags.green", "tags.green.blue"]
                .map(|name| fields_ids_map.id(name).unwrap());

        let empty_docids = |field_id| {
            index
                .facet_id_is_empty_docids
                .get(&rtxn, &BEU16::new(field_id))
                .unwrap()
                .unwrap()
                .into_iter()
                .collect::<Vec<_>>()
        };

        assert_eq!(empty_docids(tags_id), vec![2, 6, 9]);
        assert_eq!(empty_docids(tags_green_id), vec![8]);
        assert_eq!(empty_docids(tags_blue_id), vec![12]);
    };

    let faceted_fields = hashset!(S("tags"));

    // First pass: documents are indexed before the settings are applied.
    // Second pass: the settings are applied before the documents are indexed.
    for documents_first in [true, false] {
        let index = TempIndex::new();
        if documents_first {
            index.add_documents(content()).unwrap();
        }
        index
            .update_settings(|settings| {
                settings.set_filterable_fields(faceted_fields.clone());
            })
            .unwrap();
        if !documents_first {
            index.add_documents(content()).unwrap();
        }
        check_ok(&index);
    }
}
|
||||
|
||||
#[test]
|
||||
fn primary_key_must_not_contain_floats() {
|
||||
let index = TempIndex::new_with_map_size(4096 * 100);
|
||||
|
|
|
@ -40,6 +40,8 @@ pub(crate) enum TypedChunk {
|
|||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||
FieldIdFacetExistsDocids(grenad::Reader<File>),
|
||||
FieldIdFacetIsNullDocids(grenad::Reader<File>),
|
||||
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
|
||||
GeoPoints(grenad::Reader<File>),
|
||||
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
||||
}
|
||||
|
@ -173,6 +175,28 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => {
|
||||
append_entries_into_database(
|
||||
facet_id_is_null_docids,
|
||||
&index.facet_id_is_null_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
|
||||
append_entries_into_database(
|
||||
facet_id_is_empty_docids,
|
||||
&index.facet_id_is_empty_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
word_pair_proximity_docids_iter,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue