mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-08 20:44:30 +01:00
Use new incremental facet indexing and enable sanity checks in debug
This commit is contained in:
parent
5d219587b8
commit
7a9290aaae
@ -36,6 +36,8 @@ use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
|
|||||||
use crate::progress::Progress;
|
use crate::progress::Progress;
|
||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::update::del_add::DelAdd;
|
use crate::update::del_add::DelAdd;
|
||||||
|
use crate::update::facet::new_incremental::FacetsUpdateIncremental;
|
||||||
|
use crate::update::facet::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||||
use crate::update::new::extract::EmbeddingExtractor;
|
use crate::update::new::extract::EmbeddingExtractor;
|
||||||
use crate::update::new::merger::merge_and_send_rtree;
|
use crate::update::new::merger::merge_and_send_rtree;
|
||||||
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
||||||
@ -203,6 +205,7 @@ where
|
|||||||
caches,
|
caches,
|
||||||
FacetDatabases::new(index),
|
FacetDatabases::new(index),
|
||||||
index,
|
index,
|
||||||
|
&rtxn,
|
||||||
extractor_sender.facet_docids(),
|
extractor_sender.facet_docids(),
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -735,27 +738,66 @@ fn compute_facet_search_database(
|
|||||||
fn compute_facet_level_database(
|
fn compute_facet_level_database(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
facet_field_ids_delta: FacetFieldIdsDelta,
|
mut facet_field_ids_delta: FacetFieldIdsDelta,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if let Some(modified_facet_string_ids) = facet_field_ids_delta.modified_facet_string_ids() {
|
for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() {
|
||||||
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string");
|
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
FacetsUpdateBulk::new_not_updating_level_0(
|
match delta {
|
||||||
index,
|
super::merger::FacetFieldIdDelta::Bulk => {
|
||||||
modified_facet_string_ids,
|
tracing::debug!(%fid, "bulk string facet processing");
|
||||||
FacetType::String,
|
FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::String)
|
||||||
)
|
.execute(wtxn)?
|
||||||
.execute(wtxn)?;
|
}
|
||||||
|
super::merger::FacetFieldIdDelta::Incremental(delta_data) => {
|
||||||
|
tracing::debug!(%fid, len=%delta_data.len(), "incremental string facet processing");
|
||||||
|
FacetsUpdateIncremental::new(
|
||||||
|
index,
|
||||||
|
FacetType::String,
|
||||||
|
fid,
|
||||||
|
delta_data,
|
||||||
|
FACET_GROUP_SIZE,
|
||||||
|
FACET_MIN_LEVEL_SIZE,
|
||||||
|
FACET_MAX_GROUP_SIZE,
|
||||||
|
)
|
||||||
|
.execute(wtxn)?
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if let Some(modified_facet_number_ids) = facet_field_ids_delta.modified_facet_number_ids() {
|
|
||||||
|
for (fid, delta) in facet_field_ids_delta.consume_facet_number_delta() {
|
||||||
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
|
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
FacetsUpdateBulk::new_not_updating_level_0(
|
match delta {
|
||||||
|
super::merger::FacetFieldIdDelta::Bulk => {
|
||||||
|
tracing::debug!(%fid, "bulk number facet processing");
|
||||||
|
FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::Number)
|
||||||
|
.execute(wtxn)?
|
||||||
|
}
|
||||||
|
super::merger::FacetFieldIdDelta::Incremental(delta_data) => {
|
||||||
|
tracing::debug!(%fid, len=%delta_data.len(), "incremental number facet processing");
|
||||||
|
FacetsUpdateIncremental::new(
|
||||||
|
index,
|
||||||
|
FacetType::Number,
|
||||||
|
fid,
|
||||||
|
delta_data,
|
||||||
|
FACET_GROUP_SIZE,
|
||||||
|
FACET_MIN_LEVEL_SIZE,
|
||||||
|
FACET_MAX_GROUP_SIZE,
|
||||||
|
)
|
||||||
|
.execute(wtxn)?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug_assert!(crate::update::facet::sanity_checks(
|
||||||
index,
|
index,
|
||||||
modified_facet_number_ids,
|
wtxn,
|
||||||
|
fid,
|
||||||
FacetType::Number,
|
FacetType::Number,
|
||||||
|
FACET_GROUP_SIZE as usize,
|
||||||
|
FACET_MIN_LEVEL_SIZE as usize,
|
||||||
|
FACET_MAX_GROUP_SIZE as usize,
|
||||||
)
|
)
|
||||||
.execute(wtxn)?;
|
.is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
|
|
||||||
use hashbrown::HashSet;
|
use hashbrown::HashMap;
|
||||||
use heed::types::Bytes;
|
use heed::types::Bytes;
|
||||||
use heed::{Database, RoTxn};
|
use heed::{Database, RoTxn};
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
@ -12,6 +12,7 @@ use super::extract::{
|
|||||||
merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap,
|
merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap,
|
||||||
FacetKind, GeoExtractorData,
|
FacetKind, GeoExtractorData,
|
||||||
};
|
};
|
||||||
|
use crate::update::facet::new_incremental::{FacetFieldIdChange, FacetFieldIdOperation};
|
||||||
use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
|
use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
|
||||||
|
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
||||||
@ -100,23 +101,34 @@ pub fn merge_and_send_facet_docids<'extractor>(
|
|||||||
mut caches: Vec<BalancedCaches<'extractor>>,
|
mut caches: Vec<BalancedCaches<'extractor>>,
|
||||||
database: FacetDatabases,
|
database: FacetDatabases,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
|
rtxn: &RoTxn,
|
||||||
docids_sender: FacetDocidsSender,
|
docids_sender: FacetDocidsSender,
|
||||||
) -> Result<FacetFieldIdsDelta> {
|
) -> Result<FacetFieldIdsDelta> {
|
||||||
|
let max_string_count = (index.facet_id_string_docids.len(rtxn)? / 500) as usize;
|
||||||
|
let max_number_count = (index.facet_id_f64_docids.len(rtxn)? / 500) as usize;
|
||||||
|
let max_string_count = max_string_count.clamp(1000, 100_000);
|
||||||
|
let max_number_count = max_number_count.clamp(1000, 100_000);
|
||||||
transpose_and_freeze_caches(&mut caches)?
|
transpose_and_freeze_caches(&mut caches)?
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.map(|frozen| {
|
.map(|frozen| {
|
||||||
let mut facet_field_ids_delta = FacetFieldIdsDelta::default();
|
let mut facet_field_ids_delta =
|
||||||
|
FacetFieldIdsDelta::new(max_string_count, max_number_count);
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
||||||
let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?;
|
let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?;
|
||||||
match merge_cbo_bitmaps(current, del, add)? {
|
match merge_cbo_bitmaps(current, del, add)? {
|
||||||
Operation::Write(bitmap) => {
|
Operation::Write(bitmap) => {
|
||||||
facet_field_ids_delta.register_from_key(key);
|
let operation = if current.is_some() {
|
||||||
|
FacetFieldIdOperation::InPlace
|
||||||
|
} else {
|
||||||
|
FacetFieldIdOperation::Insert
|
||||||
|
};
|
||||||
|
facet_field_ids_delta.register_from_key(key, operation);
|
||||||
docids_sender.write(key, &bitmap)?;
|
docids_sender.write(key, &bitmap)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
Operation::Delete => {
|
Operation::Delete => {
|
||||||
facet_field_ids_delta.register_from_key(key);
|
facet_field_ids_delta.register_from_key(key, FacetFieldIdOperation::Remove);
|
||||||
docids_sender.delete(key)?;
|
docids_sender.delete(key)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -126,7 +138,10 @@ pub fn merge_and_send_facet_docids<'extractor>(
|
|||||||
|
|
||||||
Ok(facet_field_ids_delta)
|
Ok(facet_field_ids_delta)
|
||||||
})
|
})
|
||||||
.reduce(|| Ok(FacetFieldIdsDelta::default()), |lhs, rhs| Ok(lhs?.merge(rhs?)))
|
.reduce(
|
||||||
|
|| Ok(FacetFieldIdsDelta::new(max_string_count, max_number_count)),
|
||||||
|
|lhs, rhs| Ok(lhs?.merge(rhs?)),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct FacetDatabases<'a> {
|
pub struct FacetDatabases<'a> {
|
||||||
@ -155,60 +170,141 @@ impl<'a> FacetDatabases<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default)]
|
#[derive(Debug)]
|
||||||
|
pub enum FacetFieldIdDelta {
|
||||||
|
Bulk,
|
||||||
|
Incremental(Vec<FacetFieldIdChange>),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FacetFieldIdDelta {
|
||||||
|
fn push(&mut self, facet_value: &[u8], operation: FacetFieldIdOperation, max_count: usize) {
|
||||||
|
*self = match std::mem::replace(self, FacetFieldIdDelta::Bulk) {
|
||||||
|
FacetFieldIdDelta::Bulk => FacetFieldIdDelta::Bulk,
|
||||||
|
FacetFieldIdDelta::Incremental(mut v) => {
|
||||||
|
if v.len() >= max_count {
|
||||||
|
FacetFieldIdDelta::Bulk
|
||||||
|
} else {
|
||||||
|
v.push(FacetFieldIdChange { facet_value: facet_value.into(), operation });
|
||||||
|
FacetFieldIdDelta::Incremental(v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge(&mut self, rhs: Option<Self>, max_count: usize) {
|
||||||
|
let Some(rhs) = rhs else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
*self = match (std::mem::replace(self, FacetFieldIdDelta::Bulk), rhs) {
|
||||||
|
(FacetFieldIdDelta::Bulk, _) | (_, FacetFieldIdDelta::Bulk) => FacetFieldIdDelta::Bulk,
|
||||||
|
(
|
||||||
|
FacetFieldIdDelta::Incremental(mut left),
|
||||||
|
FacetFieldIdDelta::Incremental(mut right),
|
||||||
|
) => {
|
||||||
|
if left.len() + right.len() >= max_count {
|
||||||
|
FacetFieldIdDelta::Bulk
|
||||||
|
} else {
|
||||||
|
left.append(&mut right);
|
||||||
|
FacetFieldIdDelta::Incremental(left)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub struct FacetFieldIdsDelta {
|
pub struct FacetFieldIdsDelta {
|
||||||
/// The field ids that have been modified
|
/// The field ids that have been modified
|
||||||
modified_facet_string_ids: HashSet<FieldId>,
|
modified_facet_string_ids: HashMap<FieldId, FacetFieldIdDelta, rustc_hash::FxBuildHasher>,
|
||||||
modified_facet_number_ids: HashSet<FieldId>,
|
modified_facet_number_ids: HashMap<FieldId, FacetFieldIdDelta, rustc_hash::FxBuildHasher>,
|
||||||
|
max_string_count: usize,
|
||||||
|
max_number_count: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FacetFieldIdsDelta {
|
impl FacetFieldIdsDelta {
|
||||||
fn register_facet_string_id(&mut self, field_id: FieldId) {
|
pub fn new(max_string_count: usize, max_number_count: usize) -> Self {
|
||||||
self.modified_facet_string_ids.insert(field_id);
|
Self {
|
||||||
|
max_string_count,
|
||||||
|
max_number_count,
|
||||||
|
modified_facet_string_ids: Default::default(),
|
||||||
|
modified_facet_number_ids: Default::default(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn register_facet_number_id(&mut self, field_id: FieldId) {
|
fn register_facet_string_id(
|
||||||
self.modified_facet_number_ids.insert(field_id);
|
&mut self,
|
||||||
|
field_id: FieldId,
|
||||||
|
facet_value: &[u8],
|
||||||
|
operation: FacetFieldIdOperation,
|
||||||
|
) {
|
||||||
|
self.modified_facet_string_ids
|
||||||
|
.entry(field_id)
|
||||||
|
.or_insert(FacetFieldIdDelta::Incremental(Default::default()))
|
||||||
|
.push(facet_value, operation, self.max_string_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn register_from_key(&mut self, key: &[u8]) {
|
fn register_facet_number_id(
|
||||||
let (facet_kind, field_id) = self.extract_key_data(key);
|
&mut self,
|
||||||
match facet_kind {
|
field_id: FieldId,
|
||||||
FacetKind::Number => self.register_facet_number_id(field_id),
|
facet_value: &[u8],
|
||||||
FacetKind::String => self.register_facet_string_id(field_id),
|
operation: FacetFieldIdOperation,
|
||||||
|
) {
|
||||||
|
self.modified_facet_number_ids
|
||||||
|
.entry(field_id)
|
||||||
|
.or_insert(FacetFieldIdDelta::Incremental(Default::default()))
|
||||||
|
.push(facet_value, operation, self.max_number_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn register_from_key(&mut self, key: &[u8], operation: FacetFieldIdOperation) {
|
||||||
|
let (facet_kind, field_id, facet_value) = self.extract_key_data(key);
|
||||||
|
match (facet_kind, facet_value) {
|
||||||
|
(FacetKind::Number, Some(facet_value)) => {
|
||||||
|
self.register_facet_number_id(field_id, facet_value, operation)
|
||||||
|
}
|
||||||
|
(FacetKind::String, Some(facet_value)) => {
|
||||||
|
self.register_facet_string_id(field_id, facet_value, operation)
|
||||||
|
}
|
||||||
_ => (),
|
_ => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_key_data(&self, key: &[u8]) -> (FacetKind, FieldId) {
|
fn extract_key_data<'key>(&self, key: &'key [u8]) -> (FacetKind, FieldId, Option<&'key [u8]>) {
|
||||||
let facet_kind = FacetKind::from(key[0]);
|
let facet_kind = FacetKind::from(key[0]);
|
||||||
let field_id = FieldId::from_be_bytes([key[1], key[2]]);
|
let field_id = FieldId::from_be_bytes([key[1], key[2]]);
|
||||||
(facet_kind, field_id)
|
let facet_value = if key.len() >= 4 {
|
||||||
|
// level is also stored in the key at [3] (always 0)
|
||||||
|
Some(&key[4..])
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
(facet_kind, field_id, facet_value)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn modified_facet_string_ids(&self) -> Option<Vec<FieldId>> {
|
pub fn consume_facet_string_delta(
|
||||||
if self.modified_facet_string_ids.is_empty() {
|
&mut self,
|
||||||
None
|
) -> impl Iterator<Item = (FieldId, FacetFieldIdDelta)> + '_ {
|
||||||
} else {
|
self.modified_facet_string_ids.drain()
|
||||||
Some(self.modified_facet_string_ids.iter().copied().collect())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn modified_facet_number_ids(&self) -> Option<Vec<FieldId>> {
|
pub fn consume_facet_number_delta(
|
||||||
if self.modified_facet_number_ids.is_empty() {
|
&mut self,
|
||||||
None
|
) -> impl Iterator<Item = (FieldId, FacetFieldIdDelta)> + '_ {
|
||||||
} else {
|
self.modified_facet_number_ids.drain()
|
||||||
Some(self.modified_facet_number_ids.iter().copied().collect())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn merge(mut self, rhs: Self) -> Self {
|
pub fn merge(mut self, rhs: Self) -> Self {
|
||||||
let Self { modified_facet_number_ids, modified_facet_string_ids } = rhs;
|
// rhs.max_xx_count is assumed to be equal to self.max_xx_count, and so gets unused
|
||||||
modified_facet_number_ids.into_iter().for_each(|fid| {
|
let Self { modified_facet_number_ids, modified_facet_string_ids, .. } = rhs;
|
||||||
self.modified_facet_number_ids.insert(fid);
|
modified_facet_number_ids.into_iter().for_each(|(fid, mut delta)| {
|
||||||
|
let old_delta = self.modified_facet_number_ids.remove(&fid);
|
||||||
|
delta.merge(old_delta, self.max_number_count);
|
||||||
|
self.modified_facet_number_ids.insert(fid, delta);
|
||||||
});
|
});
|
||||||
modified_facet_string_ids.into_iter().for_each(|fid| {
|
modified_facet_string_ids.into_iter().for_each(|(fid, mut delta)| {
|
||||||
self.modified_facet_string_ids.insert(fid);
|
let old_delta = self.modified_facet_string_ids.remove(&fid);
|
||||||
|
delta.merge(old_delta, self.max_string_count);
|
||||||
|
self.modified_facet_string_ids.insert(fid, delta);
|
||||||
});
|
});
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user