mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Move crates under a sub folder to clean up the code
This commit is contained in:
parent
30f3c30389
commit
9c1e54a2c8
1062 changed files with 19 additions and 20 deletions
65
crates/milli/src/update/available_documents_ids.rs
Normal file
65
crates/milli/src/update/available_documents_ids.rs
Normal file
|
@ -0,0 +1,65 @@
|
|||
use std::iter::{Chain, FromIterator};
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use roaring::bitmap::{IntoIter, RoaringBitmap};
|
||||
|
||||
/// Iterator over the document ids that are not present in a given set of
/// already-used ids (see `from_documents_ids`).
pub struct AvailableDocumentsIds {
|
||||
// The unused ids below the largest used id, chained with the range of ids above it.
iter: Chain<IntoIter, RangeInclusive<u32>>,
|
||||
}
|
||||
|
||||
impl AvailableDocumentsIds {
|
||||
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
|
||||
match docids.max() {
|
||||
Some(last_id) => {
|
||||
let mut available = RoaringBitmap::from_iter(0..last_id);
|
||||
available -= docids;
|
||||
|
||||
let iter = match last_id.checked_add(1) {
|
||||
Some(id) => id..=u32::MAX,
|
||||
#[allow(clippy::reversed_empty_ranges)]
|
||||
None => 1..=0, // empty range iterator
|
||||
};
|
||||
|
||||
AvailableDocumentsIds { iter: available.into_iter().chain(iter) }
|
||||
}
|
||||
None => {
|
||||
let empty = RoaringBitmap::new().into_iter();
|
||||
AvailableDocumentsIds { iter: empty.chain(0..=u32::MAX) }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for AvailableDocumentsIds {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.iter.next()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the first 500 ids available for `used` are exactly the
    /// first 500 ids produced by `expected`.
    fn assert_first_available(used: &RoaringBitmap, expected: impl Iterator<Item = u32>) {
        let available = AvailableDocumentsIds::from_documents_ids(used);
        for (got, want) in available.zip(expected).take(500) {
            assert_eq!(got, want);
        }
    }

    #[test]
    fn empty() {
        let used = RoaringBitmap::new();
        assert_first_available(&used, 0..=u32::MAX);
    }

    #[test]
    fn scattered() {
        let taken = [0u32, 10, 100, 405];

        let mut used = RoaringBitmap::new();
        for id in taken {
            used.insert(id);
        }

        assert_first_available(&used, (0..=u32::MAX).filter(|id| !taken.contains(id)));
    }
}
|
149
crates/milli/src/update/clear_documents.rs
Normal file
149
crates/milli/src/update/clear_documents.rs
Normal file
|
@ -0,0 +1,149 @@
|
|||
use heed::RwTxn;
|
||||
use roaring::RoaringBitmap;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::{FieldDistribution, Index, Result};
|
||||
|
||||
/// Update operation that removes every document from an index
/// (see `ClearDocuments::execute`).
pub struct ClearDocuments<'t, 'i> {
|
||||
// Write transaction in which all the clearing operations are performed.
wtxn: &'t mut RwTxn<'i>,
|
||||
// The index whose documents and derived databases are cleared.
index: &'i Index,
|
||||
}
|
||||
|
||||
impl<'t, 'i> ClearDocuments<'t, 'i> {
|
||||
pub fn new(wtxn: &'t mut RwTxn<'i>, index: &'i Index) -> ClearDocuments<'t, 'i> {
|
||||
ClearDocuments { wtxn, index }
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace",
|
||||
skip(self),
|
||||
target = "indexing::documents",
|
||||
name = "clear_documents"
|
||||
)]
|
||||
pub fn execute(self) -> Result<u64> {
|
||||
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
||||
let Index {
|
||||
env: _env,
|
||||
main: _main,
|
||||
external_documents_ids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_prefix_docids,
|
||||
exact_word_prefix_docids,
|
||||
word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
field_id_word_count_docids,
|
||||
word_prefix_position_docids,
|
||||
word_prefix_fid_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_normalized_string_strings,
|
||||
facet_id_string_fst,
|
||||
facet_id_exists_docids,
|
||||
facet_id_is_null_docids,
|
||||
facet_id_is_empty_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
vector_arroy,
|
||||
embedder_category_id: _,
|
||||
documents,
|
||||
} = self.index;
|
||||
|
||||
let empty_roaring = RoaringBitmap::default();
|
||||
|
||||
// We retrieve the number of documents ids that we are deleting.
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
|
||||
// We clean some of the main engine datastructures.
|
||||
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
|
||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||
self.index.delete_geo_rtree(self.wtxn)?;
|
||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||
|
||||
// Remove all user-provided bits from the configs
|
||||
let mut configs = self.index.embedding_configs(self.wtxn)?;
|
||||
for config in configs.iter_mut() {
|
||||
config.user_provided.clear();
|
||||
}
|
||||
self.index.put_embedding_configs(self.wtxn, configs)?;
|
||||
|
||||
// Clear the other databases.
|
||||
external_documents_ids.clear(self.wtxn)?;
|
||||
word_docids.clear(self.wtxn)?;
|
||||
exact_word_docids.clear(self.wtxn)?;
|
||||
word_prefix_docids.clear(self.wtxn)?;
|
||||
exact_word_prefix_docids.clear(self.wtxn)?;
|
||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_position_docids.clear(self.wtxn)?;
|
||||
word_fid_docids.clear(self.wtxn)?;
|
||||
field_id_word_count_docids.clear(self.wtxn)?;
|
||||
word_prefix_position_docids.clear(self.wtxn)?;
|
||||
word_prefix_fid_docids.clear(self.wtxn)?;
|
||||
facet_id_f64_docids.clear(self.wtxn)?;
|
||||
facet_id_normalized_string_strings.clear(self.wtxn)?;
|
||||
facet_id_string_fst.clear(self.wtxn)?;
|
||||
facet_id_exists_docids.clear(self.wtxn)?;
|
||||
facet_id_is_null_docids.clear(self.wtxn)?;
|
||||
facet_id_is_empty_docids.clear(self.wtxn)?;
|
||||
facet_id_string_docids.clear(self.wtxn)?;
|
||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||
// vector
|
||||
vector_arroy.clear(self.wtxn)?;
|
||||
|
||||
documents.clear(self.wtxn)?;
|
||||
|
||||
Ok(number_of_documents)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::index::tests::TempIndex;

    #[test]
    fn clear_documents() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();
        let batch = documents!([
            { "id": 0, "name": "kevin", "age": 20 },
            { "id": 1, "name": "kevina" },
            { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } }
        ]);
        index.add_documents_using_wtxn(&mut wtxn, batch).unwrap();

        // Wipe the three documents that were just indexed.
        let deleted = ClearDocuments::new(&mut wtxn, &index).execute().unwrap();
        assert_eq!(deleted, 3);
        wtxn.commit().unwrap();

        let rtxn = index.read_txn().unwrap();

        // The fields ids map is expected to survive the clearing, it still contains
        // `[id, name, age, country, _geo, _geo.lng, _geo.lat]`.
        assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7);

        // Data structures stored in the main database.
        assert!(index.words_fst(&rtxn).unwrap().is_empty());
        assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
        assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
        assert!(index.documents_ids(&rtxn).unwrap().is_empty());
        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
        assert!(index.geo_rtree(&rtxn).unwrap().is_none());
        assert!(index.geo_faceted_documents_ids(&rtxn).unwrap().is_empty());

        // Standalone databases.
        assert!(index.word_docids.is_empty(&rtxn).unwrap());
        assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
        assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
        assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
        assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
        assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
        assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
        assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap());
        assert!(index.documents.is_empty(&rtxn).unwrap());
    }
}
|
140
crates/milli/src/update/del_add.rs
Normal file
140
crates/milli/src/update/del_add.rs
Normal file
|
@ -0,0 +1,140 @@
|
|||
use obkv::Key;
|
||||
|
||||
/// An obkv writer whose keys are [`DelAdd`] sides.
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
|
||||
/// An obkv reader whose keys are [`DelAdd`] sides.
pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
|
||||
|
||||
/// DelAdd defines the new value to add in the database and old value to delete from the database.
|
||||
///
|
||||
/// It is used as the key of an OBKV that is serialized in grenad files.
|
||||
#[repr(u8)]
|
||||
#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
|
||||
pub enum DelAdd {
|
||||
// The previous value, to be deleted from the database.
Deletion = 0,
|
||||
// The new value, to be added to the database.
Addition = 1,
|
||||
}
|
||||
|
||||
impl Key for DelAdd {
|
||||
const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
|
||||
type BYTES = [u8; Self::BYTES_SIZE];
|
||||
|
||||
fn to_be_bytes(&self) -> Self::BYTES {
|
||||
u8::to_be_bytes(*self as u8)
|
||||
}
|
||||
|
||||
fn from_be_bytes(array: Self::BYTES) -> Self {
|
||||
match u8::from_be_bytes(array) {
|
||||
0 => Self::Deletion,
|
||||
1 => Self::Addition,
|
||||
otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
|
||||
///
|
||||
/// Deletion: put all the values under DelAdd::Deletion
|
||||
/// Addition: put all the values under DelAdd::Addition,
|
||||
/// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition,
|
||||
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||
reader: obkv::KvReader<'_, K>,
|
||||
operation: DelAddOperation,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
into_del_add_obkv_conditional_operation(reader, buffer, |_| operation)
|
||||
}
|
||||
|
||||
/// Akin to the [into_del_add_obkv] function but lets you
|
||||
/// conditionally define the `DelAdd` variant based on the obkv key.
|
||||
pub fn into_del_add_obkv_conditional_operation<K, F>(
|
||||
reader: obkv::KvReader<'_, K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
operation: F,
|
||||
) -> std::io::Result<()>
|
||||
where
|
||||
K: obkv::Key + PartialOrd,
|
||||
F: Fn(K) -> DelAddOperation,
|
||||
{
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for (key, value) in reader.iter() {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let operation = operation(key);
|
||||
if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Deletion, value)?;
|
||||
}
|
||||
if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Addition, value)?;
|
||||
}
|
||||
value_writer.finish()?;
|
||||
writer.insert(key, &value_buffer)?;
|
||||
}
|
||||
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
/// Enum controlling the side of the DelAdd obkv in which the provided value will be written.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum DelAddOperation {
|
||||
// Write the value under `DelAdd::Deletion` only.
Deletion,
|
||||
// Write the value under `DelAdd::Addition` only.
Addition,
|
||||
// Write the value under both `DelAdd::Deletion` and `DelAdd::Addition`.
DeletionAndAddition,
|
||||
}
|
||||
|
||||
/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
|
||||
///
|
||||
/// putting each deletion obkv's keys under an DelAdd::Deletion
|
||||
/// and putting each addition obkv's keys under an DelAdd::Addition
|
||||
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||
deletion: &obkv::KvReader<'_, K>,
|
||||
addition: &obkv::KvReader<'_, K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
value_buffer.clear();
|
||||
match eob {
|
||||
Left((k, v)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, v).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
Right((k, v)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, v).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
Both((k, deletion), (_, addition)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd<'_>) -> bool {
|
||||
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
|
||||
}
|
||||
|
||||
/// A function that extracts and returns the Add side of a DelAdd obkv.
|
||||
/// This is useful when there are no previous value in the database and
|
||||
/// therefore we don't need to do a diff with what's already there.
|
||||
///
|
||||
/// If there is no Add side we currently write an empty buffer
|
||||
/// which is a valid CboRoaringBitmap.
|
||||
#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
|
||||
pub fn deladd_serialize_add_side<'a>(
|
||||
obkv: &'a [u8],
|
||||
_buffer: &mut Vec<u8>,
|
||||
) -> crate::Result<&'a [u8]> {
|
||||
Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
|
||||
}
|
533
crates/milli/src/update/facet/bulk.rs
Normal file
533
crates/milli/src/update/facet/bulk.rs
Normal file
|
@ -0,0 +1,533 @@
|
|||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::{CompressionType, Merger};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||
use crate::update::MergeFn;
|
||||
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
|
||||
|
||||
/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
|
||||
/// by rebuilding the database "from scratch".
|
||||
///
|
||||
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
||||
/// higher levels are cleared and recomputed from the content of level 0.
|
||||
pub struct FacetsUpdateBulk<'i> {
|
||||
// The index whose facet database is updated.
index: &'i Index,
|
||||
// Number of elements of a level that are merged into one element of the level above.
group_size: u8,
|
||||
// Minimum number of elements a level must contain to be written.
min_level_size: u8,
|
||||
// Selects the string or the f64 facet database.
facet_type: FacetType,
|
||||
// The fields whose higher levels are cleared and recomputed.
field_ids: Vec<FieldId>,
|
||||
// None if level 0 does not need to be updated
|
||||
delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateBulk<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
FacetsUpdateBulk {
|
||||
index,
|
||||
field_ids,
|
||||
group_size,
|
||||
min_level_size,
|
||||
facet_type,
|
||||
delta_data: Some(delta_data),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_not_updating_level_0(
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
FacetsUpdateBulk {
|
||||
index,
|
||||
field_ids,
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
delta_data: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::bulk")]
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> {
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
|
||||
|
||||
let db = match facet_type {
|
||||
FacetType::String => {
|
||||
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
FacetType::Number => {
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
};
|
||||
|
||||
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
|
||||
|
||||
inner.update(wtxn, &field_ids)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||
// The facet database to update, keyed by raw-bytes facet group keys.
pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
// The DelAdd entries to merge into level 0; `None` when level 0 must be left untouched.
pub delta_data: Option<Merger<R, MergeFn>>,
|
||||
// Number of elements of a level that are merged into one element of the level above.
pub group_size: u8,
|
||||
// Minimum number of elements a level must contain to be written.
pub min_level_size: u8,
|
||||
}
|
||||
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
pub fn update(mut self, wtxn: &mut RwTxn<'_>, field_ids: &[u16]) -> Result<()> {
|
||||
self.update_level0(wtxn)?;
|
||||
for &field_id in field_ids.iter() {
|
||||
self.clear_levels(wtxn, field_id)?;
|
||||
}
|
||||
|
||||
for &field_id in field_ids.iter() {
|
||||
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
for level_reader in level_readers {
|
||||
let mut cursor = level_reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
self.db.remap_types::<Bytes, Bytes>().put(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn clear_levels(&self, wtxn: &mut heed::RwTxn<'_>, field_id: FieldId) -> Result<()> {
|
||||
let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] };
|
||||
let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] };
|
||||
let range = left..=right;
|
||||
self.db.delete_range(wtxn, &range).map(drop)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_level0(&mut self, wtxn: &mut RwTxn<'_>) -> Result<()> {
|
||||
let delta_data = match self.delta_data.take() {
|
||||
Some(x) => x,
|
||||
None => return Ok(()),
|
||||
};
|
||||
if self.db.is_empty(wtxn)? {
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = self.db.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let value = KvReaderDelAdd::new(value);
|
||||
|
||||
// DB is empty, it is safe to ignore Del operations
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.clear();
|
||||
// the group size for level 0
|
||||
buffer.push(1);
|
||||
// then we extend the buffer with the docids bitmap
|
||||
buffer.extend_from_slice(value);
|
||||
unsafe {
|
||||
database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, &buffer)?
|
||||
};
|
||||
}
|
||||
} else {
|
||||
let mut buffer = Vec::new();
|
||||
let database = self.db.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut iter = delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = KvReaderDelAdd::new(value);
|
||||
|
||||
// the value is a CboRoaringBitmap, but I still need to prepend the
|
||||
// group size for level 0 (= 1) to it
|
||||
buffer.clear();
|
||||
buffer.push(1);
|
||||
// then we extend the buffer with the docids bitmap
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => {
|
||||
// prev_value is the group size for level 0, followed by the previous bitmap.
|
||||
let old_bitmap = &prev_value[1..];
|
||||
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
|
||||
}
|
||||
None => {
|
||||
// it is safe to ignore the del in that case.
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
// won't put the key in DB as the value would be empty
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.extend_from_slice(value);
|
||||
}
|
||||
};
|
||||
let new_bitmap = &buffer[1..];
|
||||
// if the new bitmap is empty, let's remove it
|
||||
if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
|
||||
database.delete(wtxn, key)?;
|
||||
} else {
|
||||
database.put(wtxn, key, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn compute_levels_for_field_id(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
txn: &RoTxn<'_>,
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
|
||||
|
||||
Ok(subwriters)
|
||||
}
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn read_level_0<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
field_id: u16,
|
||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
// we read the elements one by one and
|
||||
// 1. keep track of the left bound
|
||||
// 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read
|
||||
let mut bitmaps = vec![];
|
||||
|
||||
let mut level_0_prefix = vec![];
|
||||
level_0_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level_0_prefix.push(0);
|
||||
|
||||
let level_0_iter = self
|
||||
.db
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.prefix_iter(rtxn, level_0_prefix.as_slice())?
|
||||
.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>();
|
||||
|
||||
let mut left_bound: &[u8] = &[];
|
||||
let mut first_iteration_for_new_group = true;
|
||||
for el in level_0_iter {
|
||||
let (key, value) = el?;
|
||||
let bound = key.left_bound;
|
||||
let docids = value.bitmap;
|
||||
|
||||
if first_iteration_for_new_group {
|
||||
left_bound = bound;
|
||||
first_iteration_for_new_group = false;
|
||||
}
|
||||
bitmaps.push(docids);
|
||||
|
||||
if bitmaps.len() == self.group_size as usize {
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
first_iteration_for_new_group = true;
|
||||
bitmaps.clear();
|
||||
}
|
||||
}
|
||||
// don't forget to give the leftover bitmaps as well
|
||||
if !bitmaps.is_empty() {
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
bitmaps.clear();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute the content of the database levels from its level 0 for the given field id.
|
||||
///
|
||||
/// ## Returns:
|
||||
/// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1`
|
||||
/// that must be inserted into the database.
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn compute_higher_levels<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
field_id: u16,
|
||||
level: u8,
|
||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
if level == 0 {
|
||||
self.read_level_0(rtxn, field_id, handle_group)?;
|
||||
// Level 0 is already in the database
|
||||
return Ok(vec![]);
|
||||
}
|
||||
// level >= 1
|
||||
// we compute each element of this level based on the elements of the level below it
|
||||
// once we have computed `level_group_size` elements, we give the left bound
|
||||
// of those elements, and their bitmaps, to the level above
|
||||
|
||||
let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
|
||||
let mut cur_writer_len: usize = 0;
|
||||
|
||||
let mut group_sizes = vec![];
|
||||
let mut left_bounds = vec![];
|
||||
let mut bitmaps = vec![];
|
||||
|
||||
// compute the levels below
|
||||
// in the callback, we fill `cur_writer` with the correct elements for this level
|
||||
let mut sub_writers = self.compute_higher_levels(
|
||||
rtxn,
|
||||
field_id,
|
||||
level - 1,
|
||||
&mut |sub_bitmaps, left_bound| {
|
||||
let mut combined_bitmap = RoaringBitmap::default();
|
||||
for bitmap in sub_bitmaps {
|
||||
combined_bitmap |= bitmap;
|
||||
}
|
||||
// The conversion of sub_bitmaps.len() to a u8 will always be correct
|
||||
// since its length is bounded by max_group_size, which is a u8.
|
||||
group_sizes.push(sub_bitmaps.len() as u8);
|
||||
left_bounds.push(left_bound);
|
||||
|
||||
bitmaps.push(combined_bitmap);
|
||||
if bitmaps.len() != self.group_size as usize {
|
||||
return Ok(());
|
||||
}
|
||||
let left_bound = left_bounds.first().unwrap();
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
|
||||
for ((bitmap, left_bound), group_size) in
|
||||
bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..))
|
||||
{
|
||||
let key = FacetGroupKey { field_id, level, left_bound };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key)
|
||||
.map_err(Error::Encoding)?;
|
||||
let value = FacetGroupValue { size: group_size, bitmap };
|
||||
let value =
|
||||
FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?;
|
||||
cur_writer.insert(key, value)?;
|
||||
cur_writer_len += 1;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
// don't forget to insert the leftover elements into the writer as well
|
||||
|
||||
// but only do so if the current number of elements to be inserted into this
|
||||
// levelcould grow to the minimum level size
|
||||
|
||||
if !bitmaps.is_empty() && (cur_writer_len >= self.min_level_size as usize - 1) {
|
||||
// the length of bitmaps is between 0 and group_size
|
||||
assert!(bitmaps.len() < self.group_size as usize);
|
||||
assert!(cur_writer_len > 0);
|
||||
|
||||
let left_bound = left_bounds.first().unwrap();
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
|
||||
// Note: how many bitmaps are there here?
|
||||
for ((bitmap, left_bound), group_size) in
|
||||
bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..))
|
||||
{
|
||||
let key = FacetGroupKey { field_id, level, left_bound };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key)
|
||||
.map_err(Error::Encoding)?;
|
||||
let value = FacetGroupValue { size: group_size, bitmap };
|
||||
let value = FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?;
|
||||
cur_writer.insert(key, value)?;
|
||||
cur_writer_len += 1;
|
||||
}
|
||||
}
|
||||
// if we inserted enough elements to reach the minimum level size, then we push the writer
|
||||
if cur_writer_len >= self.min_level_size as usize {
|
||||
sub_writers.push(writer_into_reader(cur_writer)?);
|
||||
} else {
|
||||
// otherwise, if there are still leftover elements, we give them to the level above
|
||||
// this is necessary in order to get the union of all docids
|
||||
if !bitmaps.is_empty() {
|
||||
handle_group(&bitmaps, left_bounds.first().unwrap())?;
|
||||
}
|
||||
}
|
||||
Ok(sub_writers)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter::once;
|
||||
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
use crate::heed_codec::StrRefCodec;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::update::facet::test_helpers::{ordered_string, FacetIndex};
|
||||
use crate::{db_snap, milli_snap};
|
||||
|
||||
#[test]
|
||||
fn insert() {
|
||||
let test = |name: &str, group_size: u8, min_level_size: u8| {
|
||||
let index =
|
||||
FacetIndex::<OrderedF64Codec>::new(group_size, 0 /*NA*/, min_level_size);
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..1_000u32 {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, i as f64), once(i).collect()));
|
||||
}
|
||||
for i in 0..100u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
milli_snap!(format!("{index}"), name);
|
||||
};
|
||||
|
||||
test("default", 4, 5);
|
||||
test("small_group_small_min_level", 2, 2);
|
||||
test("small_group_large_min_level", 2, 128);
|
||||
test("large_group_small_min_level", 16, 2);
|
||||
test("odd_group_odd_min_level", 7, 3);
|
||||
}
|
||||
#[test]
|
||||
fn insert_delete_field_insert() {
|
||||
let test = |name: &str, group_size: u8, min_level_size: u8| {
|
||||
let index =
|
||||
FacetIndex::<OrderedF64Codec>::new(group_size, 0 /*NA*/, min_level_size);
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..100u32 {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, i as f64), once(i).collect()));
|
||||
}
|
||||
for i in 0..100u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
// delete all the elements for the facet id 0
|
||||
for i in 0..100u32 {
|
||||
index.delete_single_docid(&mut wtxn, 0, &(i as f64), i);
|
||||
}
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
// then add some elements again for the facet id 1
|
||||
for i in 0..110u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
milli_snap!(format!("{index}"), name);
|
||||
};
|
||||
|
||||
test("default", 4, 5);
|
||||
test("small_group_small_min_level", 2, 2);
|
||||
test("small_group_large_min_level", 2, 128);
|
||||
test("large_group_small_min_level", 16, 2);
|
||||
test("odd_group_odd_min_level", 7, 3);
|
||||
}
|
||||
|
||||
#[test]
fn bug_3165() {
    // Regression test for https://github.com/meilisearch/meilisearch/issues/3165
    //
    // Indexing a number of facet values that falls within certain ranges
    // (e.g. 22_540 qualifies) used to produce a facet DB which was missing some
    // levels. Before writing a level into the database, its size was compared
    // with the minimum level size through a lossy integer conversion:
    // `level_size as u8 >= min_level_size`.
    //
    // The missing level in the facet DBs would then make the incremental
    // indexer (and other search algorithms) crash.
    let index = TempIndex::new_with_map_size(4096 * 1000 * 100);

    index
        .update_settings(|settings| {
            settings.set_primary_key("id".to_owned());
            settings.set_filterable_fields(hashset! { S("id") });
        })
        .unwrap();

    // One document per id in `0..=22_540`, each holding nothing but its numeric `id`.
    let documents: Vec<_> = (0..=22_540u64)
        .map(|id| serde_json::json!({ "id": id }).as_object().unwrap().clone())
        .collect();

    let documents = documents_batch_reader_from_objects(documents);
    index.add_documents(documents).unwrap();

    db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
}
|
||||
|
||||
#[test]
fn insert_string() {
    // Bulk-inserts string facet values for field ids 0 and 1, checks the
    // resulting tree structure and snapshots the database under `name`.
    let run_case = |name: &str, group_size: u8, min_level_size: u8| {
        // `bulk_insert` only reads `group_size` and `min_level_size`, hence the dummy 0.
        let index = FacetIndex::<StrRefCodec>::new(group_size, 0 /*NA*/, min_level_size);

        let strings: Vec<String> = (0..1_000usize).map(ordered_string).collect();
        let mut elements = Vec::<((u16, &str), RoaringBitmap)>::new();

        // field id = 0, left_bound = strings[i], docids = [i] for all 1000 strings
        for (docid, bound) in (0u32..).zip(&strings) {
            elements.push(((0, bound.as_str()), once(docid).collect()));
        }
        // field id = 1, left_bound = strings[i], docids = [i] for the first 100 strings
        for (docid, bound) in (0u32..100).zip(&strings) {
            elements.push(((1, bound.as_str()), once(docid).collect()));
        }

        let mut wtxn = index.env.write_txn().unwrap();
        index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());

        index.verify_structure_validity(&wtxn, 0);
        index.verify_structure_validity(&wtxn, 1);

        wtxn.commit().unwrap();

        milli_snap!(format!("{index}"), name);
    };

    // (snapshot name, group_size, min_level_size)
    let cases: [(&str, u8, u8); 5] = [
        ("default", 4, 5),
        ("small_group_small_min_level", 2, 2),
        ("small_group_large_min_level", 2, 128),
        ("large_group_small_min_level", 16, 2),
        ("odd_group_odd_min_level", 7, 3),
    ];
    for (name, group_size, min_level_size) in cases {
        run_case(name, group_size, min_level_size);
    }
}
|
||||
}
|
1266
crates/milli/src/update/facet/incremental.rs
Normal file
1266
crates/milli/src/update/facet/incremental.rs
Normal file
File diff suppressed because it is too large
Load diff
641
crates/milli/src/update/facet/mod.rs
Normal file
641
crates/milli/src/update/facet/mod.rs
Normal file
|
@ -0,0 +1,641 @@
|
|||
/*!
|
||||
This module implements two different algorithms for updating the `facet_id_string_docids`
|
||||
and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that
|
||||
it recreates the database from scratch when new elements are added to it. The second algorithm
|
||||
is incremental: it modifies the database as little as possible.
|
||||
|
||||
The databases must be able to return results for queries such as:
|
||||
1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y
|
||||
2. Min/Max : find the minimum/maximum facet value among these document ids
|
||||
3. Sort : sort these document ids by increasing/decreasing facet values
|
||||
4. Distribution : given some document ids, make a list of each facet value
|
||||
found in these documents along with the number of documents that contain it
|
||||
|
||||
The algorithms that implement these queries are found in the `src/search/facet` folder.
|
||||
|
||||
To make these queries fast to compute, the database adopts a tree structure:
|
||||
```text
|
||||
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
|
||||
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
|
||||
│Level 2│ │ │ │ │
|
||||
└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │
|
||||
├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤
|
||||
┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │
|
||||
│Level 1│ │ │ │ │ │ │
|
||||
└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │
|
||||
├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤
|
||||
┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │
|
||||
│Level 0│ │ │ │ │ │ │ │ │ │ │ │
|
||||
└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │
|
||||
└───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘
|
||||
```
|
||||
In the diagram above, each cell corresponds to a node in the tree. The first line of the cell
|
||||
contains the left bound of the range of facet values as well as the number of children of the node.
|
||||
The second line contains the document ids which have a facet value within the range of the node.
|
||||
The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range.
|
||||
|
||||
In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because
|
||||
`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`.
|
||||
These documents all contain a facet value that is contained within `ab .. gaf`.
|
||||
|
||||
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
|
||||
[`FacetGroupValue`], which have the following format:
|
||||
|
||||
```text
|
||||
FacetGroupKey:
|
||||
- field id : u16
|
||||
- level : u8
|
||||
- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str
|
||||
|
||||
FacetGroupValue:
|
||||
- #children : u8
|
||||
- docids : RoaringBitmap
|
||||
```
|
||||
|
||||
When the database is first created using the "bulk" method, each node has a fixed number of children
|
||||
(except for possibly the last one) given by the `group_size` parameter (defaults to `FACET_GROUP_SIZE`).
|
||||
The tree is also built such that the highest level has more than `min_level_size`
|
||||
(defaults to `FACET_MIN_LEVEL_SIZE`) elements in it.
|
||||
|
||||
When the database is incrementally updated, the number of children of a node can vary between
|
||||
1 and `max_group_size`. This is done so that most incremental operations do not need to change
|
||||
the structure of the tree. When the number of children of a node reaches `max_group_size`,
|
||||
we split the node in two and update the number of children of its parent.
|
||||
|
||||
When adding documents to the databases, it is important to determine which method to use to
|
||||
minimise indexing time. The incremental method is faster when adding few new facet values, but the
|
||||
bulk method is faster when a large part of the database is modified. Empirically, it seems that
|
||||
it takes 50x more time to incrementally add N facet values to an existing database than it does to
|
||||
construct a database of N facet values. This is the heuristic that is used to choose between the
|
||||
two methods.
|
||||
|
||||
Related PR: https://github.com/meilisearch/milli/pull/619
|
||||
*/
|
||||
|
||||
/// Default `max_group_size`: the number of children at which the incremental
/// indexer splits a node in two.
pub const FACET_MAX_GROUP_SIZE: u8 = 8;
|
||||
/// Default `group_size`: the number of children given to each node (except
/// possibly the last one) when the tree is built with the bulk method.
pub const FACET_GROUP_SIZE: u8 = 4;
|
||||
/// Default `min_level_size`: the size threshold that the highest level of the
/// tree must satisfy for that level to be kept.
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::Merger;
|
||||
use heed::types::{Bytes, DecodeIgnore};
|
||||
use time::OffsetDateTime;
|
||||
use tracing::debug;
|
||||
|
||||
use self::incremental::FacetsUpdateIncremental;
|
||||
use super::FacetsUpdateBulk;
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::MergeFn;
|
||||
use crate::{try_split_array_at, FieldId, Index, Result};
|
||||
|
||||
pub mod bulk;
|
||||
pub mod incremental;
|
||||
|
||||
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||
///
|
||||
/// Depending on the number of new elements and the existing size of the database, we use either
|
||||
/// a bulk update method or an incremental update method.
|
||||
pub struct FacetsUpdate<'i> {
|
||||
/// The index owning the facet databases being updated.
index: &'i Index,
|
||||
/// The facet database selected by `facet_type`, with its key codec remapped to raw bytes.
database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
/// Whether the string or the number facet database is targeted.
facet_type: FacetType,
|
||||
/// The merged changes handed over to the bulk or incremental updater.
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
/// When present, the changes applied to the facet-search databases once the main update is done.
normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
/// Forwarded to the updaters; initialised to `FACET_GROUP_SIZE`.
group_size: u8,
|
||||
/// Forwarded to the incremental updater; initialised to `FACET_MAX_GROUP_SIZE`.
max_group_size: u8,
|
||||
/// Forwarded to the updaters; initialised to `FACET_MIN_LEVEL_SIZE`.
min_level_size: u8,
|
||||
/// Size of the update: zero skips the update entirely, and it is compared with the
/// database length to pick between the bulk and incremental methods.
/// NOTE(review): presumably the number of entries in `delta_data` — confirm with callers.
data_size: u64,
|
||||
}
|
||||
impl<'i> FacetsUpdate<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
data_size: u64,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => {
|
||||
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
FacetType::Number => {
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
};
|
||||
Self {
|
||||
index,
|
||||
database,
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
delta_data,
|
||||
normalized_delta_data,
|
||||
data_size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> {
|
||||
if self.data_size == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
// See self::comparison_bench::benchmark_facet_indexing
|
||||
if self.data_size >= (self.database.len(wtxn)? / 500) {
|
||||
let field_ids =
|
||||
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
||||
let bulk_update = FacetsUpdateBulk::new(
|
||||
self.index,
|
||||
field_ids,
|
||||
self.facet_type,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
);
|
||||
bulk_update.execute(wtxn)?;
|
||||
} else {
|
||||
let incremental_update = FacetsUpdateIncremental::new(
|
||||
self.index,
|
||||
self.facet_type,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
self.max_group_size,
|
||||
);
|
||||
incremental_update.execute(wtxn)?;
|
||||
}
|
||||
|
||||
match self.normalized_delta_data {
|
||||
Some(data) => index_facet_search(wtxn, data, self.index),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn index_facet_search(
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
normalized_delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
index: &Index,
|
||||
) -> Result<()> {
|
||||
let mut iter = normalized_delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key_bytes, delta_bytes)) = iter.next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(delta_bytes);
|
||||
|
||||
let database_set = index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.get(wtxn, key_bytes)?
|
||||
.unwrap_or_default();
|
||||
|
||||
let add_set = deladd_reader
|
||||
.get(DelAdd::Addition)
|
||||
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
|
||||
.unwrap_or_default();
|
||||
|
||||
let del_set = match deladd_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
|
||||
{
|
||||
Some(del_set) => {
|
||||
let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
let mut set = BTreeSet::new();
|
||||
for facet in del_set {
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() };
|
||||
// Check if the referenced value doesn't exist anymore before deleting it.
|
||||
if index
|
||||
.facet_id_string_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(wtxn, &key)?
|
||||
.is_none()
|
||||
{
|
||||
set.insert(facet);
|
||||
}
|
||||
}
|
||||
set
|
||||
}
|
||||
None => BTreeSet::new(),
|
||||
};
|
||||
|
||||
let set: BTreeSet<_> =
|
||||
database_set.difference(&del_set).chain(add_set.iter()).cloned().collect();
|
||||
|
||||
if set.is_empty() {
|
||||
index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.delete(wtxn, key_bytes)?;
|
||||
} else {
|
||||
index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.put(wtxn, key_bytes, &set)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We clear the FST of normalized-for-search to compute everything from scratch.
|
||||
index.facet_id_string_fst.clear(wtxn)?;
|
||||
// We compute one FST by string facet
|
||||
let mut text_fsts = vec![];
|
||||
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
|
||||
let database = index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
|
||||
for result in database.iter(wtxn)? {
|
||||
let ((field_id, normalized_facet), _) = result?;
|
||||
current_fst = match current_fst.take() {
|
||||
Some((fid, fst_builder)) if fid != field_id => {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((fid, fst));
|
||||
Some((field_id, fst::SetBuilder::memory()))
|
||||
}
|
||||
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
|
||||
None => Some((field_id, fst::SetBuilder::memory())),
|
||||
};
|
||||
|
||||
if let Some((_, fst_builder)) = current_fst.as_mut() {
|
||||
fst_builder.insert(normalized_facet)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((field_id, fst_builder)) = current_fst {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((field_id, fst));
|
||||
}
|
||||
|
||||
// We write those FSTs in LMDB now
|
||||
for (field_id, fst) in text_fsts {
|
||||
index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_helpers {
|
||||
use std::cell::Cell;
|
||||
use std::fmt::Display;
|
||||
use std::iter::FromIterator;
|
||||
use std::marker::PhantomData;
|
||||
use std::rc::Rc;
|
||||
|
||||
use grenad::MergerBuilder;
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::bulk::FacetsUpdateBulkInner;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps;
|
||||
use crate::update::{FacetsUpdateIncrementalInner, MergeFn};
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
/// Utility function to generate a string whose position in a lexicographically
/// ordered list is `i`.
///
/// `0` maps to the empty string. Any other value maps to the five lowercase
/// letters spelling `i` in base 26 (`'a'` = 0, most significant letter first).
/// Base-26 digits beyond the fifth are dropped.
pub fn ordered_string(mut i: usize) -> String {
    // The first string is empty
    if i == 0 {
        return String::new();
    }
    // The others are 5 char long, each between 'a' and 'z'.
    // Fill the letters from the least significant (rightmost) one.
    let mut letters = [b'a'; 5];
    for letter in letters.iter_mut().rev() {
        *letter = b'a' + (i % 26) as u8;
        i /= 26;
    }
    letters.iter().map(|&byte| char::from(byte)).collect()
}
|
||||
|
||||
/// A dummy index that only contains the facet database, used for testing.
///
/// `BoundCodec` is the codec used to encode and decode the left bound of the
/// facet values stored in `content`.
|
||||
pub struct FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
/// The heed environment holding `content`.
pub env: Env,
|
||||
/// The facet database, keyed by raw-bytes facet group keys.
pub content: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
/// `group_size` passed to the bulk and incremental updaters.
pub group_size: Cell<u8>,
|
||||
/// `min_level_size` passed to the bulk and incremental updaters.
pub min_level_size: Cell<u8>,
|
||||
/// `max_group_size` passed to the incremental updater.
pub max_group_size: Cell<u8>,
|
||||
/// Handle on the temporary directory backing `env`; never read after construction.
_tempdir: Rc<tempfile::TempDir>,
|
||||
/// Ties the otherwise unused `BoundCodec` parameter to the struct.
_phantom: PhantomData<BoundCodec>,
|
||||
}
|
||||
|
||||
impl<BoundCodec> FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
/// Opens the facet database already stored in `tempdir` (fuzzing builds only).
///
/// The parameters are sanitised before use:
/// - `group_size` is clamped to `2..=16`;
/// - `max_group_size` is raised to at least `2 * group_size`, then capped at 16;
/// - `min_level_size` is clamped to `1..=17`.
///
/// NOTE(review): unlike `new`, this calls `options.open` outside of `unsafe`
/// and `open_database` without a transaction — confirm that it still compiles
/// when the `fuzzing` cfg is enabled.
#[cfg(all(test, fuzzing))]
pub fn open_from_tempdir(
    tempdir: Rc<tempfile::TempDir>,
    group_size: u8,
    max_group_size: u8,
    min_level_size: u8,
) -> FacetIndex<BoundCodec> {
    let group_size = group_size.clamp(2, 16);
    // The cap wins over the `2 * group_size` lower bound when `group_size > 8`.
    let max_group_size = (group_size * 2).max(max_group_size).min(16);
    let min_level_size = min_level_size.clamp(1, 17);

    let mut options = heed::EnvOpenOptions::new();
    let options = options.map_size(4096 * 4 * 10 * 1000);
    unsafe {
        options.flag(heed::flags::Flags::MdbAlwaysFreePages);
    }
    let env = options.open(tempdir.path()).unwrap();
    let content = env.open_database(None).unwrap().unwrap();

    FacetIndex {
        env,
        content,
        group_size: Cell::new(group_size),
        min_level_size: Cell::new(min_level_size),
        max_group_size: Cell::new(max_group_size),
        _tempdir: tempdir,
        _phantom: PhantomData,
    }
}
|
||||
pub fn new(
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetIndex<BoundCodec> {
|
||||
let group_size = group_size.clamp(2, 127);
|
||||
let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127
|
||||
let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf
|
||||
let mut options = heed::EnvOpenOptions::new();
|
||||
let options = options.map_size(4096 * 4 * 1000 * 100);
|
||||
let tempdir = tempfile::TempDir::new().unwrap();
|
||||
let env = unsafe { options.open(tempdir.path()) }.unwrap();
|
||||
let mut wtxn = env.write_txn().unwrap();
|
||||
let content = env.create_database(&mut wtxn, None).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
FacetIndex {
|
||||
content,
|
||||
group_size: Cell::new(group_size),
|
||||
max_group_size: Cell::new(max_group_size),
|
||||
min_level_size: Cell::new(min_level_size),
|
||||
_tempdir: Rc::new(tempdir),
|
||||
env,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the group size, clamped to `2..=64` (fuzzing builds only).
#[cfg(all(test, fuzzing))]
pub fn set_group_size(&self, group_size: u8) {
    // 2 <= x <= 64
    let clamped = group_size.clamp(2, 64);
    self.group_size.set(clamped);
}
|
||||
/// Sets the maximum group size, clamped to `4..=128` (fuzzing builds only).
///
/// The group size is then raised to half of the new maximum when it is
/// currently below that value.
/// NOTE(review): the original comment states `2*group_size <= x`, but this
/// adjustment only ever *raises* the group size — confirm the intended bound.
#[cfg(all(test, fuzzing))]
pub fn set_max_group_size(&self, max_group_size: u8) {
    // 4 <= x <= 128
    let max_group_size = max_group_size.clamp(4, 128);
    self.max_group_size.set(max_group_size);

    let half = max_group_size / 2;
    if self.group_size.get() < half {
        self.group_size.set(half);
    }
}
|
||||
/// Sets the minimum level size, raised to at least 1 (fuzzing builds only).
#[cfg(all(test, fuzzing))]
pub fn set_min_level_size(&self, min_level_size: u8) {
    // 1 <= x <= inf
    let at_least_one = min_level_size.max(1);
    self.min_level_size.set(at_least_one);
}
|
||||
|
||||
pub fn insert<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docids: &RoaringBitmap,
|
||||
) {
|
||||
let update = FacetsUpdateIncrementalInner {
|
||||
db: self.content,
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
max_group_size: self.max_group_size.get(),
|
||||
};
|
||||
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
|
||||
update.modify(wtxn, field_id, &key_bytes, Some(docids), None).unwrap();
|
||||
update.add_or_delete_level(wtxn, field_id).unwrap();
|
||||
}
|
||||
pub fn delete_single_docid<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docid: u32,
|
||||
) {
|
||||
self.delete(wtxn, field_id, key, &RoaringBitmap::from_iter(std::iter::once(docid)))
|
||||
}
|
||||
|
||||
pub fn delete<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docids: &RoaringBitmap,
|
||||
) {
|
||||
let update = FacetsUpdateIncrementalInner {
|
||||
db: self.content,
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
max_group_size: self.max_group_size.get(),
|
||||
};
|
||||
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
|
||||
update.modify(wtxn, field_id, &key_bytes, None, Some(docids)).unwrap();
|
||||
update.add_or_delete_level(wtxn, field_id).unwrap();
|
||||
}
|
||||
|
||||
pub fn bulk_insert<'a, 'b>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_ids: &[u16],
|
||||
els: impl IntoIterator<
|
||||
Item = &'a ((u16, <BoundCodec as BytesEncode<'a>>::EItem), RoaringBitmap),
|
||||
>,
|
||||
) where
|
||||
for<'c> <BoundCodec as BytesEncode<'c>>::EItem: Sized,
|
||||
{
|
||||
let mut new_data = vec![];
|
||||
let mut writer = grenad::Writer::new(&mut new_data);
|
||||
for ((field_id, left_bound), docids) in els {
|
||||
let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned();
|
||||
let key: FacetGroupKey<&[u8]> =
|
||||
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key).unwrap();
|
||||
let mut inner_writer = KvWriterDelAdd::memory();
|
||||
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
|
||||
inner_writer.insert(DelAdd::Addition, value).unwrap();
|
||||
writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
|
||||
}
|
||||
writer.finish().unwrap();
|
||||
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
builder.push(reader.into_cursor().unwrap());
|
||||
let merger = builder.build();
|
||||
|
||||
let update = FacetsUpdateBulkInner {
|
||||
db: self.content,
|
||||
delta_data: Some(merger),
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
};
|
||||
|
||||
update.update(wtxn, field_ids).unwrap();
|
||||
}
|
||||
|
||||
pub fn verify_structure_validity(&self, txn: &RoTxn<'_>, field_id: u16) {
|
||||
let mut field_id_prefix = vec![];
|
||||
field_id_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
let highest_level = get_highest_level(txn, self.content, field_id).unwrap();
|
||||
|
||||
for level_no in (1..=highest_level).rev() {
|
||||
let mut level_no_prefix = vec![];
|
||||
level_no_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level_no_prefix.push(level_no);
|
||||
|
||||
let iter = self
|
||||
.content
|
||||
.remap_types::<Bytes, FacetGroupValueCodec>()
|
||||
.prefix_iter(txn, &level_no_prefix)
|
||||
.unwrap();
|
||||
for el in iter {
|
||||
let (key, value) = el.unwrap();
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap();
|
||||
|
||||
let mut prefix_start_below = vec![];
|
||||
prefix_start_below.extend_from_slice(&field_id.to_be_bytes());
|
||||
prefix_start_below.push(level_no - 1);
|
||||
prefix_start_below.extend_from_slice(key.left_bound);
|
||||
|
||||
let start_below = {
|
||||
let mut start_below_iter = self
|
||||
.content
|
||||
.remap_types::<Bytes, FacetGroupValueCodec>()
|
||||
.prefix_iter(txn, &prefix_start_below)
|
||||
.unwrap();
|
||||
let (key_bytes, _) = start_below_iter.next().unwrap().unwrap();
|
||||
FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes).unwrap()
|
||||
};
|
||||
|
||||
assert!(value.size > 0);
|
||||
|
||||
let mut actual_size = 0;
|
||||
let mut values_below = RoaringBitmap::new();
|
||||
let iter_below = self
|
||||
.content
|
||||
.range(txn, &(start_below..))
|
||||
.unwrap()
|
||||
.take(value.size as usize);
|
||||
for el in iter_below {
|
||||
let (_, value) = el.unwrap();
|
||||
actual_size += 1;
|
||||
values_below |= value.bitmap;
|
||||
}
|
||||
assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}");
|
||||
|
||||
assert_eq!(value.bitmap, values_below);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<BoundCodec> Display for FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized + Display,
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let txn = self.env.read_txn().unwrap();
|
||||
let iter = self.content.iter(&txn).unwrap();
|
||||
for el in iter {
|
||||
let (key, value) = el.unwrap();
|
||||
let FacetGroupKey { field_id, level, left_bound: bound } = key;
|
||||
let bound = BoundCodec::bytes_decode(bound).unwrap();
|
||||
let FacetGroupValue { size, bitmap } = value;
|
||||
writeln!(
|
||||
f,
|
||||
"{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}",
|
||||
values = display_bitmap(&bitmap)
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
#[cfg(test)]
|
||||
mod comparison_bench {
|
||||
use std::iter::once;
|
||||
|
||||
use rand::Rng;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::test_helpers::FacetIndex;
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
|
||||
// This is a simple test to get an intuition on the relative speed
|
||||
// of the incremental vs. bulk indexer.
|
||||
//
|
||||
// The benchmark shows the worst-case scenario for the incremental indexer, since
|
||||
// each facet value contains only one document ID.
|
||||
//
|
||||
// In that scenario, it appears that the incremental indexer is about 50 times slower than the
|
||||
// bulk indexer.
|
||||
// #[test]
|
||||
fn benchmark_facet_indexing() {
|
||||
let mut facet_value = 0;
|
||||
|
||||
let mut r = rand::thread_rng();
|
||||
|
||||
for i in 1..=20 {
|
||||
let size = 50_000 * i;
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..size {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, facet_value as f64), once(i).collect()));
|
||||
facet_value += 1;
|
||||
}
|
||||
let timer = std::time::Instant::now();
|
||||
index.bulk_insert(&mut txn, &[0], elements.iter());
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!("bulk {size} : {time_spent}ms");
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
for nbr_doc in [1, 100, 1000, 10_000] {
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let timer = std::time::Instant::now();
|
||||
//
|
||||
// insert one document
|
||||
//
|
||||
for _ in 0..nbr_doc {
|
||||
index.insert(&mut txn, 0, &r.gen(), &once(1).collect());
|
||||
}
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!(" add {nbr_doc} : {time_spent}ms");
|
||||
txn.abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
b40dd31a65e033ffc6b35c027ce19506
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
7ee22d8e9387e72758f00918eb67e4c6
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
60f567359382507afdaf45fb075740c3
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
b986d6e6cbf425685f409a8b417010e1
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
ee10dd2ae2b5c6621a89a5d0a9aa8ccc
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
fa877559eef78b383b496c15a364a2dc
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
16a96353bc42f2ff3e91611ca4d5b184
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
be1b08073b9d9788d18080c1320151d7
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
16a96353bc42f2ff3e91611ca4d5b184
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
32a45d555df2e001420fea149818d376
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
353d70f52eea66e5031dca989ea8a037
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
52a093c909133d84023a4a7b83864808
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
9d86c72ddb241d0aeca2995d61a3648a
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
c0943177594534bfe5527cbf40fe388e
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
6ed86f234028ae3df5881bee5512f11e
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
5dbfa134cc44abeb3ab6242fc182e48e
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
6ed7bf5d440599b3b10b37549a271fdf
|
|
@ -0,0 +1,19 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[0, ]"
|
||||
0 0 k1 1 "[1, ]"
|
||||
0 0 k2 1 "[2, ]"
|
||||
0 0 k3 1 "[3, ]"
|
||||
0 0 k4 1 "[4, ]"
|
||||
0 0 k5 1 "[5, ]"
|
||||
0 0 k6 1 "[6, ]"
|
||||
0 0 k7 1 "[7, ]"
|
||||
0 0 k8 1 "[8, ]"
|
||||
0 0 k9 1 "[9, ]"
|
||||
0 0 k10 1 "[10, ]"
|
||||
0 0 k11 1 "[11, ]"
|
||||
0 0 k12 1 "[12, ]"
|
||||
0 0 k13 1 "[13, ]"
|
||||
0 0 k14 1 "[14, ]"
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b5203f0df0036ebaa133dd77d63a00eb
|
|
@ -0,0 +1,26 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[0, ]"
|
||||
0 0 k1 1 "[1, ]"
|
||||
0 0 k2 1 "[2, ]"
|
||||
0 0 k3 1 "[3, ]"
|
||||
0 0 k4 1 "[4, ]"
|
||||
0 0 k5 1 "[5, ]"
|
||||
0 0 k6 1 "[6, ]"
|
||||
0 0 k7 1 "[7, ]"
|
||||
0 0 k8 1 "[8, ]"
|
||||
0 0 k9 1 "[9, ]"
|
||||
0 0 k10 1 "[10, ]"
|
||||
0 0 k11 1 "[11, ]"
|
||||
0 0 k12 1 "[12, ]"
|
||||
0 0 k13 1 "[13, ]"
|
||||
0 0 k14 1 "[14, ]"
|
||||
0 0 k15 1 "[15, ]"
|
||||
0 0 k16 1 "[16, ]"
|
||||
0 1 k0 4 "[0, 1, 2, 3, ]"
|
||||
0 1 k4 4 "[4, 5, 6, 7, ]"
|
||||
0 1 k8 4 "[8, 9, 10, 11, ]"
|
||||
0 1 k12 4 "[12, 13, 14, 15, ]"
|
||||
0 1 k16 1 "[16, ]"
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
95497d8579740868ee0bfc655b0bf782
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
d565c2f7bbd9e13e12de40cfbbfba6bb
|
|
@ -0,0 +1,54 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k216 1 "[216, ]"
|
||||
0 0 k217 1 "[217, ]"
|
||||
0 0 k218 1 "[218, ]"
|
||||
0 0 k219 1 "[219, ]"
|
||||
0 0 k220 1 "[220, ]"
|
||||
0 0 k221 1 "[221, ]"
|
||||
0 0 k222 1 "[222, ]"
|
||||
0 0 k223 1 "[223, ]"
|
||||
0 0 k224 1 "[224, ]"
|
||||
0 0 k225 1 "[225, ]"
|
||||
0 0 k226 1 "[226, ]"
|
||||
0 0 k227 1 "[227, ]"
|
||||
0 0 k228 1 "[228, ]"
|
||||
0 0 k229 1 "[229, ]"
|
||||
0 0 k230 1 "[230, ]"
|
||||
0 0 k231 1 "[231, ]"
|
||||
0 0 k232 1 "[232, ]"
|
||||
0 0 k233 1 "[233, ]"
|
||||
0 0 k234 1 "[234, ]"
|
||||
0 0 k235 1 "[235, ]"
|
||||
0 0 k236 1 "[236, ]"
|
||||
0 0 k237 1 "[237, ]"
|
||||
0 0 k238 1 "[238, ]"
|
||||
0 0 k239 1 "[239, ]"
|
||||
0 0 k240 1 "[240, ]"
|
||||
0 0 k241 1 "[241, ]"
|
||||
0 0 k242 1 "[242, ]"
|
||||
0 0 k243 1 "[243, ]"
|
||||
0 0 k244 1 "[244, ]"
|
||||
0 0 k245 1 "[245, ]"
|
||||
0 0 k246 1 "[246, ]"
|
||||
0 0 k247 1 "[247, ]"
|
||||
0 0 k248 1 "[248, ]"
|
||||
0 0 k249 1 "[249, ]"
|
||||
0 0 k250 1 "[250, ]"
|
||||
0 0 k251 1 "[251, ]"
|
||||
0 0 k252 1 "[252, ]"
|
||||
0 0 k253 1 "[253, ]"
|
||||
0 0 k254 1 "[254, ]"
|
||||
0 0 k255 1 "[255, ]"
|
||||
0 1 k216 4 "[216, 217, 218, 219, ]"
|
||||
0 1 k220 4 "[220, 221, 222, 223, ]"
|
||||
0 1 k224 4 "[224, 225, 226, 227, ]"
|
||||
0 1 k228 4 "[228, 229, 230, 231, ]"
|
||||
0 1 k232 4 "[232, 233, 234, 235, ]"
|
||||
0 1 k236 4 "[236, 237, 238, 239, ]"
|
||||
0 1 k240 4 "[240, 241, 242, 243, ]"
|
||||
0 1 k244 4 "[244, 245, 246, 247, ]"
|
||||
0 1 k248 4 "[248, 249, 250, 251, ]"
|
||||
0 1 k252 4 "[252, 253, 254, 255, ]"
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7cb503827ba17e9670296cc9531a1380
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b061f43e379e16f0617c05d3313d0078
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
81fc9489d6b163935b97433477dea63b
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b17b2c4ec87a778aae07854c96c08b48
|
|
@ -0,0 +1,20 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[3, 435, 583, 849, ]"
|
||||
0 0 k1 1 "[35, 494, 693, 796, ]"
|
||||
0 0 k2 1 "[76, 420, 526, 909, ]"
|
||||
0 0 k3 1 "[133, 451, 653, 806, ]"
|
||||
0 0 k4 1 "[131, 464, 656, 853, ]"
|
||||
0 0 k5 1 "[61, 308, 701, 903, ]"
|
||||
0 0 k6 1 "[144, 449, 674, 794, ]"
|
||||
0 0 k7 1 "[182, 451, 735, 941, ]"
|
||||
0 0 k8 1 "[6, 359, 679, 1003, ]"
|
||||
0 0 k9 1 "[197, 418, 659, 904, ]"
|
||||
0 0 k10 1 "[88, 297, 567, 800, ]"
|
||||
0 0 k11 1 "[150, 309, 530, 946, ]"
|
||||
0 0 k12 1 "[156, 466, 567, 892, ]"
|
||||
0 0 k13 1 "[46, 425, 610, 807, ]"
|
||||
0 0 k14 1 "[236, 433, 549, 891, ]"
|
||||
0 0 k15 1 "[207, 472, 603, 974, ]"
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7f8aa18d2b3a6422d55c03bede0563db
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7f8aa18d2b3a6422d55c03bede0563db
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b3e2de9020d9e0f3941bc3a179c795ba
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
5dbfa134cc44abeb3ab6242fc182e48e
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
9343355bf535ed4a0c956df2b229d5e6
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
4fc800f49201a336295af0542fdf01ab
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
fd65ce7d96a07aafb0ef6cfb5bf016b8
|
263
crates/milli/src/update/index_documents/enrich.rs
Normal file
263
crates/milli/src/update/index_documents/enrich.rs
Normal file
|
@ -0,0 +1,263 @@
|
|||
use std::fmt;
|
||||
use std::io::{BufWriter, Read, Seek};
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::documents::{
|
||||
DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
|
||||
EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
|
||||
};
|
||||
use crate::error::{GeoError, InternalError, UserError};
|
||||
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
/// This function validates and enrich the documents by checking that:
|
||||
/// - we can infer a primary key,
|
||||
/// - all the documents id exist and are extracted,
|
||||
/// - the validity of them but also,
|
||||
/// - the validity of the `_geo` field depending on the settings.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if reader.is_empty(), this function may panic in some cases
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
|
||||
pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
rtxn: &heed::RoTxn<'_>,
|
||||
index: &Index,
|
||||
autogenerate_docids: bool,
|
||||
reader: DocumentsBatchReader<R>,
|
||||
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
|
||||
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
|
||||
let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
|
||||
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
||||
|
||||
// The primary key *field id* that has already been set for this index or the one
|
||||
// we will guess by searching for the first key that contains "id" as a substring.
|
||||
let primary_key = match index.primary_key(rtxn)? {
|
||||
Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
|
||||
Some(primary_key) => primary_key,
|
||||
None if autogenerate_docids => PrimaryKey::Flat {
|
||||
name: primary_key,
|
||||
field_id: documents_batch_index.insert(primary_key),
|
||||
},
|
||||
None => {
|
||||
return match cursor.next_document()? {
|
||||
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.to_string(),
|
||||
document: obkv_to_object(&first_document, &documents_batch_index)?,
|
||||
})),
|
||||
None => unreachable!("Called with reader.is_empty()"),
|
||||
};
|
||||
}
|
||||
},
|
||||
None => {
|
||||
let mut guesses: Vec<(u16, &str)> = documents_batch_index
|
||||
.iter()
|
||||
.filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
|
||||
.map(|(field_id, name)| (*field_id, name.as_str()))
|
||||
.collect();
|
||||
|
||||
// sort the keys in a deterministic, obvious way, so that fields are always in the same order.
|
||||
guesses.sort_by(|(_, left_name), (_, right_name)| {
|
||||
// shortest name first
|
||||
left_name.len().cmp(&right_name.len()).then_with(
|
||||
// then alphabetical order
|
||||
|| left_name.cmp(right_name),
|
||||
)
|
||||
});
|
||||
|
||||
match guesses.as_slice() {
|
||||
[] if autogenerate_docids => PrimaryKey::Flat {
|
||||
name: DEFAULT_PRIMARY_KEY,
|
||||
field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
||||
},
|
||||
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||
[(field_id, name)] => {
|
||||
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
|
||||
PrimaryKey::Flat { name, field_id: *field_id }
|
||||
}
|
||||
multiple => {
|
||||
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||
candidates: multiple
|
||||
.iter()
|
||||
.map(|(_, candidate)| candidate.to_string())
|
||||
.collect(),
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// If the settings specifies that a _geo field must be used therefore we must check the
|
||||
// validity of it in all the documents of this batch and this is when we return `Some`.
|
||||
let geo_field_id = match documents_batch_index.id("_geo") {
|
||||
Some(geo_field_id)
|
||||
if index.sortable_fields(rtxn)?.contains("_geo")
|
||||
|| index.filterable_fields(rtxn)?.contains("_geo") =>
|
||||
{
|
||||
Some(geo_field_id)
|
||||
}
|
||||
_otherwise => None,
|
||||
};
|
||||
|
||||
let mut count = 0;
|
||||
while let Some(document) = cursor.next_document()? {
|
||||
let document_id = match fetch_or_generate_document_id(
|
||||
&document,
|
||||
&documents_batch_index,
|
||||
primary_key,
|
||||
autogenerate_docids,
|
||||
&mut uuid_buffer,
|
||||
count,
|
||||
)? {
|
||||
Ok(document_id) => document_id,
|
||||
Err(user_error) => return Ok(Err(user_error)),
|
||||
};
|
||||
|
||||
if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
|
||||
if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? {
|
||||
return Ok(Err(UserError::from(user_error)));
|
||||
}
|
||||
}
|
||||
|
||||
let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?;
|
||||
external_ids.insert(count.to_be_bytes(), document_id)?;
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
let external_ids = writer_into_reader(external_ids)?;
|
||||
let primary_key_name = primary_key.name().to_string();
|
||||
let reader = EnrichedDocumentsBatchReader::new(
|
||||
DocumentsBatchReader::new(cursor, documents_batch_index),
|
||||
primary_key_name,
|
||||
external_ids,
|
||||
)?;
|
||||
|
||||
Ok(Ok(reader))
|
||||
}
|
||||
|
||||
/// Retrieve the document id after validating it, returning a `UserError`
|
||||
/// if the id is invalid or can't be guessed.
|
||||
#[tracing::instrument(level = "trace", skip(uuid_buffer, documents_batch_index, document)
|
||||
target = "indexing::documents")]
|
||||
fn fetch_or_generate_document_id(
|
||||
document: &obkv::KvReader<'_, FieldId>,
|
||||
documents_batch_index: &DocumentsBatchIndex,
|
||||
primary_key: PrimaryKey<'_>,
|
||||
autogenerate_docids: bool,
|
||||
uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
|
||||
count: u32,
|
||||
) -> Result<StdResult<DocumentId, UserError>> {
|
||||
Ok(match primary_key.document_id(document, documents_batch_index)? {
|
||||
Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
|
||||
Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
|
||||
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
||||
Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
|
||||
}
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
}),
|
||||
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
||||
Err(UserError::TooManyDocumentIds {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// A type that represents a document id that has been retrieved from a document or auto-generated.
|
||||
///
|
||||
/// In case the document id has been auto-generated, the document nth is kept to help
|
||||
/// users debug if there is an issue with the document itself.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum DocumentId {
|
||||
Retrieved { value: String },
|
||||
Generated { value: String, document_nth: u32 },
|
||||
}
|
||||
|
||||
impl DocumentId {
|
||||
fn debug(&self) -> String {
|
||||
format!("{:?}", self)
|
||||
}
|
||||
|
||||
pub fn is_generated(&self) -> bool {
|
||||
matches!(self, DocumentId::Generated { .. })
|
||||
}
|
||||
|
||||
pub fn value(&self) -> &str {
|
||||
match self {
|
||||
DocumentId::Retrieved { value } => value,
|
||||
DocumentId::Generated { value, .. } => value,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for DocumentId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
DocumentId::Retrieved { value } => write!(f, "{:?}", value),
|
||||
DocumentId::Generated { value, document_nth } => {
|
||||
write!(f, "{{{:?}}} of the {}nth document", value, document_nth)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to extract an `f64` from a JSON `Value` and return the `Value`
|
||||
/// in the `Err` variant if it failed.
|
||||
pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
|
||||
let number = match value {
|
||||
Value::Number(ref n) => match n.as_f64() {
|
||||
Some(number) => number,
|
||||
None => return Err(value),
|
||||
},
|
||||
Value::String(ref s) => match s.parse::<f64>() {
|
||||
Ok(number) => number,
|
||||
Err(_) => return Err(value),
|
||||
},
|
||||
value => return Err(value),
|
||||
};
|
||||
|
||||
if number.is_finite() {
|
||||
Ok(number)
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
|
||||
use GeoError::*;
|
||||
let debug_id = || {
|
||||
serde_json::from_slice(id.value().as_bytes()).unwrap_or_else(|_| Value::from(id.debug()))
|
||||
};
|
||||
match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
|
||||
(Some(lat), Some(lng)) => {
|
||||
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
|
||||
(Ok(_), Ok(_)) if !object.is_empty() => Ok(Err(UnexpectedExtraFields {
|
||||
document_id: debug_id(),
|
||||
value: object.into(),
|
||||
})),
|
||||
(Ok(_), Ok(_)) => Ok(Ok(())),
|
||||
(Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })),
|
||||
(Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })),
|
||||
(Err(lat), Err(lng)) => {
|
||||
Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng }))
|
||||
}
|
||||
}
|
||||
}
|
||||
(None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })),
|
||||
(Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })),
|
||||
(None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })),
|
||||
},
|
||||
Value::Null => Ok(Ok(())),
|
||||
value => Ok(Err(NotAnObject { document_id: debug_id(), value })),
|
||||
}
|
||||
}
|
|
@ -0,0 +1,318 @@
|
|||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use obkv::{KvReader, KvWriterU16};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
/// Extracts the word and positions where this word appear and
|
||||
/// prefixes it by the document id.
|
||||
///
|
||||
/// Returns the generated internal documents ids and a grenad reader
|
||||
/// with the list of extracted words from the given chunk of documents.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let force_reindexing = settings_diff.reindex_searchable();
|
||||
|
||||
// initialize destination values.
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_latest_obkv,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
// initialize buffers.
|
||||
let mut del_buffers = Buffers::default();
|
||||
let mut add_buffers = Buffers::default();
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
||||
let old_separators: Option<Vec<_>> = settings_diff
|
||||
.old
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let old_dictionary: Option<Vec<_>> =
|
||||
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let del_builder =
|
||||
tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref());
|
||||
let del_tokenizer = del_builder.into_tokenizer();
|
||||
|
||||
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
||||
let new_separators: Option<Vec<_>> = settings_diff
|
||||
.new
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let new_dictionary: Option<Vec<_>> =
|
||||
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let add_builder =
|
||||
tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref());
|
||||
let add_tokenizer = add_builder.into_tokenizer();
|
||||
|
||||
// iterate over documents.
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let document_id = key
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
|
||||
continue;
|
||||
}
|
||||
|
||||
documents_ids.push(document_id);
|
||||
|
||||
// Update key buffer prefix.
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
||||
// Tokenize deletions and additions in 2 diffferent threads.
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
// deletions
|
||||
tokens_from_document(
|
||||
&obkv,
|
||||
&settings_diff.old,
|
||||
&del_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Deletion,
|
||||
&mut del_buffers,
|
||||
)
|
||||
},
|
||||
|| {
|
||||
// additions
|
||||
tokens_from_document(
|
||||
&obkv,
|
||||
&settings_diff.new,
|
||||
&add_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Addition,
|
||||
&mut add_buffers,
|
||||
)
|
||||
},
|
||||
);
|
||||
|
||||
let del_obkv = del?;
|
||||
let add_obkv = add?;
|
||||
|
||||
// merge deletions and additions.
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
&KvReader::<FieldId>::new(del_obkv),
|
||||
&KvReader::<FieldId>::new(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
|
||||
let obkv = KvReader::<FieldId>::new(&value_buffer);
|
||||
for (field_id, value) in obkv.iter() {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||
}
|
||||
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<'_, FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.contains(&field_id) {
|
||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
(None, None) => (),
|
||||
// if both contains a value and values are the same, check the next field.
|
||||
(Some(del), Some(add)) if del == add => (),
|
||||
// otherwise the fields are different, return true.
|
||||
_otherwise => return true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
) -> TokenizerBuilder<'a, Vec<u8>> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &KvReader<'a, FieldId>,
|
||||
settings: &InnerIndexSettings,
|
||||
tokenizer: &Tokenizer<'_>,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
) -> Result<&'a [u8]> {
|
||||
buffers.obkv_buffer.clear();
|
||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
// if field is searchable.
|
||||
if settings.searchable_fields_ids.contains(&field_id) {
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
// prepare writing destination.
|
||||
buffers.obkv_positions_buffer.clear();
|
||||
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
|
||||
|
||||
// convert json into a unique string.
|
||||
buffers.field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||
// create an iterator of token with their positions.
|
||||
let locales = settings.localized_searchable_fields_ids.locales(field_id);
|
||||
let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
|
||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
// keep a word only if it is not empty and fit in a LMDB key.
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
let position: u16 = index
|
||||
.try_into()
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
writer.insert(position, token.as_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
// write positions into document.
|
||||
let positions = writer.into_inner()?;
|
||||
document_writer.insert(field_id, positions)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// returns a KV<FieldId, KV<u16, String>>
|
||||
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
|
||||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null | Value::Object(_) => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
Value::Array(array) => {
|
||||
let mut count = 0;
|
||||
for value in array {
|
||||
if inner(value, output) {
|
||||
output.push_str(". ");
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Value::String(string) = value {
|
||||
Some(string)
|
||||
} else if inner(value, buffer) {
|
||||
Some(buffer)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
||||
/// else we keep the standard proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator())
|
||||
.scan((0, None), |(offset, prev_kind), mut token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => token.kind = TokenKind::Unknown,
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
/// Scratch buffers reused from one document to the next while tokenizing.
#[derive(Default)]
struct Buffers {
    /// Receives the text of a field once deserialized; must be cleared between each field.
    field_buffer: String,
    /// Stores the value data containing an obkv.
    obkv_buffer: Vec<u8>,
    /// Stores the value data containing an obkv of tokens with their positions.
    obkv_positions_buffer: Vec<u8>,
}
|
|
@ -0,0 +1,57 @@
|
|||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
||||
};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||
};
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the facet number and the documents ids where this facet number appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet numbers and
|
||||
/// documents ids from the given chunk of docid facet number positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
fid_docid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
|
||||
let (field_id, document_id, number) =
|
||||
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
||||
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_number_docids_sorter, indexer)
|
||||
}
|
|
@ -0,0 +1,299 @@
|
|||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::iter::FromIterator;
|
||||
use std::{io, str};
|
||||
|
||||
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||
use charabia::{Language, StrDetection, Token};
|
||||
use heed::types::SerdeJson;
|
||||
use heed::BytesEncode;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
if settings_diff.settings_update_only() {
|
||||
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
|
||||
} else {
|
||||
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
|
||||
extract_facet_string_docids_document_update(
|
||||
docid_fid_facet_string,
|
||||
indexer,
|
||||
localized_field_ids,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
localized_field_ids: &LocalizedFieldIds,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_btreeset_string,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let locales = localized_field_ids.locales(field_id);
|
||||
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// as the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_btreeset_string,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
|
||||
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
|
||||
|
||||
let are_same_locales = old_locales == new_locales;
|
||||
|
||||
if is_same_value && are_same_locales {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
|
||||
let new_hyper_normalized_value = if are_same_locales {
|
||||
&old_hyper_normalized_value
|
||||
} else {
|
||||
&normalize_facet_string(normalized_value, new_locales)
|
||||
};
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// if the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
} else {
|
||||
// if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different.
|
||||
// deletion
|
||||
if deladd_reader.get(DelAdd::Deletion).is_some() {
|
||||
// insert old value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, val)?;
|
||||
obkv.finish()?;
|
||||
let key: (u16, &str) = (field_id, old_hyper_normalized_value.as_ref());
|
||||
let key_bytes =
|
||||
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
// addition
|
||||
if deladd_reader.get(DelAdd::Addition).is_some() {
|
||||
// insert new value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, val)?;
|
||||
obkv.finish()?;
|
||||
let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
|
||||
let key_bytes =
|
||||
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Normalizes the facet string and truncates it to the max length.
|
||||
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
|
||||
let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
|
||||
let mut detection = StrDetection::new(facet_string, locales);
|
||||
|
||||
let script = detection.script();
|
||||
// Detect the language of the facet string only if several locales are explicitly provided.
|
||||
let language = match locales {
|
||||
Some(&[language]) => Some(language),
|
||||
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let token = Token {
|
||||
lemma: std::borrow::Cow::Borrowed(facet_string),
|
||||
script,
|
||||
language,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// truncate the facet string to the max length
|
||||
token
|
||||
.normalize(&options)
|
||||
.lemma
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect()
|
||||
}
|
|
@ -0,0 +1,580 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::mem::size_of;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use bytemuck::bytes_of;
|
||||
use grenad::Sorter;
|
||||
use heed::BytesEncode;
|
||||
use itertools::{merge_join_by, EitherOrBoth, Itertools};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{from_slice, Value};
|
||||
use FilterableValues::{Empty, Null, Values};
|
||||
|
||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||
|
||||
/// The extracted facet values stored in grenad files by type.
|
||||
pub struct ExtractedFacetValues {
|
||||
pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid the fid and the original value as key
|
||||
/// and the normalized value as value extracted from the given chunk of documents.
|
||||
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut fid_docid_facet_strings_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
// The tuples represents the Del and Add side for a bitmap
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
|
||||
// We create two buffers for mutable ref issues with closures.
|
||||
let mut numbers_key_buffer = Vec::new();
|
||||
let mut strings_key_buffer = Vec::new();
|
||||
|
||||
let old_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.old.faceted_fields_ids.iter().copied().collect();
|
||||
let new_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.new.faceted_fields_ids.iter().copied().collect();
|
||||
|
||||
if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
let get_document_json_value = move |field_id, side| {
|
||||
obkv.get(field_id)
|
||||
.map(KvReaderDelAdd::new)
|
||||
.and_then(|kv| kv.get(side))
|
||||
.map(from_slice)
|
||||
.transpose()
|
||||
.map_err(InternalError::SerdeJson)
|
||||
};
|
||||
// iterate over the faceted fields instead of over the whole document.
|
||||
for eob in
|
||||
merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| {
|
||||
old.cmp(new)
|
||||
})
|
||||
{
|
||||
let (field_id, del_value, add_value) = match eob {
|
||||
EitherOrBoth::Left(&field_id) => {
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
|
||||
// deletion only
|
||||
(field_id, del_value, None)
|
||||
}
|
||||
EitherOrBoth::Right(&field_id) => {
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
// addition only
|
||||
(field_id, None, add_value)
|
||||
}
|
||||
EitherOrBoth::Both(&field_id, _) => {
|
||||
// during settings update, recompute the changing settings only.
|
||||
if settings_diff.settings_update_only {
|
||||
continue;
|
||||
}
|
||||
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
(field_id, del_value, add_value)
|
||||
}
|
||||
};
|
||||
|
||||
if del_value.is_some() || add_value.is_some() {
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
|
||||
// Set key to the field_id
|
||||
// Note: this encoding is consistent with FieldIdCodec
|
||||
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||
let document = DocumentId::from_be_bytes(document);
|
||||
|
||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
// We insert the document id on the Del and the Add side if the field exists.
|
||||
let (ref mut del_exists, ref mut add_exists) =
|
||||
facet_exists_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_null, ref mut add_is_null) =
|
||||
facet_is_null_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||
facet_is_empty_docids.entry(field_id).or_default();
|
||||
|
||||
if del_value.is_some() {
|
||||
del_exists.insert(document);
|
||||
}
|
||||
if add_value.is_some() {
|
||||
add_exists.insert(document);
|
||||
}
|
||||
|
||||
let del_geo_support = settings_diff
|
||||
.old
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let add_geo_support = settings_diff
|
||||
.new
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let del_filterable_values =
|
||||
del_value.map(|value| extract_facet_values(&value, del_geo_support));
|
||||
let add_filterable_values =
|
||||
add_value.map(|value| extract_facet_values(&value, add_geo_support));
|
||||
|
||||
// Those closures are just here to simplify things a bit.
|
||||
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||
insert_numbers_diff(
|
||||
&mut fid_docid_facet_numbers_sorter,
|
||||
&mut numbers_key_buffer,
|
||||
del_numbers,
|
||||
add_numbers,
|
||||
)
|
||||
};
|
||||
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||
insert_strings_diff(
|
||||
&mut fid_docid_facet_strings_sorter,
|
||||
&mut strings_key_buffer,
|
||||
del_strings,
|
||||
add_strings,
|
||||
)
|
||||
};
|
||||
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(None, None) => (),
|
||||
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||
Null => {
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
},
|
||||
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||
Null => {
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
}
|
||||
},
|
||||
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(Null, Null) | (Empty, Empty) => (),
|
||||
(Null, Empty) => {
|
||||
del_is_null.insert(document);
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
(Empty, Null) => {
|
||||
del_is_empty.insert(document);
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
(Null, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
(Empty, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
(Values { numbers, strings }, Null) => {
|
||||
add_is_null.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(Values { numbers, strings }, Empty) => {
|
||||
add_is_empty.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(
|
||||
Values { numbers: del_numbers, strings: del_strings },
|
||||
Values { numbers: add_numbers, strings: add_strings },
|
||||
) => {
|
||||
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||
insert_strings_diff(del_strings, add_strings)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut facet_exists_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||
|
||||
let mut facet_is_null_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||
|
||||
let mut facet_is_empty_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||
|
||||
Ok(ExtractedFacetValues {
|
||||
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||
})
|
||||
}
|
||||
|
||||
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
|
||||
fn deladd_obkv_cbo_roaring_bitmaps(
|
||||
buffer: &mut Vec<u8>,
|
||||
del_bitmap: &RoaringBitmap,
|
||||
add_bitmap: &RoaringBitmap,
|
||||
) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(buffer);
|
||||
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
|
||||
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
|
||||
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
|
||||
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
|
||||
obkv.finish()
|
||||
}
|
||||
|
||||
/// Truncates a string to the biggest valid LMDB key size.
|
||||
fn truncate_str(s: &str) -> &str {
|
||||
let index = s
|
||||
.char_indices()
|
||||
.map(|(idx, _)| idx)
|
||||
.chain(std::iter::once(s.len()))
|
||||
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
|
||||
.last();
|
||||
|
||||
&s[..index.unwrap_or(0)]
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_numbers_diff<MF>(
|
||||
fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_numbers: Vec<f64>,
|
||||
mut add_numbers: Vec<f64>,
|
||||
) -> Result<()>
|
||||
where
|
||||
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||
{
|
||||
// We sort and dedup the float numbers
|
||||
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||
del_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||
add_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||
|
||||
let merged_numbers_iter = itertools::merge_join_by(
|
||||
del_numbers.into_iter().map(OrderedFloat),
|
||||
add_numbers.into_iter().map(OrderedFloat),
|
||||
|del, add| del.cmp(add),
|
||||
);
|
||||
|
||||
// insert facet numbers in sorter
|
||||
for eob in merged_numbers_iter {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
match eob {
|
||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||
EitherOrBoth::Left(OrderedFloat(number)) => {
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those numbers.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, bytes_of(&()))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Right(OrderedFloat(number)) => {
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to remove all those numbers.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, bytes_of(&()))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add strings and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_strings_diff<MF>(
|
||||
fid_docid_facet_strings_sorter: &mut Sorter<MF>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_strings: Vec<(String, String)>,
|
||||
mut add_strings: Vec<(String, String)>,
|
||||
) -> Result<()>
|
||||
where
|
||||
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||
{
|
||||
// We sort and dedup the normalized and original strings
|
||||
del_strings.sort_unstable();
|
||||
add_strings.sort_unstable();
|
||||
del_strings.dedup();
|
||||
add_strings.dedup();
|
||||
|
||||
let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||
let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||
|
||||
let merged_strings_iter = itertools::merge_join_by(
|
||||
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
|(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add),
|
||||
);
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for eob in merged_strings_iter {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let (side, normalized, original) = match eob {
|
||||
EitherOrBoth::Both((normalized, del), (_, add)) => {
|
||||
let merged_strings_iter =
|
||||
itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| {
|
||||
original_del.cmp(original_add)
|
||||
});
|
||||
|
||||
// FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side,
|
||||
// but we possibly have multiple original values that changed in the case where the field is an
|
||||
// array of multiple values that normalize to the same value.
|
||||
// (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯)
|
||||
//
|
||||
// We'll work best effort by ignoring when the same value appears in both sides, deleting the first
|
||||
// value that is only in the old version, and adding the first value that is only in the new version
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
let mut del = None;
|
||||
let mut add = None;
|
||||
let mut both = None;
|
||||
|
||||
for eob in merged_strings_iter {
|
||||
match eob {
|
||||
EitherOrBoth::Both((_normalized, original), _) => {
|
||||
both = match both {
|
||||
Some(both) => Some(both),
|
||||
None => Some(original),
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Left((_normalized, original)) => {
|
||||
del = match del {
|
||||
Some(del) => Some(del),
|
||||
None => Some(original),
|
||||
};
|
||||
}
|
||||
EitherOrBoth::Right((_normalized, original)) => {
|
||||
add = match add {
|
||||
Some(add) => Some(add),
|
||||
None => Some(original),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(del) = del {
|
||||
obkv.insert(DelAdd::Deletion, del)?;
|
||||
}
|
||||
if let Some(add) = add
|
||||
// prefer the newly added, but if there is none, keep a value in the list of values
|
||||
// since the normalized value appears both in old and new, we should never remove it.
|
||||
.or(both)
|
||||
{
|
||||
obkv.insert(DelAdd::Addition, add)?;
|
||||
}
|
||||
|
||||
let truncated = truncate_str(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
continue;
|
||||
}
|
||||
EitherOrBoth::Left((_normalized, mut original)) => {
|
||||
// FIXME: we only consider the first value for the purpose of facet search
|
||||
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||
let Some((normalized, original)) = original.next() else {
|
||||
continue;
|
||||
};
|
||||
(DelAdd::Deletion, normalized, original)
|
||||
}
|
||||
EitherOrBoth::Right((_normalized, mut original)) => {
|
||||
// FIXME: we only consider the first value for the purpose of facet search
|
||||
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||
let Some((normalized, original)) = original.next() else {
|
||||
continue;
|
||||
};
|
||||
(DelAdd::Addition, normalized, original)
|
||||
}
|
||||
};
|
||||
let truncated = truncate_str(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(side, original)?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// What a document field contains, from the point of view of facet extraction.
enum FilterableValues {
    /// The field is the JSON `null` value.
    Null,
    /// The field is an empty string `""`, an empty array `[]`, or an empty object `{}`.
    Empty,
    /// Every number and every `(normalized, original)` string found in the field.
    Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
|
||||
|
||||
/// Extracts the facet values of a JSON field.
|
||||
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
output_numbers: &mut Vec<f64>,
|
||||
output_strings: &mut Vec<(String, String)>,
|
||||
geo_field: bool,
|
||||
) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(original) => {
|
||||
// if we're working on a geofield it MUST be something we can parse or else there was an internal error
|
||||
// in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
|
||||
if geo_field {
|
||||
if let Ok(float) = original.parse() {
|
||||
output_numbers.push(float);
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"Internal error, could not parse a geofield that has been validated. Please open an issue."
|
||||
)
|
||||
}
|
||||
}
|
||||
let normalized = crate::normalize_facet(original);
|
||||
output_strings.push((normalized, original.clone()));
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(
|
||||
value,
|
||||
false,
|
||||
output_numbers,
|
||||
output_strings,
|
||||
geo_field,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Value::Object(_) => (),
|
||||
}
|
||||
}
|
||||
|
||||
match value {
|
||||
Value::Null => FilterableValues::Null,
|
||||
Value::String(s) if s.is_empty() => FilterableValues::Empty,
|
||||
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
|
||||
Value::Object(o) if o.is_empty() => FilterableValues::Empty,
|
||||
otherwise => {
|
||||
let mut numbers = Vec::new();
|
||||
let mut strings = Vec::new();
|
||||
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
|
||||
FilterableValues::Values { numbers, strings }
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||
GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
/// Extracts the field id word count and the documents ids where
|
||||
/// this field id with this amount of words appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted field id word counts
|
||||
/// and documents ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
let deletion = del_add_reader
|
||||
// get deleted words
|
||||
.get(DelAdd::Deletion)
|
||||
// count deleted words
|
||||
.map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
let addition = del_add_reader
|
||||
// get added words
|
||||
.get(DelAdd::Addition)
|
||||
// count added words
|
||||
.map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
|
||||
if deletion != addition {
|
||||
// Insert deleted word count in sorter if exist.
|
||||
if let Some(word_count) = deletion {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
// Insert added word count in sorter if exist.
|
||||
if let Some(word_count) = addition {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
||||
}
|
|
@ -0,0 +1,103 @@
|
|||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use concat_arrays::concat_arrays;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::GeoError;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::extract_finite_float_from_value;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, InternalError, Result};
|
||||
|
||||
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
primary_key_id: FieldId,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
// since we only need the primary key when we throw an error
|
||||
// we create this getter to lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let reader = KvReaderDelAdd::new(obkv.get(primary_key_id).unwrap());
|
||||
let document_id =
|
||||
reader.get(DelAdd::Deletion).or(reader.get(DelAdd::Addition)).unwrap();
|
||||
serde_json::from_slice(document_id).unwrap()
|
||||
};
|
||||
|
||||
// extract old version
|
||||
let del_lat_lng =
|
||||
extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
|
||||
// extract new version
|
||||
let add_lat_lng =
|
||||
extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
|
||||
|
||||
if del_lat_lng != add_lat_lng {
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
if let Some([lat, lng]) = del_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Deletion, bytes)?;
|
||||
}
|
||||
if let Some([lat, lng]) = add_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Addition, bytes)?;
|
||||
}
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
/// Extract the finite floats lat and lng from two bytes slices.
|
||||
fn extract_lat_lng(
|
||||
document: &obkv::KvReader<'_, FieldId>,
|
||||
settings: &InnerIndexSettings,
|
||||
deladd: DelAdd,
|
||||
document_id: impl Fn() -> Value,
|
||||
) -> Result<Option<[f64; 2]>> {
|
||||
match settings.geo_fields_ids {
|
||||
Some((lat_fid, lng_fid)) => {
|
||||
let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
|
||||
let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
|
||||
let (lat, lng) = match (lat, lng) {
|
||||
(Some(lat), Some(lng)) => (lat, lng),
|
||||
(Some(_), None) => {
|
||||
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
|
||||
}
|
||||
(None, Some(_)) => {
|
||||
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
|
||||
}
|
||||
(None, None) => return Ok(None),
|
||||
};
|
||||
let lat = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
|
||||
|
||||
let lng = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
||||
Ok(Some([lat, lng]))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
|
@ -0,0 +1,838 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::mem::size_of;
|
||||
use std::str::from_utf8;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::FaultSource;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::vector::settings::ReindexAction;
|
||||
use crate::vector::{Embedder, Embeddings};
|
||||
use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
|
||||
pub struct ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
pub manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> ()
|
||||
pub remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> prompt
|
||||
pub prompts: grenad::Reader<BufReader<File>>,
|
||||
|
||||
// embedder
|
||||
pub embedder_name: String,
|
||||
pub embedder: Arc<Embedder>,
|
||||
pub add_to_user_provided: RoaringBitmap,
|
||||
pub remove_from_user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
/// What must happen to the vectors of a document for a given embedder.
enum VectorStateDelta {
    /// Nothing to do.
    NoChange,
    /// Remove all vectors, generated or manual, from this document.
    NowRemoved,

    /// Replace the vectors by the ones carried by this variant.
    NowManual(Vec<Vec<f32>>),

    /// Add the vector computed from the specified prompt and remove any
    /// previous vector.
    ///
    /// Note: changing the value of the prompt **does require** recording this delta.
    NowGenerated(String),
}

impl VectorStateDelta {
    /// Flattens the delta into `(must_remove, prompt, vectors_to_add)`.
    ///
    /// Every variant except `NoChange` asks for the removal of the previous
    /// vectors; the prompt and the vectors are empty unless the variant
    /// carries them.
    fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
        let (must_remove, prompt, add_vectors) = match self {
            Self::NoChange => (false, String::new(), Vec::new()),
            Self::NowRemoved => (true, String::new(), Vec::new()),
            // The previous vectors are always deleted.
            Self::NowManual(add_vectors) => (true, String::new(), add_vectors),
            Self::NowGenerated(prompt) => (true, prompt, Vec::new()),
        };
        (must_remove, prompt, add_vectors)
    }
}
|
||||
|
||||
struct EmbedderVectorExtractor {
|
||||
embedder_name: String,
|
||||
embedder: Arc<Embedder>,
|
||||
prompt: Arc<Prompt>,
|
||||
|
||||
// (docid) -> (prompt)
|
||||
prompts_writer: Writer<BufWriter<File>>,
|
||||
// (docid) -> ()
|
||||
remove_vectors_writer: Writer<BufWriter<File>>,
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
manual_vectors_writer: Writer<BufWriter<File>>,
|
||||
// The docids of the documents that contains a user defined embedding
|
||||
add_to_user_provided: RoaringBitmap,
|
||||
|
||||
action: ExtractionAction,
|
||||
}
|
||||
|
||||
struct DocumentOperation {
|
||||
// The docids of the documents that contains an auto-generated embedding
|
||||
remove_from_user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
enum ExtractionAction {
|
||||
SettingsFullReindex,
|
||||
SettingsRegeneratePrompts { old_prompt: Arc<Prompt> },
|
||||
DocumentOperation(DocumentOperation),
|
||||
}
|
||||
|
||||
/// Accumulates the documents lacking vectors for a user-provided embedder.
struct ManualEmbedderErrors {
    /// Embedder for which the first error was recorded.
    embedder_name: String,
    /// String form of the first document id that triggered the error.
    docid: String,
    /// Number of further documents that failed for the same embedder.
    other_docids: usize,
}
|
||||
|
||||
impl ManualEmbedderErrors {
|
||||
pub fn push_error(
|
||||
errors: &mut Option<ManualEmbedderErrors>,
|
||||
embedder_name: &str,
|
||||
document_id: impl Fn() -> Value,
|
||||
) {
|
||||
match errors {
|
||||
Some(errors) => {
|
||||
if errors.embedder_name == embedder_name {
|
||||
errors.other_docids = errors.other_docids.saturating_add(1)
|
||||
}
|
||||
}
|
||||
None => {
|
||||
*errors = Some(Self {
|
||||
embedder_name: embedder_name.to_owned(),
|
||||
docid: document_id().to_string(),
|
||||
other_docids: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_result(
|
||||
errors: Option<ManualEmbedderErrors>,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
) -> Result<()> {
|
||||
match errors {
|
||||
Some(errors) => {
|
||||
let embedder_name = &errors.embedder_name;
|
||||
let mut msg = format!(
|
||||
r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document {}{}",
|
||||
errors.docid,
|
||||
if errors.other_docids != 0 {
|
||||
format!(" and at least {} other document(s)", errors.other_docids)
|
||||
} else {
|
||||
"".to_string()
|
||||
}
|
||||
);
|
||||
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in
|
||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
|
||||
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||
}
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedders_configs: &[IndexEmbeddingConfig],
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
) -> Result<(Vec<ExtractedVectorPoints>, UnusedVectorsDistribution)> {
|
||||
let mut unused_vectors_distribution = UnusedVectorsDistribution::new();
|
||||
let mut manual_errors = None;
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let old_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
|
||||
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
let new_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
|
||||
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut extractors = Vec::new();
|
||||
|
||||
let mut configs = settings_diff.new.embedding_configs.clone().into_inner();
|
||||
let old_configs = &settings_diff.old.embedding_configs;
|
||||
|
||||
if reindex_vectors {
|
||||
for (name, action) in settings_diff.embedding_config_updates.iter() {
|
||||
if let Some(action) = action.reindex() {
|
||||
let Some((embedder_name, (embedder, prompt, _quantized))) =
|
||||
configs.remove_entry(name)
|
||||
else {
|
||||
tracing::error!(embedder = name, "Requested embedder config not found");
|
||||
continue;
|
||||
};
|
||||
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let action = match action {
|
||||
ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex,
|
||||
ReindexAction::RegeneratePrompts => {
|
||||
let Some((_, old_prompt, _quantized)) = old_configs.get(name) else {
|
||||
tracing::error!(embedder = name, "Old embedder config not found");
|
||||
continue;
|
||||
};
|
||||
|
||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt }
|
||||
}
|
||||
};
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided: RoaringBitmap::new(),
|
||||
action,
|
||||
});
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// document operation
|
||||
|
||||
for (embedder_name, (embedder, prompt, _quantized)) in configs.into_iter() {
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided: RoaringBitmap::new(),
|
||||
action: ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided: RoaringBitmap::new(),
|
||||
}),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// this must always be serialized as (docid, external_docid);
|
||||
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
|
||||
let (docid_bytes, external_id_bytes) =
|
||||
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
|
||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||
let docid = DocumentId::from_be_bytes(docid_bytes);
|
||||
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes.as_slice());
|
||||
|
||||
// since we only need the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
let mut parsed_vectors = ParsedVectorsDiff::new(
|
||||
docid,
|
||||
embedders_configs,
|
||||
obkv,
|
||||
old_vectors_fid,
|
||||
new_vectors_fid,
|
||||
)
|
||||
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided,
|
||||
action,
|
||||
} in extractors.iter_mut()
|
||||
{
|
||||
let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_));
|
||||
|
||||
let (old, new) = parsed_vectors.remove(embedder_name);
|
||||
let delta = match action {
|
||||
ExtractionAction::SettingsFullReindex => match old {
|
||||
// A full reindex can be triggered either by:
|
||||
// 1. a new embedder
|
||||
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
|
||||
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
|
||||
VectorState::Inline(vectors) => {
|
||||
if !vectors.must_regenerate() {
|
||||
add_to_user_provided.insert(docid);
|
||||
}
|
||||
|
||||
match vectors.into_array_of_vectors() {
|
||||
Some(add_vectors) => {
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(
|
||||
crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
),
|
||||
));
|
||||
}
|
||||
VectorStateDelta::NowManual(add_vectors)
|
||||
}
|
||||
None => VectorStateDelta::NoChange,
|
||||
}
|
||||
}
|
||||
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
||||
VectorState::Manual => VectorStateDelta::NoChange,
|
||||
// generated vectors must be regenerated
|
||||
VectorState::Generated => {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(
|
||||
&mut manual_errors,
|
||||
embedder_name.as_str(),
|
||||
document_id,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
|
||||
}
|
||||
},
|
||||
// prompt regeneration is only triggered for existing embedders
|
||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
||||
if old.must_regenerate() {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(
|
||||
&mut manual_errors,
|
||||
embedder_name.as_str(),
|
||||
document_id,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_if_prompt_changed(
|
||||
obkv,
|
||||
(old_prompt, prompt),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
)?
|
||||
} else {
|
||||
// we can simply ignore user provided vectors as they are not regenerated and are
|
||||
// already in the DB since this is an existing embedder
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
}
|
||||
ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided,
|
||||
}) => extract_vector_document_diff(
|
||||
docid,
|
||||
obkv,
|
||||
prompt,
|
||||
(add_to_user_provided, remove_from_user_provided),
|
||||
(old, new),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
document_id,
|
||||
embedder_name,
|
||||
embedder_is_manual,
|
||||
&mut manual_errors,
|
||||
)?,
|
||||
};
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
remove_vectors_writer,
|
||||
prompts_writer,
|
||||
manual_vectors_writer,
|
||||
&mut key_buffer,
|
||||
delta,
|
||||
)?;
|
||||
}
|
||||
|
||||
unused_vectors_distribution.append(parsed_vectors);
|
||||
}
|
||||
|
||||
ManualEmbedderErrors::to_result(
|
||||
manual_errors,
|
||||
possible_embedding_mistakes,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt: _,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
action,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided,
|
||||
} in extractors
|
||||
{
|
||||
let remove_from_user_provided =
|
||||
if let ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided,
|
||||
}) = action
|
||||
{
|
||||
remove_from_user_provided
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
results.push(ExtractedVectorPoints {
|
||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||
prompts: writer_into_reader(prompts_writer)?,
|
||||
embedder,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
})
|
||||
}
|
||||
|
||||
Ok((results, unused_vectors_distribution))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments
|
||||
fn extract_vector_document_diff(
|
||||
docid: DocumentId,
|
||||
obkv: obkv::KvReader<'_, FieldId>,
|
||||
prompt: &Prompt,
|
||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
||||
(old, new): (VectorState, VectorState),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
document_id: impl Fn() -> Value,
|
||||
embedder_name: &str,
|
||||
embedder_is_manual: bool,
|
||||
manual_errors: &mut Option<ManualEmbedderErrors>,
|
||||
) -> Result<VectorStateDelta> {
|
||||
match (old.must_regenerate(), new.must_regenerate()) {
|
||||
(true, true) | (false, false) => {}
|
||||
(true, false) => {
|
||||
add_to_user_provided.insert(docid);
|
||||
}
|
||||
(false, true) => {
|
||||
remove_from_user_provided.insert(docid);
|
||||
}
|
||||
}
|
||||
|
||||
let delta = match (old, new) {
|
||||
// regardless of the previous state, if a document now contains inline _vectors, they must
|
||||
// be extracted manually
|
||||
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
|
||||
Some(add_vectors) => {
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::NowManual(add_vectors)
|
||||
}
|
||||
None => VectorStateDelta::NoChange,
|
||||
},
|
||||
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
|
||||
// document changed
|
||||
(VectorState::Generated, VectorState::Generated) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt = Some(&prompt).map(|p| {
|
||||
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
|
||||
});
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
if old_prompt.as_ref() != Some(&new_prompt) {
|
||||
let old_prompt = old_prompt.unwrap_or_default();
|
||||
tracing::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
tracing::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
// inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from
|
||||
// the previous version of the document.
|
||||
// Manual -> Generated is also not possible without an Inline to the right (which is handled above)
|
||||
// Generated -> Generated is handled above, so not possible
|
||||
// As a result, this code is unreachable
|
||||
(_not_generated, VectorState::Generated) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
new_fields_ids_map,
|
||||
)?)
|
||||
} else {
|
||||
// make sure the document is always removed from user provided on removal
|
||||
remove_from_user_provided.insert(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
// inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous
|
||||
// version of the document.
|
||||
// however the Rust type system cannot know that.
|
||||
(_manual, VectorState::Manual) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// if the new version of documents has the vectors in the DB,
|
||||
// then they are user-provided and nothing possibly changed
|
||||
VectorStateDelta::NoChange
|
||||
} else {
|
||||
// make sure the document is always removed from user provided on removal
|
||||
remove_from_user_provided.insert(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(delta)
|
||||
}
|
||||
|
||||
fn regenerate_if_prompt_changed(
|
||||
obkv: obkv::KvReader<'_, FieldId>,
|
||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
) -> Result<VectorStateDelta> {
|
||||
let old_prompt =
|
||||
old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default());
|
||||
let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
if new_prompt == old_prompt {
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
Ok(VectorStateDelta::NowGenerated(new_prompt))
|
||||
}
|
||||
|
||||
fn regenerate_prompt(
|
||||
obkv: obkv::KvReader<'_, FieldId>,
|
||||
prompt: &Prompt,
|
||||
new_fields_ids_map: &FieldsIdsMapWithMetadata,
|
||||
) -> Result<VectorStateDelta> {
|
||||
let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
Ok(VectorStateDelta::NowGenerated(prompt))
|
||||
}
|
||||
|
||||
/// We cannot compute the diff between both Del and Add vectors.
|
||||
/// We'll push every vector and compute the difference later in TypedChunk.
|
||||
fn push_vectors_diff(
|
||||
remove_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
prompts_writer: &mut Writer<BufWriter<File>>,
|
||||
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
delta: VectorStateDelta,
|
||||
) -> Result<()> {
|
||||
let (must_remove, prompt, mut add_vectors) = delta.into_values();
|
||||
if must_remove {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
remove_vectors_writer.insert(&key_buffer, [])?;
|
||||
}
|
||||
if !prompt.is_empty() {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
prompts_writer.insert(&key_buffer, prompt.as_bytes())?;
|
||||
}
|
||||
|
||||
// We sort and dedup the vectors
|
||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
|
||||
// insert vectors into the writer
|
||||
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
// Generate the key by extending the unique index to it.
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let index = u16::try_from(i).unwrap();
|
||||
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compares two vectors by using the OrderingFloat helper.
|
||||
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
|
||||
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
// docid, prompt
|
||||
prompt_reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedder: Arc<Embedder>,
|
||||
embedder_name: &str,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
|
||||
|
||||
// docid, state with embedding
|
||||
let mut state_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut chunks = Vec::with_capacity(n_chunks);
|
||||
let mut current_chunk = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut current_chunk_ids = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut chunks_ids = Vec::with_capacity(n_chunks);
|
||||
let mut cursor = prompt_reader.into_cursor()?;
|
||||
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
// SAFETY: precondition, the grenad value was saved from a string
|
||||
let prompt = unsafe { std::str::from_utf8_unchecked(value) };
|
||||
if current_chunk.len() == current_chunk.capacity() {
|
||||
chunks.push(std::mem::replace(
|
||||
&mut current_chunk,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
chunks_ids.push(std::mem::replace(
|
||||
&mut current_chunk_ids,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
};
|
||||
current_chunk.push(prompt.to_owned());
|
||||
current_chunk_ids.push(docid);
|
||||
|
||||
if chunks.len() == chunks.capacity() {
|
||||
let chunked_embeds = embed_chunks(
|
||||
&embedder,
|
||||
std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?;
|
||||
}
|
||||
chunks_ids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// send last chunk
|
||||
if !chunks.is_empty() {
|
||||
let chunked_embeds = embed_chunks(
|
||||
&embedder,
|
||||
std::mem::take(&mut chunks),
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?;
|
||||
}
|
||||
}
|
||||
|
||||
if !current_chunk.is_empty() {
|
||||
let embeds = embed_chunks(
|
||||
&embedder,
|
||||
vec![std::mem::take(&mut current_chunk)],
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
|
||||
if let Some(embeds) = embeds.first() {
|
||||
for (docid, embeddings) in current_chunk_ids.iter().zip(embeds.iter()) {
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(state_writer)
|
||||
}
|
||||
|
||||
fn embed_chunks(
|
||||
embedder: &Embedder,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
embedder_name: &str,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<Vec<Vec<Embeddings<f32>>>> {
|
||||
match embedder.embed_chunks(text_chunks, request_threads) {
|
||||
Ok(chunks) => Ok(chunks),
|
||||
Err(error) => {
|
||||
if let FaultSource::Bug = error.fault {
|
||||
Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(
|
||||
error.into(),
|
||||
)))
|
||||
} else {
|
||||
let mut msg =
|
||||
format!(r"While embedding documents for embedder `{embedder_name}`: {error}");
|
||||
|
||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
}
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in
|
||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,241 @@
|
|||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||
writer_into_reader, GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::sorter_into_reader;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
///
|
||||
/// The first returned reader is the one for normal word_docids, and the second one is for
|
||||
/// exact_word_docids
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
);
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut del_words = BTreeSet::new();
|
||||
let mut add_words = BTreeSet::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let (fid_bytes, _) = try_split_array_at(fid_bytes)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let fid = u16::from_be_bytes(fid_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (_pos, word) in KvReaderU16::new(deletion).iter() {
|
||||
del_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (_pos, word) in KvReaderU16::new(addition).iter() {
|
||||
add_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
words_into_sorter(
|
||||
document_id,
|
||||
fid,
|
||||
&mut key_buffer,
|
||||
&del_words,
|
||||
&add_words,
|
||||
&mut word_fid_docids_sorter,
|
||||
)?;
|
||||
|
||||
del_words.clear();
|
||||
add_words.clear();
|
||||
}
|
||||
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
);
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
);
|
||||
|
||||
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||
let mut buffer = Vec::new();
|
||||
// NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters.
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// only keep the value if their is a change to apply in the DB.
|
||||
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
|
||||
word_fid_docids_writer.insert(key, value)?;
|
||||
}
|
||||
|
||||
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
|
||||
// merge all deletions
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = obkv.get(DelAdd::Deletion) {
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, value)?;
|
||||
if delete_from_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
// merge all additions
|
||||
if let Some(value) = obkv.get(DelAdd::Addition) {
|
||||
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, value)?;
|
||||
if add_in_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
||||
writer_into_reader(word_fid_docids_writer)?,
|
||||
))
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn words_into_sorter(
|
||||
document_id: DocumentId,
|
||||
fid: FieldId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_words: &BTreeSet<Vec<u8>>,
|
||||
add_words: &BTreeSet<Vec<u8>>,
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let word_bytes = match eob {
|
||||
Left(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Right(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Both(word_bytes, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn docids_into_writers<W>(
|
||||
word: &str,
|
||||
deletions: &RoaringBitmap,
|
||||
additions: &RoaringBitmap,
|
||||
writer: &mut grenad::Writer<W>,
|
||||
) -> Result<()>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
if deletions == additions {
|
||||
// if the same value is deleted and added, do nothing.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Write each value in the same KvDelAdd before inserting it in the final writer.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
// deletions:
|
||||
if !deletions.is_empty() && !deletions.is_subset(additions) {
|
||||
obkv.insert(
|
||||
DelAdd::Deletion,
|
||||
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
// additions:
|
||||
if !additions.is_empty() {
|
||||
obkv.insert(
|
||||
DelAdd::Addition,
|
||||
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert everything in the same writer.
|
||||
writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -0,0 +1,260 @@
|
|||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{cmp, io};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||
writer_into_reader, GrenadParameters, MergeFn,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted word pairs proximities and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
// early return if the data shouldn't be deleted nor created.
|
||||
if settings_diff.settings_update_only && !settings_diff.reindex_proximities() {
|
||||
let writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
return writer_into_reader(writer);
|
||||
}
|
||||
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / MAX_DISTANCE as usize),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut del_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut add_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut del_word_pair_proximity = BTreeMap::new();
|
||||
let mut add_word_pair_proximity = BTreeMap::new();
|
||||
let mut current_document_id = None;
|
||||
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
// if we change document, we fill the sorter
|
||||
if current_document_id.map_or(false, |id| id != document_id) {
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
del_word_pair_proximity.clear();
|
||||
add_word_pair_proximity.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
if !any_deletion {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while del_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
del_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !del_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
|| {
|
||||
if !any_addition {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while add_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
add_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !add_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
del?;
|
||||
add?;
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
document_id,
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
for sorter in word_pair_proximity_docids_sorters {
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||
///
|
||||
/// This list is used by the engine to calculate the documents containing words that are
|
||||
/// close to each other.
|
||||
fn document_word_positions_into_sorter(
|
||||
document_id: DocumentId,
|
||||
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut key_buffer = Vec::new();
|
||||
for eob in
|
||||
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
||||
d.cmp(a)
|
||||
})
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let ((w1, w2), prox) = match eob {
|
||||
Left(key_value) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Right(key_value) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Both(key_value, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.push(*prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
|
||||
word_pair_proximity_docids_sorters[*prox as usize - 1]
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(String, u16)>,
|
||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||
) -> Result<()> {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity
|
||||
.entry((head_word.clone(), word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
|
@ -0,0 +1,138 @@
|
|||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||
GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{bucketed_position, DocumentId, Result};
|
||||
|
||||
/// Extracts the word positions and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words at positions and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut current_document_id: Option<u32> = None;
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||
|
||||
if current_document_id.map_or(false, |id| document_id != id) {
|
||||
words_position_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
del_word_positions.clear();
|
||||
add_word_positions.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
|
||||
let position = bucketed_position(position);
|
||||
del_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (position, word_bytes) in KvReaderU16::new(addition).iter() {
|
||||
let position = bucketed_position(position);
|
||||
add_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
words_position_into_sorter(
|
||||
document_id,
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO remove noop DelAdd OBKV
|
||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||
|
||||
Ok(word_position_docids_reader)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn words_position_into_sorter(
|
||||
document_id: DocumentId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let (position, word_bytes) = match eob {
|
||||
Left(key) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Right(key) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Both(key, _) => {
|
||||
// both values needs to be kept because it will be used in other extractors.
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
406
crates/milli/src/update/index_documents/extract/mod.rs
Normal file
406
crates/milli/src/update/index_documents/extract/mod.rs
Normal file
|
@ -0,0 +1,406 @@
|
|||
mod extract_docid_word_positions;
|
||||
mod extract_facet_number_docids;
|
||||
mod extract_facet_string_docids;
|
||||
mod extract_fid_docid_facet_values;
|
||||
mod extract_fid_word_count_docids;
|
||||
mod extract_geo_points;
|
||||
mod extract_vector_points;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||
use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||
use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_vector_points::{
|
||||
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
|
||||
};
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::PossibleEmbeddingMistakes;
|
||||
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
/// Extract data for each databases from obkv documents in parallel.
|
||||
/// Send data in grenad file over provided Sender.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub(crate) fn data_from_obkv_documents(
|
||||
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
) -> Result<()> {
|
||||
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
original_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|original_documents_chunk| {
|
||||
send_original_documents_data(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
embedders_configs.clone(),
|
||||
settings_diff.clone(),
|
||||
possible_embedding_mistakes.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
|| {
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
primary_key_id,
|
||||
settings_diff.clone(),
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.map(|result| {
|
||||
if let Ok((
|
||||
ref docid_word_positions_chunk,
|
||||
(ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk),
|
||||
)) = result
|
||||
{
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
);
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
),
|
||||
>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_docids,
|
||||
|(
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||
>(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
);
|
||||
|
||||
original_pipeline_result.and(flattened_pipeline_result)
|
||||
}
|
||||
|
||||
/// Spawn a new task to extract data for a specific DB using extract_fn.
|
||||
/// Generated grenad chunks are merged using the merge_fn.
|
||||
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
||||
/// and sent into lmdb_writer_sx.
|
||||
fn run_extraction_task<FE, FS, M>(
|
||||
chunk: grenad::Reader<CursorClonableMmap>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
serialize_fn: FS,
|
||||
) where
|
||||
FE: Fn(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
GrenadParameters,
|
||||
&InnerIndexSettingsDiff,
|
||||
) -> Result<M>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
|
||||
M: Send,
|
||||
{
|
||||
let current_span = tracing::Span::current();
|
||||
|
||||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
|
||||
match extract_fn(chunk, indexer, &settings_diff) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn request_threads() -> &'static ThreadPoolNoAbort {
|
||||
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
|
||||
|
||||
REQUEST_THREADS.get_or_init(|| {
|
||||
ThreadPoolNoAbortBuilder::new()
|
||||
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||
.thread_name(|index| format!("embedding-request-{index}"))
|
||||
.build()
|
||||
.unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents
|
||||
fn send_original_documents_data(
|
||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
|
||||
// no point in indexing vectors without embedders
|
||||
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
|
||||
|
||||
if index_vectors {
|
||||
let settings_diff = settings_diff.clone();
|
||||
let embedders_configs = embedders_configs.clone();
|
||||
|
||||
let original_documents_chunk = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
match extract_vector_points(
|
||||
original_documents_chunk.clone(),
|
||||
indexer,
|
||||
&embedders_configs,
|
||||
&settings_diff,
|
||||
&possible_embedding_mistakes,
|
||||
) {
|
||||
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
||||
for ExtractedVectorPoints {
|
||||
manual_vectors,
|
||||
remove_vectors,
|
||||
prompts,
|
||||
embedder_name,
|
||||
embedder,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
} in extracted_vectors
|
||||
{
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
&embedder_name,
|
||||
&possible_embedding_mistakes,
|
||||
&unused_vectors_distribution,
|
||||
request_threads(),
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: create a custom internal error
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents_ids
|
||||
/// - docid_word_positions
|
||||
/// - docid_fid_facet_numbers
|
||||
/// - docid_fid_facet_strings
|
||||
/// - docid_fid_facet_exists
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn send_and_extract_flattened_documents_data(
|
||||
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||
)> {
|
||||
let flattened_documents_chunk =
|
||||
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
if settings_diff.run_geo_indexing() {
|
||||
let documents_chunk_cloned = flattened_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
let settings_diff = settings_diff.clone();
|
||||
rayon::spawn(move || {
|
||||
let result =
|
||||
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
|
||||
let _ = match result {
|
||||
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
|
||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
let docid_word_positions_chunk = extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
max_positions_per_attributes,
|
||||
)?;
|
||||
|
||||
// send docid_word_positions_chunk to DB writer
|
||||
let docid_word_positions_chunk =
|
||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||
|
||||
Ok(docid_word_positions_chunk)
|
||||
},
|
||||
|| {
|
||||
let ExtractedFacetValues {
|
||||
fid_docid_facet_numbers_chunk,
|
||||
fid_docid_facet_strings_chunk,
|
||||
fid_facet_is_null_docids_chunk,
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
fid_facet_exists_docids_chunk,
|
||||
} = extract_fid_docid_facet_values(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
)?;
|
||||
|
||||
// send fid_docid_facet_numbers_chunk to DB writer
|
||||
let fid_docid_facet_numbers_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
)));
|
||||
|
||||
// send fid_docid_facet_strings_chunk to DB writer
|
||||
let fid_docid_facet_strings_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
|
||||
|
||||
Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
|
||||
},
|
||||
);
|
||||
|
||||
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
/// Wrapper around Mmap allowing to virtually clone grenad-chunks
|
||||
/// in a parallel process like the indexing.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ClonableMmap {
|
||||
inner: Arc<Mmap>,
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ClonableMmap {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.inner.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Mmap> for ClonableMmap {
|
||||
fn from(inner: Mmap) -> ClonableMmap {
|
||||
ClonableMmap { inner: Arc::new(inner) }
|
||||
}
|
||||
}
|
||||
|
||||
pub type CursorClonableMmap = std::io::Cursor<ClonableMmap>;
|
|
@ -0,0 +1,214 @@
|
|||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter, Seek};
|
||||
|
||||
use grenad::{CompressionType, Sorter};
|
||||
use heed::types::Bytes;
|
||||
|
||||
use super::{ClonableMmap, MergeFn};
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::Result;
|
||||
|
||||
/// Upper bound, in bytes, applied to the memory budget of a single grenad sorter.
///
/// Considered reasonable given that there is one grenad sorter per thread.
const MAX_GRENAD_SORTER_USAGE: usize = 500 * 1024 * 1024; // 500 MiB
|
||||
|
||||
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
||||
|
||||
/// Builds a grenad writer over `file`, wrapped into a `BufWriter`.
///
/// `typ` selects the compression type; the compression `level` is only
/// configured when one is provided.
pub fn create_writer<R: io::Write>(
    typ: grenad::CompressionType,
    level: Option<u32>,
    file: R,
) -> grenad::Writer<BufWriter<R>> {
    let mut writer_builder = grenad::Writer::builder();
    writer_builder.compression_type(typ);

    match level {
        Some(level) => {
            writer_builder.compression_level(level);
        }
        None => (),
    }

    let buffered_file = BufWriter::new(file);
    writer_builder.build(buffered_file)
}
|
||||
|
||||
/// A helper function that creates a grenad sorter
|
||||
/// with the given parameters. The max memory is
|
||||
/// clamped to something reasonable.
|
||||
pub fn create_sorter(
|
||||
sort_algorithm: grenad::SortAlgorithm,
|
||||
merge: MergeFn,
|
||||
chunk_compression_type: grenad::CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
) -> grenad::Sorter<MergeFn> {
|
||||
let mut builder = grenad::Sorter::builder(merge);
|
||||
builder.chunk_compression_type(chunk_compression_type);
|
||||
if let Some(level) = chunk_compression_level {
|
||||
builder.chunk_compression_level(level);
|
||||
}
|
||||
if let Some(nb_chunks) = max_nb_chunks {
|
||||
builder.max_nb_chunks(nb_chunks);
|
||||
}
|
||||
if let Some(memory) = max_memory {
|
||||
builder.dump_threshold(memory.min(MAX_GRENAD_SORTER_USAGE));
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.sort_algorithm(sort_algorithm);
|
||||
builder.sort_in_parallel(true);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn sorter_into_reader(
|
||||
sorter: grenad::Sorter<MergeFn>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(
|
||||
writer: grenad::Writer<BufWriter<File>>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
||||
file.rewind()?;
|
||||
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub unsafe fn as_cloneable_grenad(
|
||||
reader: &grenad::Reader<BufReader<File>>,
|
||||
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
||||
let file = reader.get_ref().get_ref();
|
||||
let mmap = memmap2::Mmap::map(file)?;
|
||||
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
||||
let reader = grenad::Reader::new(cursor)?;
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GrenadParameters {
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub max_memory: Option<usize>,
|
||||
pub max_nb_chunks: Option<usize>,
|
||||
}
|
||||
|
||||
impl Default for GrenadParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
max_memory: None,
|
||||
max_nb_chunks: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GrenadParameters {
|
||||
/// This function use the number of threads in the current threadpool to compute the value.
|
||||
///
|
||||
/// This should be called inside of a rayon thread pool,
|
||||
/// otherwise, it will take the global number of threads.
|
||||
///
|
||||
/// The max memory cannot exceed a given reasonable value.
|
||||
pub fn max_memory_by_thread(&self) -> Option<usize> {
|
||||
self.max_memory.map(|max_memory| {
|
||||
(max_memory / rayon::current_num_threads()).min(MAX_GRENAD_SORTER_USAGE)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator that outputs grenad readers of obkv documents
|
||||
/// with a maximum size of approximately `documents_chunks_size`.
|
||||
///
|
||||
/// The grenad obkv entries are composed of an incremental document id big-endian
|
||||
/// encoded as the key and an obkv object with an `u8` for the field as the key
|
||||
/// and a simple UTF-8 encoded string as the value.
|
||||
pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
||||
reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
documents_chunk_size: usize,
|
||||
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
|
||||
let mut continue_reading = true;
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
|
||||
let mut transposer = move || {
|
||||
if !continue_reading {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut current_chunk_size = 0u64;
|
||||
let mut obkv_documents = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
while let Some((document_id, obkv)) = cursor.move_on_next()? {
|
||||
if !obkv.is_empty() {
|
||||
obkv_documents.insert(document_id, obkv)?;
|
||||
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
||||
|
||||
if current_chunk_size >= documents_chunk_size as u64 {
|
||||
return writer_into_reader(obkv_documents).map(Some);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
continue_reading = false;
|
||||
writer_into_reader(obkv_documents).map(Some)
|
||||
};
|
||||
|
||||
Ok(std::iter::from_fn(move || transposer().transpose()))
|
||||
}
|
||||
|
||||
/// Write provided sorter in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn write_sorter_into_database<K, V, FS, FM>(
|
||||
sorter: Sorter<MergeFn>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = merger_iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
Some(serialize_value(value, &mut buffer)?)
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
}
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Used when trying to merge readers, but you don't actually care about the values.
|
||||
pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(Cow::Owned(Vec::new()))
|
||||
}
|
|
@ -0,0 +1,261 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::transform::Operation;
|
||||
use crate::Result;
|
||||
|
||||
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
|
||||
|
||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
buffer.reserve(bitmap.serialized_size());
|
||||
bitmap.serialize_into(buffer)
|
||||
}
|
||||
|
||||
pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let merged = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(RoaringBitmap::deserialize_from)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|a, b| a | b)
|
||||
.unwrap();
|
||||
let mut buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&merged, &mut buffer)?;
|
||||
Ok(Cow::Owned(buffer))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(values[0].clone())
|
||||
}
|
||||
|
||||
/// Only the last value associated with an id is kept.
|
||||
pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(obkvs.last().unwrap().clone())
|
||||
}
|
||||
|
||||
pub fn merge_two_del_add_obkvs(
|
||||
base: obkv::KvReaderU16<'_>,
|
||||
update: obkv::KvReaderU16<'_>,
|
||||
merge_additions: bool,
|
||||
buffer: &mut Vec<u8>,
|
||||
) {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
buffer.clear();
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
match eob {
|
||||
Left((k, v)) => {
|
||||
if merge_additions {
|
||||
writer.insert(k, v).unwrap()
|
||||
} else {
|
||||
// If merge_additions is false, recreate an obkv keeping the deletions only.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::new(v);
|
||||
|
||||
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||
Both((k, base), (_, update)) => {
|
||||
// merge deletions and additions.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::new(base);
|
||||
let update_reader = KvReaderDelAdd::new(update);
|
||||
|
||||
// keep newest deletion.
|
||||
if let Some(deletion) = update_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.or_else(|| base_reader.get(DelAdd::Deletion))
|
||||
{
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
}
|
||||
|
||||
// keep base addition only if merge_additions is true.
|
||||
let base_addition =
|
||||
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
|
||||
// keep newest addition.
|
||||
// TODO use or_else
|
||||
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
}
|
||||
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
fn inner_merge_del_add_obkvs<'a>(
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
merge_additions: bool,
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
// pop the newest operation from the list.
|
||||
let (newest, obkvs) = obkvs.split_last().unwrap();
|
||||
// keep the operation type for the returned value.
|
||||
let newest_operation_type = newest[0];
|
||||
|
||||
// treat the newest obkv as the starting point of the merge.
|
||||
let mut acc_operation_type = newest_operation_type;
|
||||
let mut acc = newest[1..].to_vec();
|
||||
let mut buffer = Vec::new();
|
||||
// reverse iter from the most recent to the oldest.
|
||||
for current in obkvs.iter().rev() {
|
||||
// if in the previous iteration there was a complete deletion,
|
||||
// stop the merge process.
|
||||
if acc_operation_type == Operation::Deletion as u8 {
|
||||
break;
|
||||
}
|
||||
|
||||
let newest = obkv::KvReader::new(&acc);
|
||||
let oldest = obkv::KvReader::new(¤t[1..]);
|
||||
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
|
||||
|
||||
// we want the result of the merge into our accumulator.
|
||||
std::mem::swap(&mut acc, &mut buffer);
|
||||
acc_operation_type = current[0];
|
||||
}
|
||||
|
||||
acc.insert(0, newest_operation_type);
|
||||
Ok(Cow::from(acc))
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
pub fn obkvs_merge_additions_and_deletions<'a>(
|
||||
_key: &[u8],
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, true)
|
||||
}
|
||||
|
||||
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
|
||||
pub fn obkvs_keep_last_addition_merge_deletions<'a>(
|
||||
_key: &[u8],
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, false)
|
||||
}
|
||||
|
||||
/// Do a union of all the CboRoaringBitmaps in the values.
|
||||
pub fn merge_cbo_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let mut vec = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
|
||||
Ok(Cow::from(vec))
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
|
||||
/// separately and outputs a new DelAdd with both unions.
|
||||
pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_bitmaps_bytes = Vec::new();
|
||||
let mut add_bitmaps_bytes = Vec::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||
del_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||
add_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let mut buffer = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
/// A function that merges a DelAdd of bitmao into an already existing bitmap.
|
||||
///
|
||||
/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
|
||||
/// the second one is the CboRoaringBitmap to merge into.
|
||||
pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||
deladd_obkv: &[u8],
|
||||
previous: &[u8],
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> Result<Option<&'a [u8]>> {
|
||||
Ok(CboRoaringBitmapCodec::merge_deladd_into(
|
||||
KvReaderDelAdd::new(deladd_obkv),
|
||||
previous,
|
||||
buffer,
|
||||
)?)
|
||||
}
|
||||
|
||||
/// Do a union of BtreeSet on both sides of a DelAdd obkv
|
||||
/// separately and outputs a new DelAdd with both unions.
|
||||
pub fn merge_deladd_btreeset_string<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_set = BTreeSet::new();
|
||||
let mut add_set = BTreeSet::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(bytes) = obkv.get(DelAdd::Deletion) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
del_set.insert(value);
|
||||
}
|
||||
}
|
||||
if let Some(bytes) = obkv.get(DelAdd::Addition) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
add_set.insert(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let del = serde_json::to_vec(&del_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
|
||||
let add = serde_json::to_vec(&add_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &add)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
75
crates/milli/src/update/index_documents/helpers/mod.rs
Normal file
75
crates/milli/src/update/index_documents/helpers/mod.rs
Normal file
|
@ -0,0 +1,75 @@
|
|||
mod clonable_mmap;
|
||||
mod grenad_helpers;
|
||||
mod merge_functions;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
|
||||
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
pub use grenad_helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
||||
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
|
||||
GrenadParameters,
|
||||
};
|
||||
pub use merge_functions::{
|
||||
keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_deladd_btreeset_string,
|
||||
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
|
||||
obkvs_merge_additions_and_deletions, MergeFn,
|
||||
};
|
||||
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
|
||||
key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
|
||||
}
|
||||
|
||||
/// Splits `slice` in two at index `mid`, or returns `None` when `mid` is out
/// of bounds (greater than the length of the slice).
pub fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    let in_bounds = mid <= slice.len();
    in_bounds.then(|| slice.split_at(mid))
}
|
||||
|
||||
/// Splits `slice` into an array made of its first `N` elements and the
/// remaining tail, or returns `None` when `N` is out of bounds.
pub fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    if N > slice.len() {
        return None;
    }

    let (head, tail) = slice.split_at(N);
    match <[T; N]>::try_from(head) {
        Ok(head) => Some((head, tail)),
        Err(_) => None,
    }
}
|
||||
|
||||
/// Converts an fst Stream into an HashSet of Strings.
|
||||
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
|
||||
where
|
||||
I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
|
||||
S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
|
||||
{
|
||||
let mut hashset = HashSet::new();
|
||||
let mut stream = stream.into_stream();
|
||||
while let Some(value) = stream.next() {
|
||||
hashset.insert(value.to_owned());
|
||||
}
|
||||
hashset
|
||||
}
|
||||
|
||||
// Converts an fst Stream into a Vec of Strings.
|
||||
pub fn fst_stream_into_vec<'f, I, S>(stream: I) -> Vec<String>
|
||||
where
|
||||
I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
|
||||
S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
|
||||
{
|
||||
let mut strings = Vec::new();
|
||||
let mut stream = stream.into_stream();
|
||||
while let Some(word) = stream.next() {
|
||||
let s = std::str::from_utf8(word).unwrap();
|
||||
strings.push(s.to_owned());
|
||||
}
|
||||
strings
|
||||
}
|
3468
crates/milli/src/update/index_documents/mod.rs
Normal file
3468
crates/milli/src/update/index_documents/mod.rs
Normal file
File diff suppressed because it is too large
Load diff
86
crates/milli/src/update/index_documents/parallel.rs
Normal file
86
crates/milli/src/update/index_documents/parallel.rs
Normal file
|
@ -0,0 +1,86 @@
|
|||
use heed::types::Bytes;
|
||||
use heed::{Database, RoTxn};
|
||||
use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{all_obkv_to_json, DocumentId, FieldsIdsMap, Object, ObkvCodec, Result, BEU32};
|
||||
|
||||
pub struct ImmutableObkvs<'t> {
|
||||
ids: RoaringBitmap,
|
||||
fields_ids_map: FieldsIdsMap,
|
||||
slices: Vec<&'t [u8]>,
|
||||
}
|
||||
|
||||
impl<'t> ImmutableObkvs<'t> {
|
||||
/// Creates the structure by fetching all the OBKVs
|
||||
/// and keeping the transaction making the pointers valid.
|
||||
pub fn new(
|
||||
rtxn: &'t RoTxn,
|
||||
documents_database: Database<BEU32, ObkvCodec>,
|
||||
fields_ids_map: FieldsIdsMap,
|
||||
subset: RoaringBitmap,
|
||||
) -> heed::Result<Self> {
|
||||
let mut slices = Vec::new();
|
||||
let documents_database = documents_database.remap_data_type::<Bytes>();
|
||||
for docid in &subset {
|
||||
let slice = documents_database.get(rtxn, &docid)?.unwrap();
|
||||
slices.push(slice);
|
||||
}
|
||||
|
||||
Ok(ImmutableObkvs { ids: subset, fields_ids_map, slices })
|
||||
}
|
||||
|
||||
/// Returns the OBKVs identified by the given ID.
|
||||
pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<KvReaderU16<'t>>> {
|
||||
match self
|
||||
.ids
|
||||
.rank(docid)
|
||||
.checked_sub(1)
|
||||
.and_then(|offset| self.slices.get(offset as usize))
|
||||
{
|
||||
Some(bytes) => Ok(Some(KvReaderU16::new(bytes))),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the owned rhai::Map identified by the given ID.
|
||||
pub fn rhai_map(&self, docid: DocumentId) -> Result<Option<rhai::Map>> {
|
||||
let obkv = match self.obkv(docid) {
|
||||
Ok(Some(obkv)) => obkv,
|
||||
Ok(None) => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
|
||||
let map: Result<rhai::Map> = all_keys
|
||||
.iter()
|
||||
.copied()
|
||||
.flat_map(|id| obkv.get(id).map(|value| (id, value)))
|
||||
.map(|(id, value)| {
|
||||
let name = self.fields_ids_map.name(id).ok_or(
|
||||
crate::error::FieldIdMapMissingEntry::FieldId {
|
||||
field_id: id,
|
||||
process: "all_obkv_to_rhaimap",
|
||||
},
|
||||
)?;
|
||||
let value = serde_json::from_slice(value)
|
||||
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||
Ok((name.into(), value))
|
||||
})
|
||||
.collect();
|
||||
|
||||
map.map(Some)
|
||||
}
|
||||
|
||||
pub fn json_map(&self, docid: DocumentId) -> Result<Option<Object>> {
|
||||
let obkv = match self.obkv(docid) {
|
||||
Ok(Some(obkv)) => obkv,
|
||||
Ok(None) => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
all_obkv_to_json(obkv, &self.fields_ids_map).map(Some)
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): no SAFETY justification is recorded for this impl. The struct only
// holds a `RoaringBitmap`, a `FieldsIdsMap` and a `Vec<&[u8]>`; presumably sharing it
// across threads is sound because none of them is mutated through `&self` — confirm,
// and check whether this manual impl is needed at all.
unsafe impl Sync for ImmutableObkvs<'_> {}
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
[]
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
[2, ]
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
benoit [2, ]
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
|
||||
2 [21, ]
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
2 0 2.2 1 [21, ]
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||
1 0 aquarium 1 [5, ]
|
||||
1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ]
|
||||
1 0 cartoon 1 [2, 7, 15, 17, ]
|
||||
1 0 colorfulness 1 [13, ]
|
||||
1 0 design 1 [2, 18, ]
|
||||
1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ]
|
||||
1 0 geometry 1 [19, ]
|
||||
1 0 letter 1 [1, ]
|
||||
1 0 outdoor 1 [4, ]
|
||||
1 0 painting 1 [3, ]
|
||||
1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ]
|
||||
2 0 design 1 [21, ]
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
|
||||
2 [21, ]
|
||||
36 [3, ]
|
||||
37 [4, ]
|
||||
38 [5, ]
|
||||
39 [6, ]
|
||||
40 [7, ]
|
||||
41 [8, ]
|
||||
42 [9, ]
|
||||
43 [10, ]
|
||||
44 [11, ]
|
||||
45 [12, ]
|
||||
46 [13, ]
|
||||
47 [14, ]
|
||||
5 [1, ]
|
||||
52 [15, ]
|
||||
57 [16, ]
|
||||
58 [17, ]
|
||||
68 [18, ]
|
||||
69 [19, ]
|
||||
7 [2, ]
|
||||
71 [21, ]
|
||||
abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||
aquarium [5, ]
|
||||
art [4, 5, 8, 9, 10, 12, 17, ]
|
||||
cartoon [2, 7, 15, 17, ]
|
||||
colorfulness [13, ]
|
||||
design [2, 18, 21, ]
|
||||
drawing [3, 4, 5, 8, 10, 11, 16, ]
|
||||
geometry [19, ]
|
||||
letter [1, ]
|
||||
outdoor [4, ]
|
||||
painting [3, ]
|
||||
pattern [2, 3, 9, 10, 13, 14, 16, ]
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 1 36 [3, ]
|
||||
1 1 37 [4, ]
|
||||
1 1 38 [5, ]
|
||||
1 1 39 [6, ]
|
||||
1 1 40 [7, ]
|
||||
1 1 41 [8, ]
|
||||
1 1 42 [9, ]
|
||||
1 1 43 [10, ]
|
||||
1 1 44 [11, ]
|
||||
1 1 45 [12, ]
|
||||
1 1 46 [13, ]
|
||||
1 1 47 [14, ]
|
||||
1 1 5 [1, ]
|
||||
1 1 52 [15, ]
|
||||
1 1 57 [16, ]
|
||||
1 1 58 [17, ]
|
||||
1 1 68 [18, ]
|
||||
1 1 69 [19, ]
|
||||
1 1 7 [2, ]
|
||||
1 1 71 [21, ]
|
||||
1 2 2 [21, ]
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
3 0 48.9021 1 [19, ]
|
||||
3 0 49.9314 1 [17, ]
|
||||
3 0 50.1793 1 [15, ]
|
||||
3 0 50.2844 1 [14, ]
|
||||
3 0 50.3518 1 [13, ]
|
||||
3 0 50.4502 1 [12, ]
|
||||
3 0 50.6053 1 [8, ]
|
||||
3 0 50.6224 1 [3, ]
|
||||
3 0 50.6299 1 [0, ]
|
||||
3 0 50.6312 1 [2, ]
|
||||
3 0 50.6415 1 [1, ]
|
||||
3 0 50.7453 1 [7, ]
|
||||
3 0 50.8466 1 [10, ]
|
||||
3 0 51.0537 1 [9, ]
|
||||
4 0 2.271 1 [17, ]
|
||||
4 0 2.3708 1 [19, ]
|
||||
4 0 2.7637 1 [14, ]
|
||||
4 0 3.0569 1 [0, ]
|
||||
4 0 3.1106 1 [1, 2, ]
|
||||
4 0 3.1476 1 [3, ]
|
||||
4 0 3.2189 1 [15, ]
|
||||
4 0 3.2206 1 [7, ]
|
||||
4 0 3.3758 1 [8, ]
|
||||
4 0 3.5326 1 [13, ]
|
||||
4 0 3.6957 1 [9, ]
|
||||
4 0 3.9623 1 [12, ]
|
||||
4 0 4.337 1 [10, ]
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
0 [1, ]
|
||||
1 [2, ]
|
||||
10 [1, ]
|
||||
12 [0, ]
|
||||
1344 [3, ]
|
||||
2 [0, ]
|
||||
23 [5, ]
|
||||
25 [2, ]
|
||||
3 [0, ]
|
||||
35 [5, ]
|
||||
4 [4, ]
|
||||
42 [0, 5, ]
|
||||
456 [1, ]
|
||||
5 [0, ]
|
||||
99 [2, ]
|
||||
adams [5, ]
|
||||
adventure [1, ]
|
||||
alice [2, ]
|
||||
and [0, 4, ]
|
||||
antoine [1, ]
|
||||
austin [0, ]
|
||||
blood [4, ]
|
||||
carroll [2, ]
|
||||
de [1, ]
|
||||
douglas [5, ]
|
||||
exupery [1, ]
|
||||
fantasy [2, 3, 4, ]
|
||||
galaxy [5, ]
|
||||
guide [5, ]
|
||||
half [4, ]
|
||||
harry [4, ]
|
||||
hitchhiker [5, ]
|
||||
hobbit [3, ]
|
||||
in [2, ]
|
||||
j [3, 4, ]
|
||||
jane [0, ]
|
||||
k [4, ]
|
||||
le [1, ]
|
||||
lewis [2, ]
|
||||
petit [1, ]
|
||||
potter [4, ]
|
||||
prejudice [0, ]
|
||||
pride [0, ]
|
||||
prince [1, 4, ]
|
||||
r [3, ]
|
||||
romance [0, ]
|
||||
rowling [4, ]
|
||||
s [5, ]
|
||||
saint [1, ]
|
||||
the [3, 4, 5, ]
|
||||
to [5, ]
|
||||
tolkien [3, ]
|
||||
wonderland [2, ]
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
0 [1, ]
|
||||
1 [2, ]
|
||||
10 [1, ]
|
||||
12 [0, ]
|
||||
1344 [3, ]
|
||||
1813 [0, ]
|
||||
2 [0, ]
|
||||
23 [5, ]
|
||||
25 [2, ]
|
||||
3 [0, ]
|
||||
35 [5, ]
|
||||
4 [4, ]
|
||||
42 [0, 5, ]
|
||||
456 [1, ]
|
||||
5 [0, ]
|
||||
99 [2, ]
|
||||
adams [5, ]
|
||||
adventure [1, ]
|
||||
alice [2, ]
|
||||
and [0, 4, ]
|
||||
antoine [1, ]
|
||||
austen [0, ]
|
||||
blood [4, ]
|
||||
carroll [2, ]
|
||||
de [1, ]
|
||||
douglas [5, ]
|
||||
exupery [1, ]
|
||||
fantasy [2, 3, 4, ]
|
||||
galaxy [5, ]
|
||||
guide [5, ]
|
||||
half [4, ]
|
||||
harry [4, ]
|
||||
hitchhiker [5, ]
|
||||
hobbit [3, ]
|
||||
in [2, ]
|
||||
j [0, 3, 4, ]
|
||||
k [4, ]
|
||||
lewis [2, ]
|
||||
little [1, ]
|
||||
potter [4, ]
|
||||
prejudice [0, ]
|
||||
pride [0, ]
|
||||
prince [1, ]
|
||||
princess [4, ]
|
||||
r [3, ]
|
||||
romance [0, ]
|
||||
rowling [4, ]
|
||||
s [5, ]
|
||||
saint [1, ]
|
||||
the [1, 3, 4, 5, ]
|
||||
to [5, ]
|
||||
tolkien [3, ]
|
||||
wonderland [2, ]
|
||||
|
1266
crates/milli/src/update/index_documents/transform.rs
Normal file
1266
crates/milli/src/update/index_documents/transform.rs
Normal file
File diff suppressed because it is too large
Load diff
896
crates/milli/src/update/index_documents/typed_chunk.rs
Normal file
896
crates/milli/src/update/index_documents/typed_chunk.rs
Normal file
|
@ -0,0 +1,896 @@
|
|||
use std::collections::BTreeSet;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use bytemuck::allocation::pod_collect_to_vec;
|
||||
use grenad::{Merger, MergerBuilder};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, RwTxn};
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key,
|
||||
CursorClonableMmap,
|
||||
};
|
||||
use super::MergeFn;
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
use crate::index::db_name::DOCUMENTS;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{
|
||||
as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::ArroyWrapper;
|
||||
use crate::{
|
||||
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
||||
Result, SerializationError, U8StrStrCodec,
|
||||
};
|
||||
|
||||
/// This struct accumulates and group the TypedChunks
|
||||
/// and is able to give the biggest accumulated group to index them all together
|
||||
/// with a merger.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct ChunkAccumulator {
|
||||
inner: Vec<Vec<TypedChunk>>,
|
||||
}
|
||||
|
||||
impl ChunkAccumulator {
|
||||
pub fn pop_longest(&mut self) -> Option<Vec<TypedChunk>> {
|
||||
match self.inner.iter().max_by_key(|v| v.len()) {
|
||||
Some(left) => {
|
||||
let position = self.inner.iter().position(|right| left.len() == right.len());
|
||||
position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty())
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, chunk: TypedChunk) {
|
||||
match self
|
||||
.inner
|
||||
.iter()
|
||||
.position(|right| right.first().map_or(false, |right| chunk.mergeable_with(right)))
|
||||
{
|
||||
Some(position) => {
|
||||
let v = self.inner.get_mut(position).unwrap();
|
||||
v.push(chunk);
|
||||
}
|
||||
None => self.inner.push(vec![chunk]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum TypedChunk {
|
||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||
Documents(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdWordCountDocids(grenad::Reader<BufReader<File>>),
|
||||
WordDocids {
|
||||
word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
word_fid_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
},
|
||||
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
||||
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetStringDocids((grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||
VectorPoints {
|
||||
remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
embeddings: Option<grenad::Reader<BufReader<File>>>,
|
||||
expected_dimension: usize,
|
||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
embedder_name: String,
|
||||
add_to_user_provided: RoaringBitmap,
|
||||
remove_from_user_provided: RoaringBitmap,
|
||||
},
|
||||
}
|
||||
|
||||
impl TypedChunk {
|
||||
fn mergeable_with(&self, other: &Self) -> bool {
|
||||
use TypedChunk::*;
|
||||
match (self, other) {
|
||||
(FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_))
|
||||
| (FieldIdDocidFacetNumbers(_), FieldIdDocidFacetNumbers(_))
|
||||
| (Documents(_), Documents(_))
|
||||
| (FieldIdWordCountDocids(_), FieldIdWordCountDocids(_))
|
||||
| (WordDocids { .. }, WordDocids { .. })
|
||||
| (WordPositionDocids(_), WordPositionDocids(_))
|
||||
| (WordPairProximityDocids(_), WordPairProximityDocids(_))
|
||||
| (FieldIdFacetStringDocids(_), FieldIdFacetStringDocids(_))
|
||||
| (FieldIdFacetNumberDocids(_), FieldIdFacetNumberDocids(_))
|
||||
| (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
|
||||
| (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
|
||||
| (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
|
||||
| (GeoPoints(_), GeoPoints(_)) => true,
|
||||
(
|
||||
VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
|
||||
VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
|
||||
) => left == right && left_dim == right_dim,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Write typed chunk in the corresponding LMDB database of the provided index.
|
||||
/// Return new documents seen.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
pub(crate) fn write_typed_chunk_into_index(
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
index: &Index,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
typed_chunks: Vec<TypedChunk>,
|
||||
) -> Result<(RoaringBitmap, bool)> {
|
||||
let mut is_merged_database = false;
|
||||
match typed_chunks[0] {
|
||||
TypedChunk::Documents(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "documents");
|
||||
let _entered = span.enter();
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(wtxn)?;
|
||||
let vectors_fid =
|
||||
fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::Documents(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
let mut operations: Vec<DocumentOperation> = Default::default();
|
||||
|
||||
let mut docids = index.documents_ids(wtxn)?;
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
|
||||
let embedders: BTreeSet<_> = index
|
||||
.embedding_configs(wtxn)?
|
||||
.into_iter()
|
||||
.map(|IndexEmbeddingConfig { name, .. }| name)
|
||||
.collect();
|
||||
let mut vectors_buffer = Vec::new();
|
||||
while let Some((key, reader)) = iter.next()? {
|
||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||
let reader: KvReader<'_, FieldId> = KvReader::new(reader);
|
||||
|
||||
let (document_id_bytes, external_id_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?;
|
||||
let docid = DocumentId::from_be_bytes(document_id_bytes);
|
||||
let external_id = std::str::from_utf8(external_id_bytes)?;
|
||||
|
||||
for (field_id, value) in reader.iter() {
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
let addition = if vectors_fid == Some(field_id) {
|
||||
'vectors: {
|
||||
vectors_buffer.clear();
|
||||
let Ok(mut vectors) =
|
||||
crate::vector::parsed_vectors::ParsedVectors::from_bytes(
|
||||
addition,
|
||||
)
|
||||
else {
|
||||
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
|
||||
break 'vectors Some(addition);
|
||||
};
|
||||
vectors.retain_not_embedded_vectors(&embedders);
|
||||
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
|
||||
if vectors.is_empty() {
|
||||
// skip writing empty `_vectors` map
|
||||
break 'vectors None;
|
||||
}
|
||||
|
||||
serde_json::to_writer(&mut vectors_buffer, &vectors)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
Some(vectors_buffer.as_slice())
|
||||
}
|
||||
} else {
|
||||
Some(addition)
|
||||
};
|
||||
|
||||
if let Some(addition) = addition {
|
||||
writer.insert(field_id, addition)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let db = index.documents.remap_data_type::<Bytes>();
|
||||
|
||||
if !writer.is_empty() {
|
||||
db.put(wtxn, &docid, &writer.into_inner().unwrap())?;
|
||||
operations.push(DocumentOperation {
|
||||
external_id: external_id.to_string(),
|
||||
internal_id: docid,
|
||||
kind: DocumentOperationKind::Create,
|
||||
});
|
||||
docids.insert(docid);
|
||||
} else {
|
||||
db.delete(wtxn, &docid)?;
|
||||
operations.push(DocumentOperation {
|
||||
external_id: external_id.to_string(),
|
||||
internal_id: docid,
|
||||
kind: DocumentOperationKind::Delete,
|
||||
});
|
||||
docids.remove(docid);
|
||||
}
|
||||
}
|
||||
let external_documents_docids = index.external_documents_ids();
|
||||
external_documents_docids.apply(wtxn, operations)?;
|
||||
index.put_documents_ids(wtxn, &docids)?;
|
||||
}
|
||||
TypedChunk::FieldIdWordCountDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.field_id_word_count_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordDocids { .. } => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut word_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut exact_word_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut word_fid_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
let clonable_exact_word_docids =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
|
||||
word_docids_builder.push(word_docids_reader.into_cursor()?);
|
||||
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
|
||||
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
|
||||
}
|
||||
|
||||
let word_docids_merger = word_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
word_docids_merger,
|
||||
&index.word_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let exact_word_docids_merger = exact_word_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
exact_word_docids_merger,
|
||||
&index.exact_word_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let word_fid_docids_merger = word_fid_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
word_fid_docids_merger,
|
||||
&index.word_fid_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
// create fst from word docids
|
||||
let fst_merger = fst_merger_builder.build();
|
||||
let fst = merge_word_docids_reader_into_fst(fst_merger)?;
|
||||
let db_fst = index.words_fst(wtxn)?;
|
||||
|
||||
// merge new fst with database fst
|
||||
let union_stream = fst.op().add(db_fst.stream()).union();
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(union_stream)?;
|
||||
let fst = builder.into_set();
|
||||
index.put_words_fst(wtxn, &fst)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPositionDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_position_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetNumberDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
data_size += facet_id_number_docids.len();
|
||||
builder.push(facet_id_number_docids.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size);
|
||||
indexer.execute(wtxn)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut facet_id_string_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut normalized_facet_id_string_builder =
|
||||
MergerBuilder::new(merge_deladd_btreeset_string as MergeFn);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetStringDocids((
|
||||
facet_id_string_docids,
|
||||
normalized_facet_id_string_docids,
|
||||
)) = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
data_size += facet_id_string_docids.len();
|
||||
facet_id_string_builder.push(facet_id_string_docids.into_cursor()?);
|
||||
normalized_facet_id_string_builder
|
||||
.push(normalized_facet_id_string_docids.into_cursor()?);
|
||||
}
|
||||
let facet_id_string_merger = facet_id_string_builder.build();
|
||||
let normalized_facet_id_string_merger = normalized_facet_id_string_builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(
|
||||
index,
|
||||
FacetType::String,
|
||||
facet_id_string_merger,
|
||||
Some(normalized_facet_id_string_merger),
|
||||
data_size,
|
||||
);
|
||||
indexer.execute(wtxn)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetExistsDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_exists_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsNullDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_is_null_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsEmptyDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_is_empty_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
if settings_diff.only_additional_fields.is_some() {
|
||||
write_proximity_entries_into_database_additional_searchables(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
)?;
|
||||
} else {
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
}
|
||||
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetNumbers(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let index_fid_docid_facet_numbers =
|
||||
index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?,
|
||||
(Some(_), None) => {
|
||||
index_fid_docid_facet_numbers.delete(wtxn, key)?;
|
||||
}
|
||||
(Some(_), Some(new)) => {
|
||||
index_fid_docid_facet_numbers.put(wtxn, key, new)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetStrings(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let index_fid_docid_facet_strings =
|
||||
index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?,
|
||||
(Some(_), None) => {
|
||||
index_fid_docid_facet_strings.delete(wtxn, key)?;
|
||||
}
|
||||
(Some(_), Some(new)) => {
|
||||
index_fid_docid_facet_strings.put(wtxn, key, new)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::GeoPoints(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::GeoPoints(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
||||
let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
||||
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
let deladd_obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Deletion) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.remove(&geopoint);
|
||||
geo_faceted_docids.remove(docid);
|
||||
}
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Addition) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.insert(geopoint);
|
||||
geo_faceted_docids.insert(docid);
|
||||
}
|
||||
}
|
||||
index.put_geo_rtree(wtxn, &rtree)?;
|
||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||
}
|
||||
TypedChunk::VectorPoints { .. } => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut add_to_user_provided = RoaringBitmap::new();
|
||||
let mut remove_from_user_provided = RoaringBitmap::new();
|
||||
let mut params = None;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
manual_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
embedder_name,
|
||||
add_to_user_provided: aud,
|
||||
remove_from_user_provided: rud,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
params = Some((expected_dimension, embedder_name));
|
||||
|
||||
remove_vectors_builder.push(remove_vectors.into_cursor()?);
|
||||
manual_vectors_builder.push(manual_vectors.into_cursor()?);
|
||||
if let Some(embeddings) = embeddings {
|
||||
embeddings_builder.push(embeddings.into_cursor()?);
|
||||
}
|
||||
add_to_user_provided |= aud;
|
||||
remove_from_user_provided |= rud;
|
||||
}
|
||||
|
||||
// typed chunks has always at least 1 chunk.
|
||||
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
||||
|
||||
let mut embedding_configs = index.embedding_configs(wtxn)?;
|
||||
let index_embedder_config = embedding_configs
|
||||
.iter_mut()
|
||||
.find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
|
||||
.unwrap();
|
||||
index_embedder_config.user_provided -= remove_from_user_provided;
|
||||
index_embedder_config.user_provided |= add_to_user_provided;
|
||||
|
||||
index.put_embedding_configs(wtxn, embedding_configs)?;
|
||||
|
||||
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||
)?;
|
||||
let binary_quantized = settings_diff
|
||||
.old
|
||||
.embedding_configs
|
||||
.get(&embedder_name)
|
||||
.map_or(false, |conf| conf.2);
|
||||
// FIXME: allow customizing distance
|
||||
let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index)
|
||||
.map(|k| ArroyWrapper::new(index.vector_arroy, k, binary_quantized))
|
||||
.collect();
|
||||
|
||||
// remove vectors for docids we want them removed
|
||||
let merger = remove_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, _)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
for writer in &writers {
|
||||
// Uses invariant: vectors are packed in the first writers.
|
||||
if !writer.del_item(wtxn, expected_dimension, docid)? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add generated embeddings
|
||||
let merger = embeddings_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
let data = pod_collect_to_vec(value);
|
||||
// it is a code error to have embeddings and not expected_dimension
|
||||
let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension)
|
||||
// code error if we somehow got the wrong dimension
|
||||
.unwrap();
|
||||
|
||||
if embeddings.embedding_count() > usize::from(u8::MAX) {
|
||||
let external_docid = if let Ok(Some(Ok(index))) = index
|
||||
.external_id_of(wtxn, std::iter::once(docid))
|
||||
.map(|it| it.into_iter().next())
|
||||
{
|
||||
index
|
||||
} else {
|
||||
format!("internal docid={docid}")
|
||||
};
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
external_docid,
|
||||
embeddings.embedding_count(),
|
||||
)));
|
||||
}
|
||||
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
||||
writer.add_item(wtxn, expected_dimension, docid, embedding)?;
|
||||
}
|
||||
}
|
||||
|
||||
// perform the manual diff
|
||||
let merger = manual_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let (left, _index) = try_split_array_at(key).unwrap();
|
||||
let docid = DocumentId::from_be_bytes(left);
|
||||
|
||||
let vector_deladd_obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||
let vector: Vec<f32> = pod_collect_to_vec(value);
|
||||
|
||||
let mut deleted_index = None;
|
||||
for (index, writer) in writers.iter().enumerate() {
|
||||
let Some(candidate) = writer.item_vector(wtxn, docid)? else {
|
||||
// uses invariant: vectors are packed in the first writers.
|
||||
break;
|
||||
};
|
||||
if candidate == vector {
|
||||
writer.del_item(wtxn, expected_dimension, docid)?;
|
||||
deleted_index = Some(index);
|
||||
}
|
||||
}
|
||||
|
||||
// 🥲 enforce invariant: vectors are packed in the first writers.
|
||||
if let Some(deleted_index) = deleted_index {
|
||||
let mut last_index_with_a_vector = None;
|
||||
for (index, writer) in writers.iter().enumerate().skip(deleted_index) {
|
||||
let Some(candidate) = writer.item_vector(wtxn, docid)? else {
|
||||
break;
|
||||
};
|
||||
last_index_with_a_vector = Some((index, candidate));
|
||||
}
|
||||
if let Some((last_index, vector)) = last_index_with_a_vector {
|
||||
// unwrap: computed the index from the list of writers
|
||||
let writer = writers.get(last_index).unwrap();
|
||||
writer.del_item(wtxn, expected_dimension, docid)?;
|
||||
writers.get(deleted_index).unwrap().add_item(
|
||||
wtxn,
|
||||
expected_dimension,
|
||||
docid,
|
||||
&vector,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||
let vector = pod_collect_to_vec(value);
|
||||
|
||||
// overflow was detected during vector extraction.
|
||||
for writer in &writers {
|
||||
if !writer.contains_item(wtxn, expected_dimension, docid)? {
|
||||
writer.add_item(wtxn, expected_dimension, docid, &vector)?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!("Finished vector chunk for {}", embedder_name);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((RoaringBitmap::new(), is_merged_database))
|
||||
}
|
||||
|
||||
/// Converts the latitude and longitude back to an xyz GeoPoint.
|
||||
fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
|
||||
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
||||
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
||||
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
||||
let xyz_point = lat_lng_to_xyz(&point);
|
||||
GeoPoint::new(xyz_point, (docid, point))
|
||||
}
|
||||
|
||||
fn merge_word_docids_reader_into_fst(
|
||||
merger: Merger<CursorClonableMmap, MergeFn>,
|
||||
) -> Result<fst::Set<Vec<u8>>> {
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
while let Some((k, _)) = iter.next()? {
|
||||
builder.insert(k)?;
|
||||
}
|
||||
|
||||
Ok(builder.into_set())
|
||||
}
|
||||
|
||||
/// Write provided entries in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_entries_into_database<R, K, V, FS, FM>(
|
||||
merger: Merger<R, MergeFn>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Akin to the `write_entries_into_database` function but specialized
|
||||
/// for the case when we only index additional searchable fields only.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_proximity_entries_into_database_additional_searchables<R>(
|
||||
merger: Merger<R, MergeFn>,
|
||||
database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
let (proximity_to_insert, word1, word2) =
|
||||
U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||
let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
Some(value) => {
|
||||
CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
|
||||
}
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let mut data_to_remove = RoaringBitmap::new();
|
||||
for prox in 1..(MAX_DISTANCE as u8) {
|
||||
let key = (prox, word1, word2);
|
||||
let database_value = database.get(wtxn, &key)?.unwrap_or_default();
|
||||
let value = if prox == proximity_to_insert {
|
||||
// Proximity that should be changed.
|
||||
// Union values and remove lower proximity data
|
||||
(&database_value | &data_to_insert) - &data_to_remove
|
||||
} else {
|
||||
// Remove lower proximity data
|
||||
&database_value - &data_to_remove
|
||||
};
|
||||
|
||||
// add the current data in data_to_remove for the next proximities
|
||||
data_to_remove |= &value;
|
||||
|
||||
if database_value != value {
|
||||
database.put(wtxn, &key, &value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
32
crates/milli/src/update/indexer_config.rs
Normal file
32
crates/milli/src/update/indexer_config.rs
Normal file
|
@ -0,0 +1,32 @@
|
|||
use grenad::CompressionType;
|
||||
|
||||
use crate::thread_pool_no_abort::ThreadPoolNoAbort;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct IndexerConfig {
|
||||
pub log_every_n: Option<usize>,
|
||||
pub max_nb_chunks: Option<usize>,
|
||||
pub documents_chunk_size: Option<usize>,
|
||||
pub max_memory: Option<usize>,
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub thread_pool: Option<ThreadPoolNoAbort>,
|
||||
pub max_positions_per_attributes: Option<u32>,
|
||||
pub skip_index_budget: bool,
|
||||
}
|
||||
|
||||
impl Default for IndexerConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
log_every_n: None,
|
||||
max_nb_chunks: None,
|
||||
documents_chunk_size: None,
|
||||
max_memory: None,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
thread_pool: None,
|
||||
max_positions_per_attributes: None,
|
||||
skip_index_budget: false,
|
||||
}
|
||||
}
|
||||
}
|
26
crates/milli/src/update/mod.rs
Normal file
26
crates/milli/src/update/mod.rs
Normal file
|
@ -0,0 +1,26 @@
|
|||
pub use self::available_documents_ids::AvailableDocumentsIds;
|
||||
pub use self::clear_documents::ClearDocuments;
|
||||
pub use self::facet::bulk::FacetsUpdateBulk;
|
||||
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
||||
pub use self::index_documents::{
|
||||
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
|
||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
|
||||
};
|
||||
pub use self::indexer_config::IndexerConfig;
|
||||
pub use self::settings::{validate_embedding_settings, Setting, Settings};
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||
pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids;
|
||||
pub use self::words_prefixes_fst::WordsPrefixesFst;
|
||||
|
||||
mod available_documents_ids;
|
||||
mod clear_documents;
|
||||
pub(crate) mod del_add;
|
||||
pub(crate) mod facet;
|
||||
mod index_documents;
|
||||
mod indexer_config;
|
||||
mod settings;
|
||||
mod update_step;
|
||||
mod word_prefix_docids;
|
||||
mod words_prefix_integer_docids;
|
||||
mod words_prefixes_fst;
|
2776
crates/milli/src/update/settings.rs
Normal file
2776
crates/milli/src/update/settings.rs
Normal file
File diff suppressed because it is too large
Load diff
35
crates/milli/src/update/update_step.rs
Normal file
35
crates/milli/src/update/update_step.rs
Normal file
|
@ -0,0 +1,35 @@
|
|||
use UpdateIndexingStep::*;
|
||||
|
||||
/// The successive steps of a document indexing update, each carrying its own
/// progress counters.
#[derive(Debug, Clone, Copy)]
pub enum UpdateIndexingStep {
    /// Remaps the fields of the added documents to the ones already present in
    /// the database, adding new fields to the schema on the fly.
    RemapDocumentAddition { documents_seen: usize },

    /// Checks the external document ids, computes the internal ids and merges
    /// the documents that are already present in the database.
    ComputeIdsAndMergeDocuments { documents_seen: usize, total_documents: usize },

    /// Extracts the words of the documents with the tokenizer and computes the
    /// facets of the documents. The words, facets and documents ids are stored
    /// on disk.
    IndexDocuments { documents_seen: usize, total_documents: usize },

    /// Merges the previously extracted data (words and facets) into the final
    /// LMDB database. The extracted data is split into multiple databases.
    MergeDataIntoFinalDatabase { databases_seen: usize, total_databases: usize },
}
|
||||
|
||||
impl UpdateIndexingStep {
|
||||
pub const fn step(&self) -> usize {
|
||||
match self {
|
||||
RemapDocumentAddition { .. } => 0,
|
||||
ComputeIdsAndMergeDocuments { .. } => 1,
|
||||
IndexDocuments { .. } => 2,
|
||||
MergeDataIntoFinalDatabase { .. } => 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub const fn number_of_steps(&self) -> usize {
|
||||
4
|
||||
}
|
||||
}
|
153
crates/milli/src/update/word_prefix_docids.rs
Normal file
153
crates/milli/src/update/word_prefix_docids.rs
Normal file
|
@ -0,0 +1,153 @@
|
|||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::{Bytes, Str};
|
||||
use heed::Database;
|
||||
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
|
||||
write_sorter_into_database, CursorClonableMmap, MergeFn,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, Result};
|
||||
|
||||
pub struct WordPrefixDocids<'t, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i>,
|
||||
word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'t, 'i> WordPrefixDocids<'t, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i>,
|
||||
word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
) -> WordPrefixDocids<'t, 'i> {
|
||||
WordPrefixDocids {
|
||||
wtxn,
|
||||
word_docids,
|
||||
word_prefix_docids,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace",
|
||||
skip_all,
|
||||
target = "indexing::prefix",
|
||||
name = "word_prefix_docids"
|
||||
)]
|
||||
pub fn execute(
|
||||
self,
|
||||
new_word_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
) -> Result<()> {
|
||||
// It is forbidden to keep a mutable reference into the database
|
||||
// and write into it at the same time, therefore we write into another file.
|
||||
let mut prefix_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
if !common_prefix_fst_words.is_empty() {
|
||||
let mut current_prefixes: Option<&&[String]> = None;
|
||||
let mut prefixes_cache = HashMap::new();
|
||||
let mut new_word_docids_iter = new_word_docids.into_stream_merger_iter()?;
|
||||
while let Some((word, data)) = new_word_docids_iter.next()? {
|
||||
current_prefixes = match current_prefixes.take() {
|
||||
Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes),
|
||||
_otherwise => {
|
||||
write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?;
|
||||
common_prefix_fst_words
|
||||
.iter()
|
||||
.find(|prefixes| word.starts_with(prefixes[0].as_bytes()))
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(prefixes) = current_prefixes {
|
||||
for prefix in prefixes.iter() {
|
||||
if word.starts_with(prefix.as_bytes()) {
|
||||
match prefixes_cache.get_mut(prefix.as_bytes()) {
|
||||
Some(value) => value.push(data.to_owned()),
|
||||
None => {
|
||||
prefixes_cache
|
||||
.insert(prefix.clone().into(), vec![data.to_owned()]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?;
|
||||
}
|
||||
|
||||
// We fetch the docids associated to the newly added word prefix fst only.
|
||||
let db = self.word_docids.remap_data_type::<Bytes>();
|
||||
let mut buffer = Vec::new();
|
||||
for prefix in new_prefix_fst_words {
|
||||
let prefix = std::str::from_utf8(prefix.as_bytes())?;
|
||||
for result in db.prefix_iter(self.wtxn, prefix)? {
|
||||
let (_word, data) = result?;
|
||||
buffer.clear();
|
||||
let mut writer = KvWriterDelAdd::new(&mut buffer);
|
||||
writer.insert(DelAdd::Addition, data)?;
|
||||
|
||||
prefix_docids_sorter.insert(prefix, writer.into_inner()?)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We remove all the entries that are no more required in this word prefix docids database.
|
||||
let mut iter = self.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data();
|
||||
while let Some((prefix, _)) = iter.next().transpose()? {
|
||||
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?;
|
||||
|
||||
// We finally write the word prefix docids into the LMDB database.
|
||||
write_sorter_into_database(
|
||||
prefix_docids_sorter,
|
||||
&self.word_prefix_docids,
|
||||
self.wtxn,
|
||||
database_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn write_prefixes_in_sorter(
|
||||
prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
|
||||
sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
for (key, data_slices) in prefixes.drain() {
|
||||
for data in data_slices {
|
||||
if valid_lmdb_key(&key) {
|
||||
sorter.insert(&key, data)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
188
crates/milli/src/update/words_prefix_integer_docids.rs
Normal file
188
crates/milli/src/update/words_prefix_integer_docids.rs
Normal file
|
@ -0,0 +1,188 @@
|
|||
use std::collections::{HashMap, HashSet};
|
||||
use std::str;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Database};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
|
||||
write_sorter_into_database, CursorClonableMmap, MergeFn,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, Result};
|
||||
|
||||
pub struct WordPrefixIntegerDocids<'t, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i>,
|
||||
prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||
word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i>,
|
||||
prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||
word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||
) -> WordPrefixIntegerDocids<'t, 'i> {
|
||||
WordPrefixIntegerDocids {
|
||||
wtxn,
|
||||
prefix_database,
|
||||
word_database,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace",
|
||||
skip_all,
|
||||
target = "indexing::prefix",
|
||||
name = "words_prefix_integer_docids"
|
||||
)]
|
||||
pub fn execute(
|
||||
self,
|
||||
new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
) -> Result<()> {
|
||||
debug!("Computing and writing the word levels integers docids into LMDB on disk...");
|
||||
|
||||
let mut prefix_integer_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
if !common_prefix_fst_words.is_empty() {
|
||||
// We fetch all the new common prefixes between the previous and new prefix fst.
|
||||
let mut buffer = Vec::new();
|
||||
let mut current_prefixes: Option<&&[String]> = None;
|
||||
let mut prefixes_cache = HashMap::new();
|
||||
let mut new_word_integer_docids_iter =
|
||||
new_word_integer_docids.into_stream_merger_iter()?;
|
||||
while let Some((key, data)) = new_word_integer_docids_iter.next()? {
|
||||
let (word, pos) =
|
||||
StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||
|
||||
current_prefixes = match current_prefixes.take() {
|
||||
Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
|
||||
_otherwise => {
|
||||
write_prefixes_in_sorter(
|
||||
&mut prefixes_cache,
|
||||
&mut prefix_integer_docids_sorter,
|
||||
)?;
|
||||
common_prefix_fst_words
|
||||
.iter()
|
||||
.find(|prefixes| word.starts_with(&prefixes[0]))
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(prefixes) = current_prefixes {
|
||||
for prefix in prefixes.iter() {
|
||||
if word.starts_with(prefix) {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(prefix.as_bytes());
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&pos.to_be_bytes());
|
||||
match prefixes_cache.get_mut(&buffer) {
|
||||
Some(value) => value.push(data.to_owned()),
|
||||
None => {
|
||||
prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_integer_docids_sorter)?;
|
||||
}
|
||||
|
||||
// We fetch the docids associated to the newly added word prefix fst only.
|
||||
let db = self.word_database.remap_data_type::<Bytes>();
|
||||
let mut buffer = Vec::new();
|
||||
for prefix_bytes in new_prefix_fst_words {
|
||||
let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| {
|
||||
SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
|
||||
})?;
|
||||
|
||||
// iter over all lines of the DB where the key is prefixed by the current prefix.
|
||||
let iter = db
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in iter {
|
||||
let ((word, pos), data) = result?;
|
||||
if word.starts_with(prefix) {
|
||||
let key = (prefix, pos);
|
||||
let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut writer = KvWriterDelAdd::new(&mut buffer);
|
||||
writer.insert(DelAdd::Addition, data)?;
|
||||
prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We remove all the entries that are no more required in this word prefix integer
|
||||
// docids database.
|
||||
// We also avoid iterating over the whole `word_prefix_integer_docids` database if we know in
|
||||
// advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below
|
||||
// will always be false (i.e. if `del_prefix_fst_words` is empty).
|
||||
if !del_prefix_fst_words.is_empty() {
|
||||
let mut iter = self.prefix_database.iter_mut(self.wtxn)?.lazily_decode_data();
|
||||
while let Some(((prefix, _), _)) = iter.next().transpose()? {
|
||||
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
drop(iter);
|
||||
}
|
||||
|
||||
let database_is_empty = self.prefix_database.is_empty(self.wtxn)?;
|
||||
|
||||
// We finally write all the word prefix integer docids into the LMDB database.
|
||||
write_sorter_into_database(
|
||||
prefix_integer_docids_sorter,
|
||||
&self.prefix_database,
|
||||
self.wtxn,
|
||||
database_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn write_prefixes_in_sorter(
|
||||
prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
|
||||
sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
// TODO: Merge before insertion.
|
||||
for (key, data_slices) in prefixes.drain() {
|
||||
for data in data_slices {
|
||||
if valid_lmdb_key(&key) {
|
||||
sorter.insert(&key, data)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
99
crates/milli/src/update/words_prefixes_fst.rs
Normal file
99
crates/milli/src/update/words_prefixes_fst.rs
Normal file
|
@ -0,0 +1,99 @@
|
|||
use std::iter::{repeat_with, FromIterator};
|
||||
use std::str;
|
||||
|
||||
use fst::{SetBuilder, Streamer};
|
||||
use heed::RwTxn;
|
||||
|
||||
use crate::{Index, Result, SmallString32};
|
||||
|
||||
pub struct WordsPrefixesFst<'t, 'i> {
|
||||
wtxn: &'t mut RwTxn<'i>,
|
||||
index: &'i Index,
|
||||
threshold: u32,
|
||||
max_prefix_length: usize,
|
||||
}
|
||||
|
||||
impl<'t, 'i> WordsPrefixesFst<'t, 'i> {
|
||||
pub fn new(wtxn: &'t mut RwTxn<'i>, index: &'i Index) -> WordsPrefixesFst<'t, 'i> {
|
||||
WordsPrefixesFst { wtxn, index, threshold: 100, max_prefix_length: 4 }
|
||||
}
|
||||
|
||||
/// Set the number of words required to make a prefix be part of the words prefixes
|
||||
/// database. If a word prefix is supposed to match more than this number of words in the
|
||||
/// dictionary, therefore this prefix is added to the words prefixes datastructures.
|
||||
///
|
||||
/// Default value is 100. This value must be higher than 50 and will be clamped
|
||||
/// to this bound otherwise.
|
||||
pub fn threshold(&mut self, value: u32) -> &mut Self {
|
||||
self.threshold = value.max(50);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the maximum length of prefixes in bytes.
|
||||
///
|
||||
/// Default value is `4` bytes. This value must be between 1 and 25 will be clamped
|
||||
/// to these bounds, otherwise.
|
||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||
self.max_prefix_length = value.clamp(1, 25);
|
||||
self
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace",
|
||||
skip_all,
|
||||
target = "indexing::prefix",
|
||||
name = "words_prefix_fst"
|
||||
)]
|
||||
pub fn execute(self) -> Result<()> {
|
||||
let words_fst = self.index.words_fst(self.wtxn)?;
|
||||
|
||||
let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length];
|
||||
let mut current_prefix_count = vec![0; self.max_prefix_length];
|
||||
let mut builders =
|
||||
repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect::<Vec<_>>();
|
||||
|
||||
let mut stream = words_fst.stream();
|
||||
while let Some(bytes) = stream.next() {
|
||||
for n in 0..self.max_prefix_length {
|
||||
let current_prefix = &mut current_prefix[n];
|
||||
let current_prefix_count = &mut current_prefix_count[n];
|
||||
let builder = &mut builders[n];
|
||||
|
||||
// We try to get the first n bytes out of this string but we only want
|
||||
// to split at valid characters bounds. If we try to split in the middle of
|
||||
// a character we ignore this word and go to the next one.
|
||||
let word = str::from_utf8(bytes)?;
|
||||
let prefix = match word.get(..=n) {
|
||||
Some(prefix) => prefix,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// This is the first iteration of the loop,
|
||||
// or the current word doesn't starts with the current prefix.
|
||||
if *current_prefix_count == 0 || prefix != current_prefix.as_str() {
|
||||
*current_prefix = SmallString32::from(prefix);
|
||||
*current_prefix_count = 0;
|
||||
}
|
||||
|
||||
*current_prefix_count += 1;
|
||||
|
||||
// There is enough words corresponding to this prefix to add it to the cache.
|
||||
if *current_prefix_count >= self.threshold {
|
||||
builder.insert(prefix)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We merge all of the previously computed prefixes into on final set.
|
||||
let prefix_fsts: Vec<_> = builders.into_iter().map(|sb| sb.into_set()).collect();
|
||||
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(op.r#union())?;
|
||||
let prefix_fst = builder.into_set();
|
||||
|
||||
// Set the words prefixes FST in the dtabase.
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue