Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-22 21:04:27 +01:00)

Compute chunk size based on the input data size and the number of indexing threads

Parent: 023c2d755f
Commit: be1b054b05
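
The hunks below do not themselves show the chunk-size computation named in the title; as a rough sketch of the idea only (hypothetical helper name and illustrative bounds, not milli's actual code), it amounts to dividing the input size across the indexing threads and clamping the result:

    // Hypothetical sketch: split the input across the indexing threads and
    // clamp the resulting per-thread chunk size to illustrative bounds.
    fn compute_chunk_size(input_data_size: u64, indexing_threads: usize) -> u64 {
        const MIN_CHUNK_SIZE: u64 = 1024 * 1024; // 1 MiB
        const MAX_CHUNK_SIZE: u64 = 512 * 1024 * 1024; // 512 MiB
        let per_thread = input_data_size / indexing_threads.max(1) as u64;
        per_thread.clamp(MIN_CHUNK_SIZE, MAX_CHUNK_SIZE)
    }
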
@@ -1,7 +1,7 @@
 use std::fs::File;
 use std::io::BufReader;

-use grenad::CompressionType;
+use grenad::{CompressionType, Merger};
 use heed::types::Bytes;
 use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn};
 use roaring::RoaringBitmap;
@@ -14,6 +14,7 @@ use crate::heed_codec::facet::{
 use crate::heed_codec::BytesRefCodec;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
+use crate::update::MergeFn;
 use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};

 /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
@@ -28,7 +29,7 @@ pub struct FacetsUpdateBulk<'i> {
 facet_type: FacetType,
 field_ids: Vec<FieldId>,
 // None if level 0 does not need to be updated
-delta_data: Option<grenad::Reader<BufReader<File>>>,
+delta_data: Option<Merger<BufReader<File>, MergeFn>>,
 }

 impl<'i> FacetsUpdateBulk<'i> {
@@ -36,7 +37,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 index: &'i Index,
 field_ids: Vec<FieldId>,
 facet_type: FacetType,
-delta_data: grenad::Reader<BufReader<File>>,
+delta_data: Merger<BufReader<File>, MergeFn>,
 group_size: u8,
 min_level_size: u8,
 ) -> FacetsUpdateBulk<'i> {
@@ -89,7 +90,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
 pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
 pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
-pub delta_data: Option<grenad::Reader<R>>,
+pub delta_data: Option<Merger<R, MergeFn>>,
 pub group_size: u8,
 pub min_level_size: u8,
 }
@@ -129,8 +130,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 if self.db.is_empty(wtxn)? {
 let mut buffer = Vec::new();
 let mut database = self.db.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
-let mut cursor = delta_data.into_cursor()?;
-while let Some((key, value)) = cursor.move_on_next()? {
+let mut iter = delta_data.into_stream_merger_iter()?;
+while let Some((key, value)) = iter.next()? {
 if !valid_lmdb_key(key) {
 continue;
 }
@@ -154,8 +155,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 let mut buffer = Vec::new();
 let database = self.db.remap_types::<Bytes, Bytes>();

-let mut cursor = delta_data.into_cursor()?;
-while let Some((key, value)) = cursor.move_on_next()? {
+let mut iter = delta_data.into_stream_merger_iter()?;
+while let Some((key, value)) = iter.next()? {
 if !valid_lmdb_key(key) {
 continue;
 }

@@ -1,6 +1,7 @@
 use std::fs::File;
 use std::io::BufReader;

+use grenad::Merger;
 use heed::types::{Bytes, DecodeIgnore};
 use heed::{BytesDecode, Error, RoTxn, RwTxn};
 use obkv::KvReader;
@@ -14,6 +15,7 @@ use crate::heed_codec::BytesRefCodec;
 use crate::search::facet::get_highest_level;
 use crate::update::del_add::DelAdd;
 use crate::update::index_documents::valid_lmdb_key;
+use crate::update::MergeFn;
 use crate::{CboRoaringBitmapCodec, Index, Result};

 enum InsertionResult {
@@ -31,14 +33,14 @@ enum DeletionResult {
 /// `facet_id_(string/f64)_docids` databases.
 pub struct FacetsUpdateIncremental {
 inner: FacetsUpdateIncrementalInner,
-delta_data: grenad::Reader<BufReader<File>>,
+delta_data: Merger<BufReader<File>, MergeFn>,
 }

 impl FacetsUpdateIncremental {
 pub fn new(
 index: &Index,
 facet_type: FacetType,
-delta_data: grenad::Reader<BufReader<File>>,
+delta_data: Merger<BufReader<File>, MergeFn>,
 group_size: u8,
 min_level_size: u8,
 max_group_size: u8,
@@ -61,16 +63,18 @@ impl FacetsUpdateIncremental {
 }
 }

+#[logging_timer::time("FacetsUpdateIncremental::{}")]
 pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
-let mut cursor = self.delta_data.into_cursor()?;
-while let Some((key, value)) = cursor.move_on_next()? {
+let mut iter = self.delta_data.into_stream_merger_iter()?;
+while let Some((key, value)) = iter.next()? {
 if !valid_lmdb_key(key) {
 continue;
 }

 let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key)
 .map_err(heed::Error::Encoding)?;
 let value = KvReader::new(value);

 let docids_to_delete = value
 .get(DelAdd::Deletion)
 .map(CboRoaringBitmapCodec::bytes_decode)

@@ -79,12 +79,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
 use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::BufReader;
-use std::iter::FromIterator;

-use charabia::normalizer::{Normalize, NormalizerOption};
-use grenad::{CompressionType, SortAlgorithm};
-use heed::types::{Bytes, DecodeIgnore, SerdeJson};
-use heed::BytesEncode;
+use grenad::Merger;
+use heed::types::{Bytes, DecodeIgnore};
 use time::OffsetDateTime;
 use tracing::debug;

@@ -93,9 +90,9 @@ use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::BytesRefCodec;
-use crate::update::index_documents::create_sorter;
-use crate::update::merge_btreeset_string;
-use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH};
+use crate::update::del_add::{DelAdd, KvReaderDelAdd};
+use crate::update::MergeFn;
+use crate::{try_split_array_at, FieldId, Index, Result};

 pub mod bulk;
 pub mod incremental;
@@ -108,16 +105,20 @@ pub struct FacetsUpdate<'i> {
 index: &'i Index,
 database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
 facet_type: FacetType,
-delta_data: grenad::Reader<BufReader<File>>,
+delta_data: Merger<BufReader<File>, MergeFn>,
+normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
 group_size: u8,
 max_group_size: u8,
 min_level_size: u8,
+data_size: u64,
 }
 impl<'i> FacetsUpdate<'i> {
 pub fn new(
 index: &'i Index,
 facet_type: FacetType,
-delta_data: grenad::Reader<BufReader<File>>,
+delta_data: Merger<BufReader<File>, MergeFn>,
+normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
+data_size: u64,
 ) -> Self {
 let database = match facet_type {
 FacetType::String => {
@@ -135,18 +136,20 @@ impl<'i> FacetsUpdate<'i> {
 min_level_size: FACET_MIN_LEVEL_SIZE,
 facet_type,
 delta_data,
+normalized_delta_data,
+data_size,
 }
 }

 pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
-if self.delta_data.is_empty() {
+if self.data_size == 0 {
 return Ok(());
 }
 debug!("Computing and writing the facet values levels docids into LMDB on disk...");
 self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;

 // See self::comparison_bench::benchmark_facet_indexing
-if self.delta_data.len() >= (self.database.len(wtxn)? / 50) {
+if self.data_size >= (self.database.len(wtxn)? / 50) {
 let field_ids =
 self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
 let bulk_update = FacetsUpdateBulk::new(
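
The bulk-versus-incremental decision in execute() above now compares the precomputed data_size against the database length instead of asking the delta reader for its length. Isolated as a minimal sketch (hypothetical helper name; the 1/50 ratio is the one used above, motivated by the referenced comparison benchmark):

    // Rebuild all facet levels in bulk when the delta is large relative to the
    // database; otherwise patch the existing levels incrementally.
    fn should_use_bulk_update(delta_size: u64, database_len: u64) -> bool {
        delta_size >= database_len / 50
    }
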
@@ -170,92 +173,94 @@ impl<'i> FacetsUpdate<'i> {
 incremental_update.execute(wtxn)?;
 }

-// We clear the list of normalized-for-search facets
-// and the previous FSTs to compute everything from scratch
-self.index.facet_id_normalized_string_strings.clear(wtxn)?;
-self.index.facet_id_string_fst.clear(wtxn)?;
+if let Some(normalized_delta_data) = self.normalized_delta_data {
+let mut iter = normalized_delta_data.into_stream_merger_iter()?;
+while let Some((key_bytes, delta_bytes)) = iter.next()? {
+let deladd_reader = KvReaderDelAdd::new(delta_bytes);

-// As we can't use the same write transaction to read and write in two different databases
-// we must create a temporary sorter that we will write into LMDB afterward.
-// As multiple unnormalized facet values can become the same normalized facet value
-// we must merge them together.
-let mut sorter = create_sorter(
-SortAlgorithm::Unstable,
-merge_btreeset_string,
-CompressionType::None,
-None,
-None,
-None,
-);
+let database_set = self
+.index
+.facet_id_normalized_string_strings
+.remap_key_type::<Bytes>()
+.get(wtxn, &key_bytes)?
+.unwrap_or_default();

-// We iterate on the list of original, semi-normalized, facet values
-// and normalize them for search, inserting them in LMDB in any given order.
-let options = NormalizerOption { lossy: true, ..Default::default() };
-let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
-for result in database.iter(wtxn)? {
-let (facet_group_key, ()) = result?;
-if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
-let mut normalized_facet = left_bound.normalize(&options);
-let normalized_truncated_facet: String;
-if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
-normalized_truncated_facet = normalized_facet
-.char_indices()
-.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
-.map(|(_, c)| c)
-.collect();
-normalized_facet = normalized_truncated_facet.into();
+let add_set = deladd_reader
+.get(DelAdd::Addition)
+.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
+.unwrap_or_default();
+let del_set = match deladd_reader
+.get(DelAdd::Deletion)
+.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
+{
+Some(del_set) => {
+let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap();
+let field_id = FieldId::from_be_bytes(field_id_bytes);
+let mut set = BTreeSet::new();
+for facet in del_set {
+let key =
+FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() };
+// Check if the referenced value doesn't exist anymore before deleting it.
+if self.index.facet_id_string_docids.get(wtxn, &key)?.is_none() {
+set.insert(facet);
+}
+}
+set
+}
+None => BTreeSet::new(),
+};

+let set: BTreeSet<_> =
+database_set.difference(&del_set).chain(add_set.iter()).cloned().collect();

+if set.is_empty() {
+self.index
+.facet_id_normalized_string_strings
+.remap_key_type::<Bytes>()
+.delete(wtxn, key_bytes)?;
+} else {
+self.index
+.facet_id_normalized_string_strings
+.remap_key_type::<Bytes>()
+.put(wtxn, key_bytes, &set)?;
 }
-let set = BTreeSet::from_iter(std::iter::once(left_bound));
-let key = (field_id, normalized_facet.as_ref());
-let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
-let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
-sorter.insert(key, val)?;
+}
+// We clear the FST of normalized-for-search to compute everything from scratch.
+self.index.facet_id_string_fst.clear(wtxn)?;
+// We compute one FST by string facet
+let mut text_fsts = vec![];
+let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
+let database =
+self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
+for result in database.iter(wtxn)? {
+let ((field_id, normalized_facet), _) = result?;
+current_fst = match current_fst.take() {
+Some((fid, fst_builder)) if fid != field_id => {
+let fst = fst_builder.into_set();
+text_fsts.push((fid, fst));
+Some((field_id, fst::SetBuilder::memory()))
+}
+Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
+None => Some((field_id, fst::SetBuilder::memory())),
+};

+if let Some((_, fst_builder)) = current_fst.as_mut() {
+fst_builder.insert(normalized_facet)?;
+}
+}

+if let Some((field_id, fst_builder)) = current_fst {
+let fst = fst_builder.into_set();
+text_fsts.push((field_id, fst));
+}

+// We write those FSTs in LMDB now
+for (field_id, fst) in text_fsts {
+self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
 }
 }

-// In this loop we don't need to take care of merging bitmaps
-// as the grenad sorter already merged them for us.
-let mut merger_iter = sorter.into_stream_merger_iter()?;
-while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
-self.index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>().put(
-wtxn,
-key_bytes,
-btreeset_bytes,
-)?;
-}
-
-// We compute one FST by string facet
-let mut text_fsts = vec![];
-let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
-let database =
-self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
-for result in database.iter(wtxn)? {
-let ((field_id, normalized_facet), _) = result?;
-current_fst = match current_fst.take() {
-Some((fid, fst_builder)) if fid != field_id => {
-let fst = fst_builder.into_set();
-text_fsts.push((fid, fst));
-Some((field_id, fst::SetBuilder::memory()))
-}
-Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
-None => Some((field_id, fst::SetBuilder::memory())),
-};
-
-if let Some((_, fst_builder)) = current_fst.as_mut() {
-fst_builder.insert(normalized_facet)?;
-}
-}
-
-if let Some((field_id, fst_builder)) = current_fst {
-let fst = fst_builder.into_set();
-text_fsts.push((field_id, fst));
-}
-
-// We write those FSTs in LMDB now
-for (field_id, fst) in text_fsts {
-self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
-}
-
 Ok(())
 }
 }
@@ -268,6 +273,7 @@ pub(crate) mod test_helpers {
 use std::marker::PhantomData;
 use std::rc::Rc;

+use grenad::MergerBuilder;
 use heed::types::Bytes;
 use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn};
 use roaring::RoaringBitmap;
@@ -280,7 +286,8 @@ pub(crate) mod test_helpers {
 use crate::search::facet::get_highest_level;
 use crate::snapshot_tests::display_bitmap;
 use crate::update::del_add::{DelAdd, KvWriterDelAdd};
-use crate::update::FacetsUpdateIncrementalInner;
+use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps;
+use crate::update::{FacetsUpdateIncrementalInner, MergeFn};
 use crate::CboRoaringBitmapCodec;

 /// Utility function to generate a string whose position in a lexicographically
@@ -463,10 +470,13 @@ pub(crate) mod test_helpers {
 }
 writer.finish().unwrap();
 let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
+let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+builder.push(reader.into_cursor().unwrap());
+let merger = builder.build();

 let update = FacetsUpdateBulkInner {
 db: self.content,
-delta_data: Some(reader),
+delta_data: Some(merger),
 group_size: self.group_size.get(),
 min_level_size: self.min_level_size.get(),
 };

@@ -26,7 +26,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 obkv_documents: grenad::Reader<R>,
 indexer: GrenadParameters,
 searchable_fields: &Option<HashSet<FieldId>>,
-stop_words: Option<&fst::Set<&[u8]>>,
+stop_words: Option<&fst::Set<Vec<u8>>>,
 allowed_separators: Option<&[&str]>,
 dictionary: Option<&[&str]>,
 max_positions_per_attributes: Option<u32>,
@@ -181,11 +181,11 @@ fn searchable_fields_changed(

 /// Factorize tokenizer building.
 fn tokenizer_builder<'a>(
-stop_words: Option<&'a fst::Set<&[u8]>>,
+stop_words: Option<&'a fst::Set<Vec<u8>>>,
 allowed_separators: Option<&'a [&str]>,
 dictionary: Option<&'a [&str]>,
 script_language: Option<&'a HashMap<Script, Vec<Language>>>,
-) -> TokenizerBuilder<'a, &'a [u8]> {
+) -> TokenizerBuilder<'a, Vec<u8>> {
 let mut tokenizer_builder = TokenizerBuilder::new();
 if let Some(stop_words) = stop_words {
 tokenizer_builder.stop_words(stop_words);
@@ -211,7 +211,7 @@ fn lang_safe_tokens_from_document<'a>(
 obkv: &KvReader<FieldId>,
 searchable_fields: &Option<HashSet<FieldId>>,
 tokenizer: &Tokenizer,
-stop_words: Option<&fst::Set<&[u8]>>,
+stop_words: Option<&fst::Set<Vec<u8>>>,
 allowed_separators: Option<&[&str]>,
 dictionary: Option<&[&str]>,
 max_positions_per_attributes: u32,

@@ -1,15 +1,21 @@
+use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::BufReader;
+use std::iter::FromIterator;
 use std::{io, str};

+use charabia::normalizer::{Normalize, NormalizerOption};
+use heed::types::SerdeJson;
 use heed::BytesEncode;

 use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
-use crate::heed_codec::StrRefCodec;
-use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
-use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
-use crate::{FieldId, Result};
+use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::index_documents::helpers::{
+merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
+};
+use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};

 /// Extracts the facet string and the documents ids where this facet string appear.
 ///
@@ -19,10 +25,11 @@ use crate::{FieldId, Result};
 pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 docid_fid_facet_string: grenad::Reader<R>,
 indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
 puffin::profile_function!();

 let max_memory = indexer.max_memory_by_thread();
+let options = NormalizerOption { lossy: true, ..Default::default() };

 let mut facet_string_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Stable,
@@ -30,12 +37,30 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,
-max_memory,
+max_memory.map(|m| m / 2),
+);
+
+let mut normalized_facet_string_docids_sorter = create_sorter(
+grenad::SortAlgorithm::Stable,
+merge_deladd_btreeset_string,
+indexer.chunk_compression_type,
+indexer.chunk_compression_level,
+indexer.max_nb_chunks,
+max_memory.map(|m| m / 2),
 );

 let mut buffer = Vec::new();
 let mut cursor = docid_fid_facet_string.into_cursor()?;
 while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
+let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
+
+// nothing to do if we delete and re-add the value.
+if deladd_reader.get(DelAdd::Deletion).is_some()
+&& deladd_reader.get(DelAdd::Addition).is_some()
+{
+continue;
+}

 let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
 let field_id = FieldId::from_be_bytes(field_id_bytes);

@@ -44,17 +69,46 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 let document_id = u32::from_be_bytes(document_id_bytes);

 let normalized_value = str::from_utf8(normalized_value_bytes)?;

+// Facet search normalization
+{
+let mut hyper_normalized_value = normalized_value.normalize(&options);
+let normalized_truncated_facet: String;
+if hyper_normalized_value.len() > MAX_FACET_VALUE_LENGTH {
+normalized_truncated_facet = hyper_normalized_value
+.char_indices()
+.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
+.map(|(_, c)| c)
+.collect();
+hyper_normalized_value = normalized_truncated_facet.into();
+}
+let set = BTreeSet::from_iter(std::iter::once(normalized_value));
+
+buffer.clear();
+let mut obkv = KvWriterDelAdd::new(&mut buffer);
+for (deladd_key, _) in deladd_reader.iter() {
+let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
+obkv.insert(deladd_key, val)?;
+}
+obkv.finish()?;
+
+let key = (field_id, hyper_normalized_value.as_ref());
+let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
+normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
+}

 let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
 let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();

 buffer.clear();
 let mut obkv = KvWriterDelAdd::new(&mut buffer);
-for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
+for (deladd_key, _) in deladd_reader.iter() {
 obkv.insert(deladd_key, document_id.to_ne_bytes())?;
 }
 obkv.finish()?;
 facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
 }

-sorter_into_reader(facet_string_docids_sorter, indexer)
+let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
+sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
 }
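
The facet-search normalization block added above truncates the hyper-normalized value near MAX_FACET_VALUE_LENGTH by iterating char_indices and keeping characters whose starting byte offset is below the limit, so the cut never splits a UTF-8 character. A standalone sketch of that truncation (std only; the limit is a plain parameter here rather than milli's constant):

    // Keep only the characters whose starting byte offset is below `max_len`,
    // so the cut never lands inside a multi-byte UTF-8 character.
    fn truncate_facet_value(normalized: &str, max_len: usize) -> String {
        normalized
            .char_indices()
            .take_while(|(idx, _)| *idx < max_len)
            .map(|(_, c)| c)
            .collect()
    }

    fn main() {
        // 'é' starts at byte offset 1 and is kept whole; 'l' at offset 3 is dropped.
        assert_eq!(truncate_facet_value("héllo", 3), "hé");
    }
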
@@ -15,7 +15,6 @@ use std::io::BufReader;

 use crossbeam_channel::Sender;
 use rayon::prelude::*;
-use tracing::debug;

 use self::extract_docid_word_positions::extract_docid_word_positions;
 use self::extract_facet_number_docids::extract_facet_number_docids;
@@ -29,10 +28,7 @@ use self::extract_vector_points::{
 use self::extract_word_docids::extract_word_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
 use self::extract_word_position_docids::extract_word_position_docids;
-use super::helpers::{
-as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
-MergeFn, MergeableReader,
-};
+use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
 use crate::proximity::ProximityPrecision;
 use crate::vector::EmbeddingConfigs;
@@ -52,7 +48,7 @@ pub(crate) fn data_from_obkv_documents(
 primary_key_id: FieldId,
 geo_fields_ids: Option<(FieldId, FieldId)>,
 field_id_map: FieldsIdsMap,
-stop_words: Option<fst::Set<&[u8]>>,
+stop_words: Option<fst::Set<Vec<u8>>>,
 allowed_separators: Option<&[&str]>,
 dictionary: Option<&[&str]>,
 max_positions_per_attributes: Option<u32>,
@@ -62,201 +58,154 @@ pub(crate) fn data_from_obkv_documents(
 ) -> Result<()> {
 puffin::profile_function!();

-original_obkv_chunks
-.par_bridge()
-.map(|original_documents_chunk| {
-send_original_documents_data(
-original_documents_chunk,
-indexer,
-lmdb_writer_sx.clone(),
-field_id_map.clone(),
-embedders.clone(),
-)
-})
-.collect::<Result<()>>()?;
-#[allow(clippy::type_complexity)]
-let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
-flattened_obkv_chunks
-.par_bridge()
-.map(|flattened_obkv_chunks| {
-send_and_extract_flattened_documents_data(
-flattened_obkv_chunks,
-indexer,
-lmdb_writer_sx.clone(),
-&searchable_fields,
-&faceted_fields,
-primary_key_id,
-geo_fields_ids,
-&stop_words,
-&allowed_separators,
-&dictionary,
-max_positions_per_attributes,
-)
-})
-.collect();
-let (
-docid_word_positions_chunks,
-(
-fid_docid_facet_numbers_chunks,
-(
-fid_docid_facet_strings_chunks,
-(
-facet_is_null_docids_chunks,
-(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
-),
-),
-),
-) = result?;
+let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
+|| {
+original_obkv_chunks
+.par_bridge()
+.map(|original_documents_chunk| {
+send_original_documents_data(
+original_documents_chunk,
+indexer,
+lmdb_writer_sx.clone(),
+field_id_map.clone(),
+embedders.clone(),
+)
+})
+.collect::<Result<()>>()
+},
+|| {
+flattened_obkv_chunks
+.par_bridge()
+.map(|flattened_obkv_chunks| {
+send_and_extract_flattened_documents_data(
+flattened_obkv_chunks,
+indexer,
+lmdb_writer_sx.clone(),
+&searchable_fields,
+&faceted_fields,
+primary_key_id,
+geo_fields_ids,
+&stop_words,
+&allowed_separators,
+&dictionary,
+max_positions_per_attributes,
+)
+})
+.map(|result| {
+if let Ok((
+ref docid_word_positions_chunk,
+(ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk),
+)) = result
+{
+run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
+docid_word_positions_chunk.clone(),
+indexer,
+lmdb_writer_sx.clone(),
+extract_fid_word_count_docids,
+TypedChunk::FieldIdWordCountDocids,
+"field-id-wordcount-docids",
+);

-// merge facet_exists_docids and send them as a typed chunk
-{
-let lmdb_writer_sx = lmdb_writer_sx.clone();
-rayon::spawn(move || {
-debug!(database = "facet-id-exists-docids", "merge");
-match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
-Ok(reader) => {
-let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
-}
-Err(e) => {
-let _ = lmdb_writer_sx.send(Err(e));
-}
-}
-});
-}
-// merge facet_is_null_docids and send them as a typed chunk
-{
-let lmdb_writer_sx = lmdb_writer_sx.clone();
-rayon::spawn(move || {
-debug!(database = "facet-id-is-null-docids", "merge");
-match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
-Ok(reader) => {
-let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
-}
-Err(e) => {
-let _ = lmdb_writer_sx.send(Err(e));
-}
-}
-});
-}
-// merge facet_is_empty_docids and send them as a typed chunk
-{
-let lmdb_writer_sx = lmdb_writer_sx.clone();
-rayon::spawn(move || {
-debug!(database = "facet-id-is-empty-docids", "merge");
-match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
-Ok(reader) => {
-let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
-}
-Err(e) => {
-let _ = lmdb_writer_sx.send(Err(e));
-}
-}
-});
-}
-if proximity_precision == ProximityPrecision::ByWord {
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-docid_word_positions_chunks.clone(),
-indexer,
-lmdb_writer_sx.clone(),
-extract_word_pair_proximity_docids,
-merge_deladd_cbo_roaring_bitmaps,
-TypedChunk::WordPairProximityDocids,
-"word-pair-proximity-docids",
-);
-}
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-docid_word_positions_chunks.clone(),
-indexer,
-lmdb_writer_sx.clone(),
-extract_fid_word_count_docids,
-merge_deladd_cbo_roaring_bitmaps,
-TypedChunk::FieldIdWordCountDocids,
-"field-id-wordcount-docids",
-);
-spawn_extraction_task::<
-_,
-_,
-Vec<(
-grenad::Reader<BufReader<File>>,
-grenad::Reader<BufReader<File>>,
-grenad::Reader<BufReader<File>>,
-)>,
->(
-docid_word_positions_chunks.clone(),
-indexer,
-lmdb_writer_sx.clone(),
-move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
-merge_deladd_cbo_roaring_bitmaps,
-|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
-TypedChunk::WordDocids {
-word_docids_reader,
-exact_word_docids_reader,
-word_fid_docids_reader,
-}
+let exact_attributes = exact_attributes.clone();
+run_extraction_task::<
+_,
+_,
+(
+grenad::Reader<BufReader<File>>,
+grenad::Reader<BufReader<File>>,
+grenad::Reader<BufReader<File>>,
+),
+>(
+docid_word_positions_chunk.clone(),
+indexer,
+lmdb_writer_sx.clone(),
+move |doc_word_pos, indexer| {
+extract_word_docids(doc_word_pos, indexer, &exact_attributes)
+},
+|(
+word_docids_reader,
+exact_word_docids_reader,
+word_fid_docids_reader,
+)| {
+TypedChunk::WordDocids {
+word_docids_reader,
+exact_word_docids_reader,
+word_fid_docids_reader,
+}
+},
+"word-docids",
+);
+run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
+docid_word_positions_chunk.clone(),
+indexer,
+lmdb_writer_sx.clone(),
+extract_word_position_docids,
+TypedChunk::WordPositionDocids,
+"word-position-docids",
+);
+run_extraction_task::<
+_,
+_,
+(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
+>(
+fid_docid_facet_strings_chunk.clone(),
+indexer,
+lmdb_writer_sx.clone(),
+extract_facet_string_docids,
+TypedChunk::FieldIdFacetStringDocids,
+"field-id-facet-string-docids",
+);
+run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
+fid_docid_facet_numbers_chunk.clone(),
+indexer,
+lmdb_writer_sx.clone(),
+extract_facet_number_docids,
+TypedChunk::FieldIdFacetNumberDocids,
+"field-id-facet-number-docids",
+);
+if proximity_precision == ProximityPrecision::ByWord {
+run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
+docid_word_positions_chunk.clone(),
+indexer,
+lmdb_writer_sx.clone(),
+extract_word_pair_proximity_docids,
+TypedChunk::WordPairProximityDocids,
+"word-pair-proximity-docids",
+);
+}
+}
+Ok(())
+})
+.collect::<Result<()>>()
 },
-"word-docids",
 );

-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-docid_word_positions_chunks.clone(),
-indexer,
-lmdb_writer_sx.clone(),
-extract_word_position_docids,
-merge_deladd_cbo_roaring_bitmaps,
-TypedChunk::WordPositionDocids,
-"word-position-docids",
-);
+original_pipeline_result.and(flattened_pipeline_result)

-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-fid_docid_facet_strings_chunks,
-indexer,
-lmdb_writer_sx.clone(),
-extract_facet_string_docids,
-merge_deladd_cbo_roaring_bitmaps,
-TypedChunk::FieldIdFacetStringDocids,
-"field-id-facet-string-docids",
-);
-
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-fid_docid_facet_numbers_chunks,
-indexer,
-lmdb_writer_sx,
-extract_facet_number_docids,
-merge_deladd_cbo_roaring_bitmaps,
-TypedChunk::FieldIdFacetNumberDocids,
-"field-id-facet-number-docids",
-);
-
-Ok(())
 }

 /// Spawn a new task to extract data for a specific DB using extract_fn.
 /// Generated grenad chunks are merged using the merge_fn.
 /// The result of merged chunks is serialized as TypedChunk using the serialize_fn
 /// and sent into lmdb_writer_sx.
-fn spawn_extraction_task<FE, FS, M>(
-chunks: Vec<grenad::Reader<CursorClonableMmap>>,
+fn run_extraction_task<FE, FS, M>(
+chunk: grenad::Reader<CursorClonableMmap>,
 indexer: GrenadParameters,
 lmdb_writer_sx: Sender<Result<TypedChunk>>,
 extract_fn: FE,
-merge_fn: MergeFn,
 serialize_fn: FS,
 name: &'static str,
 ) where
-FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M::Output>
+FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
 + Sync
 + Send
 + 'static,
-FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static,
-M: MergeableReader + FromParallelIterator<M::Output> + Send + 'static,
-M::Output: Send,
+FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
+M: Send,
 {
 let current_span = tracing::Span::current();

@@ -264,25 +213,16 @@ fn spawn_extraction_task<FE, FS, M>(
 let child_span =
 tracing::trace_span!(target: "", parent: &current_span, "extract_multiple_chunks");
 let _entered = child_span.enter();
-puffin::profile_scope!("extract_multiple_chunksdexing::details, ", name);
-let chunks: Result<M> =
-chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect();
-let current_span = tracing::Span::current();
+puffin::profile_scope!("extract_multiple_chunks", name);
+match extract_fn(chunk, indexer) {
+Ok(chunk) => {
+let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));

-rayon::spawn(move || match chunks {
-Ok(chunks) => {
-let child_span = tracing::trace_span!(target: "", parent: &current_span, "merge_multiple_chunks");
-let _entered = child_span.enter();
-debug!(database = name, "merge");
-puffin::profile_scope!("merge_multiple_chunks", name);
-let reader = chunks.merge(merge_fn, &indexer);
-let _ = lmdb_writer_sx.send(reader.map(serialize_fn));
 }
 Err(e) => {
 let _ = lmdb_writer_sx.send(Err(e));
 }
-})
-});
+}
+})
 }

 /// Extract chunked data and send it into lmdb_writer_sx sender:
@@ -340,7 +280,7 @@ fn send_original_documents_data(
 });

 // TODO: create a custom internal error
-lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
+drop(lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))));
 Ok(())
 }

@@ -360,22 +300,13 @@ fn send_and_extract_flattened_documents_data(
 faceted_fields: &HashSet<FieldId>,
 primary_key_id: FieldId,
 geo_fields_ids: Option<(FieldId, FieldId)>,
-stop_words: &Option<fst::Set<&[u8]>>,
+stop_words: &Option<fst::Set<Vec<u8>>>,
 allowed_separators: &Option<&[&str]>,
 dictionary: &Option<&[&str]>,
 max_positions_per_attributes: Option<u32>,
 ) -> Result<(
 grenad::Reader<CursorClonableMmap>,
-(
-grenad::Reader<CursorClonableMmap>,
-(
-grenad::Reader<CursorClonableMmap>,
-(
-grenad::Reader<BufReader<File>>,
-(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
-),
-),
-),
+(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
 )> {
 let flattened_documents_chunk =
 flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@@ -446,16 +377,17 @@ fn send_and_extract_flattened_documents_data(
 fid_docid_facet_strings_chunk.clone(),
 )));

-Ok((
-fid_docid_facet_numbers_chunk,
-(
-fid_docid_facet_strings_chunk,
-(
-fid_facet_is_null_docids_chunk,
-(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
-),
-),
-))
+let _ = lmdb_writer_sx
+.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
+let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
+fid_facet_is_empty_docids_chunk,
+)));
+let _ = lmdb_writer_sx
+.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
+Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
 },
 );

@@ -90,90 +90,6 @@ pub unsafe fn as_cloneable_grenad(
 Ok(reader)
 }

-pub trait MergeableReader
-where
-Self: Sized,
-{
-type Output;
-
-fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
-}
-
-impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
-type Output = grenad::Reader<BufReader<File>>;
-
-fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
-let mut merger = MergerBuilder::new(merge_fn);
-self.into_iter().try_for_each(|r| merger.push(r))?;
-merger.finish(params)
-}
-}
-
-impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
-type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
-
-fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
-let mut m1 = MergerBuilder::new(merge_fn);
-let mut m2 = MergerBuilder::new(merge_fn);
-for (r1, r2) in self.into_iter() {
-m1.push(r1)?;
-m2.push(r2)?;
-}
-Ok((m1.finish(params)?, m2.finish(params)?))
-}
-}
-
-impl MergeableReader
-for Vec<(
-grenad::Reader<BufReader<File>>,
-grenad::Reader<BufReader<File>>,
-grenad::Reader<BufReader<File>>,
-)>
-{
-type Output = (
-grenad::Reader<BufReader<File>>,
-grenad::Reader<BufReader<File>>,
-grenad::Reader<BufReader<File>>,
-);
-
-fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
-let mut m1 = MergerBuilder::new(merge_fn);
-let mut m2 = MergerBuilder::new(merge_fn);
-let mut m3 = MergerBuilder::new(merge_fn);
-for (r1, r2, r3) in self.into_iter() {
-m1.push(r1)?;
-m2.push(r2)?;
-m3.push(r3)?;
-}
-Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
-}
-}
-
-struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
-
-impl<R: io::Read + io::Seek> MergerBuilder<R> {
-fn new(merge_fn: MergeFn) -> Self {
-Self(grenad::MergerBuilder::new(merge_fn))
-}
-
-fn push(&mut self, reader: grenad::Reader<R>) -> Result<()> {
-self.0.push(reader.into_cursor()?);
-Ok(())
-}
-
-fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
-let merger = self.0.build();
-let mut writer = create_writer(
-params.chunk_compression_type,
-params.chunk_compression_level,
-tempfile::tempfile()?,
-);
-merger.write_into_stream_writer(&mut writer)?;
-
-writer_into_reader(writer)
-}
-}
-
 #[derive(Debug, Clone, Copy)]
 pub struct GrenadParameters {
 pub chunk_compression_type: CompressionType,

@@ -35,27 +35,6 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
 }
 }

-pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
-if values.len() == 1 {
-Ok(values[0].clone())
-} else {
-// TODO improve the perf by using a `#[borrow] Cow<str>`.
-let strings: BTreeSet<String> = values
-.iter()
-.map(AsRef::as_ref)
-.map(serde_json::from_slice::<BTreeSet<String>>)
-.map(StdResult::unwrap)
-.reduce(|mut current, new| {
-for x in new {
-current.insert(x);
-}
-current
-})
-.unwrap();
-Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap()))
-}
-}
-
 pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
 Ok(values[0].clone())
 }
@@ -243,3 +222,61 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
 buffer,
 )?)
 }
+
+pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+if values.len() == 1 {
+Ok(values[0].clone())
+} else {
+// TODO improve the perf by using a `#[borrow] Cow<str>`.
+let strings: BTreeSet<String> = values
+.iter()
+.map(AsRef::as_ref)
+.map(serde_json::from_slice::<BTreeSet<String>>)
+.map(StdResult::unwrap)
+.reduce(|mut current, new| {
+for x in new {
+current.insert(x);
+}
+current
+})
+.unwrap();
+Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap()))
+}
+}
+
+/// Do a union of BtreeSet on both sides of a DelAdd obkv
+/// separately and outputs a new DelAdd with both unions.
+pub fn merge_deladd_btreeset_string<'a>(
+_key: &[u8],
+values: &[Cow<'a, [u8]>],
+) -> Result<Cow<'a, [u8]>> {
+if values.len() == 1 {
+Ok(values[0].clone())
+} else {
+// Retrieve the bitmaps from both sides
+let mut del_set = BTreeSet::new();
+let mut add_set = BTreeSet::new();
+for value in values {
+let obkv = KvReaderDelAdd::new(value);
+if let Some(bytes) = obkv.get(DelAdd::Deletion) {
+let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
+for value in set {
+del_set.insert(value);
+}
+}
+if let Some(bytes) = obkv.get(DelAdd::Addition) {
+let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
+for value in set {
+add_set.insert(value);
+}
+}
+}
+
+let mut output_deladd_obkv = KvWriterDelAdd::memory();
+let del = serde_json::to_vec(&del_set).unwrap();
+output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
+let add = serde_json::to_vec(&add_set).unwrap();
+output_deladd_obkv.insert(DelAdd::Addition, &add)?;
+output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
+}
+}
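
merge_deladd_btreeset_string above unions the deletion sides of all merged entries together and the addition sides together, each independently of the other. The same semantics without the obkv/serde plumbing, as a small std-only illustration:

    use std::collections::BTreeSet;

    // Each merged entry carries a (deletions, additions) pair; the result is the
    // union of all deletion sets and the union of all addition sets.
    fn union_sides(
        values: &[(BTreeSet<String>, BTreeSet<String>)],
    ) -> (BTreeSet<String>, BTreeSet<String>) {
        let mut del = BTreeSet::new();
        let mut add = BTreeSet::new();
        for (d, a) in values {
            del.extend(d.iter().cloned());
            add.extend(a.iter().cloned());
        }
        (del, add)
    }
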
|
@ -10,13 +10,13 @@ use fst::{IntoStreamer, Streamer};
|
|||||||
pub use grenad_helpers::{
|
pub use grenad_helpers::{
|
||||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
||||||
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
|
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
|
||||||
GrenadParameters, MergeableReader,
|
GrenadParameters,
|
||||||
};
|
};
|
||||||
pub use merge_functions::{
|
pub use merge_functions::{
|
||||||
keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||||
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||||
merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
|
||||||
obkvs_merge_additions_and_deletions, MergeFn,
|
obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions, MergeFn,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::MAX_WORD_LENGTH;
|
use crate::MAX_WORD_LENGTH;
|
||||||
|
@@ -5,20 +5,21 @@ mod transform;
 mod typed_chunk;

 use std::collections::{HashMap, HashSet};
-use std::io::{Cursor, Read, Seek};
+use std::io::{Read, Seek};
 use std::iter::FromIterator;
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;

 use crossbeam_channel::{Receiver, Sender};
+use grenad::{Merger, MergerBuilder};
 use heed::types::Str;
 use heed::Database;
 use rand::SeedableRng;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use slice_group_by::GroupBy;
-use tracing::debug_span;
-use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
+use tracing::debug;
+use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};

 use self::enrich::enrich_documents_batch;
 pub use self::enrich::{extract_finite_float_from_value, DocumentId};
@@ -26,8 +27,7 @@ pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
     fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
     merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
-    merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
-    ClonableMmap, MergeFn,
+    merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
@@ -95,8 +95,8 @@ pub struct IndexDocumentsConfig {
 impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
 where
-    FP: Fn(UpdateIndexingStep) + Sync,
-    FA: Fn() -> bool + Sync,
+    FP: Fn(UpdateIndexingStep) + Sync + Send,
+    FA: Fn() -> bool + Sync + Send,
 {
     pub fn new(
         wtxn: &'t mut heed::RwTxn<'i>,
@@ -326,9 +326,6 @@ where
             }
         };

-        let original_documents = grenad::Reader::new(original_documents)?;
-        let flattened_documents = grenad::Reader::new(flattened_documents)?;
-
         // create LMDB writer channel
         let (lmdb_writer_sx, lmdb_writer_rx): (
             Sender<Result<TypedChunk>>,
@@ -367,11 +364,7 @@ where
         let stop_words = self.index.stop_words(self.wtxn)?;
         let separators = self.index.allowed_separators(self.wtxn)?;
-        let separators: Option<Vec<_>> =
-            separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
         let dictionary = self.index.dictionary(self.wtxn)?;
-        let dictionary: Option<Vec<_>> =
-            dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
         let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();

@@ -381,141 +374,202 @@ where
             max_memory: self.indexer_config.max_memory,
             max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
         };
-        let documents_chunk_size =
-            self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB
+        let documents_chunk_size = match self.indexer_config.documents_chunk_size {
+            Some(chunk_size) => chunk_size,
+            None => {
+                let default_chunk_size = 1024 * 1024 * 4; // 4MiB
+                let min_chunk_size = 1024 * 512; // 512KiB
+
+                // compute the chunk size from the number of available threads and the input data size.
+                let total_size = flattened_documents.metadata().map(|m| m.len());
+                let current_num_threads = pool.current_num_threads();
+                // if we have more than 2 threads, create a number of chunks equal to 3/4 of the thread count
+                let chunk_count = if current_num_threads > 2 {
+                    (current_num_threads * 3 / 4).max(2)
+                } else {
+                    current_num_threads
+                };
+                total_size
+                    .map_or(default_chunk_size, |size| (size as usize) / chunk_count)
+                    .max(min_chunk_size)
+            }
+        };
+
+        let original_documents = grenad::Reader::new(original_documents)?;
+        let flattened_documents = grenad::Reader::new(flattened_documents)?;
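Not part of the commit: a self-contained sketch of the chunk-size rule introduced above, with illustrative names. It mirrors the heuristic: split the input size across roughly 3/4 of the available threads, fall back to 4 MiB when the size is unknown, and never go below 512 KiB.

// Illustrative sketch only: the chunk-size heuristic from the hunk above.
fn compute_chunk_size(total_size: Option<u64>, num_threads: usize) -> usize {
    let default_chunk_size = 1024 * 1024 * 4; // 4MiB
    let min_chunk_size = 1024 * 512; // 512KiB

    // With more than 2 threads, aim for a number of chunks equal to 3/4 of the thread count.
    let chunk_count = if num_threads > 2 { (num_threads * 3 / 4).max(2) } else { num_threads };

    total_size
        .map_or(default_chunk_size, |size| size as usize / chunk_count)
        .max(min_chunk_size)
}

fn main() {
    // 64 MiB of documents on a 16-thread pool -> 12 chunks.
    assert_eq!(compute_chunk_size(Some(64 * 1024 * 1024), 16), 64 * 1024 * 1024 / 12);
    // Unknown input size falls back to the 4 MiB default.
    assert_eq!(compute_chunk_size(None, 16), 4 * 1024 * 1024);
    // Tiny inputs are floored at 512 KiB.
    assert_eq!(compute_chunk_size(Some(1024), 8), 512 * 1024);
}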
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
|
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
|
||||||
|
|
||||||
let cloned_embedder = self.embedders.clone();
|
let cloned_embedder = self.embedders.clone();
|
||||||
|
|
||||||
|
let mut final_documents_ids = RoaringBitmap::new();
|
||||||
|
let mut databases_seen = 0;
|
||||||
|
let mut word_position_docids = None;
|
||||||
|
let mut word_fid_docids = None;
|
||||||
|
let mut word_docids = None;
|
||||||
|
let mut exact_word_docids = None;
|
||||||
|
let mut chunk_accumulator = ChunkAccumulator::default();
|
||||||
|
let mut dimension = HashMap::new();
|
||||||
|
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
|
||||||
|
|
||||||
let current_span = tracing::Span::current();
|
let current_span = tracing::Span::current();
|
||||||
|
|
||||||
// Run extraction pipeline in parallel.
|
// Run extraction pipeline in parallel.
|
||||||
pool.install(|| {
|
pool.install(|| {
|
||||||
let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks");
|
rayon::spawn(move || {
|
||||||
|
let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks");
|
||||||
let _enter = child_span.enter();
|
let _enter = child_span.enter();
|
||||||
puffin::profile_scope!("extract_and_send_grenad_chunks");
|
puffin::profile_scope!("extract_and_send_grenad_chunks");
|
||||||
// split obkv file into several chunks
|
// split obkv file into several chunks
|
||||||
let original_chunk_iter =
|
let original_chunk_iter =
|
||||||
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
||||||
|
|
||||||
// split obkv file into several chunks
|
// split obkv file into several chunks
|
||||||
let flattened_chunk_iter =
|
let flattened_chunk_iter =
|
||||||
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
|
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
|
||||||
|
|
||||||
let result = original_chunk_iter.and_then(|original_chunk| {
|
let separators: Option<Vec<_>> =
|
||||||
let flattened_chunk = flattened_chunk_iter?;
|
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||||
// extract all databases from the chunked obkv douments
|
let dictionary: Option<Vec<_>> =
|
||||||
extract::data_from_obkv_documents(
|
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||||
original_chunk,
|
let result = original_chunk_iter.and_then(|original_chunk| {
|
||||||
flattened_chunk,
|
let flattened_chunk = flattened_chunk_iter?;
|
||||||
pool_params,
|
// extract all databases from the chunked obkv douments
|
||||||
lmdb_writer_sx.clone(),
|
extract::data_from_obkv_documents(
|
||||||
searchable_fields,
|
original_chunk,
|
||||||
faceted_fields,
|
flattened_chunk,
|
||||||
primary_key_id,
|
pool_params,
|
||||||
geo_fields_ids,
|
lmdb_writer_sx.clone(),
|
||||||
field_id_map,
|
searchable_fields,
|
||||||
stop_words,
|
faceted_fields,
|
||||||
separators.as_deref(),
|
primary_key_id,
|
||||||
dictionary.as_deref(),
|
geo_fields_ids,
|
||||||
max_positions_per_attributes,
|
field_id_map,
|
||||||
exact_attributes,
|
stop_words,
|
||||||
proximity_precision,
|
separators.as_deref(),
|
||||||
cloned_embedder,
|
dictionary.as_deref(),
|
||||||
)
|
max_positions_per_attributes,
|
||||||
|
exact_attributes,
|
||||||
|
proximity_precision,
|
||||||
|
cloned_embedder,
|
||||||
|
)
|
||||||
|
});
|
||||||
|
|
||||||
|
if let Err(e) = result {
|
||||||
|
let _ = lmdb_writer_sx.send(Err(e));
|
||||||
|
}
|
||||||
|
|
||||||
|
// needs to be dropped to avoid channel waiting lock.
|
||||||
|
drop(lmdb_writer_sx);
|
||||||
});
|
});
|
||||||
|
|
||||||
if let Err(e) = result {
|
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||||
let _ = lmdb_writer_sx.send(Err(e));
|
databases_seen,
|
||||||
}
|
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||||
|
});
|
||||||
|
|
||||||
// needs to be dropped to avoid channel waiting lock.
|
loop {
|
||||||
drop(lmdb_writer_sx);
|
if (self.should_abort)() {
|
||||||
});
|
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||||
|
}
|
||||||
|
|
||||||
let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
|
match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) {
|
||||||
let mut final_documents_ids = RoaringBitmap::new();
|
Err(status) => {
|
||||||
|
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
|
||||||
|
let (docids, is_merged_database) =
|
||||||
|
write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?;
|
||||||
|
if !docids.is_empty() {
|
||||||
|
final_documents_ids |= docids;
|
||||||
|
let documents_seen_count = final_documents_ids.len();
|
||||||
|
(self.progress)(UpdateIndexingStep::IndexDocuments {
|
||||||
|
documents_seen: documents_seen_count as usize,
|
||||||
|
total_documents: documents_count,
|
||||||
|
});
|
||||||
|
debug!(documents = documents_seen_count, total = documents_count, "Seen");
|
||||||
|
}
|
||||||
|
if is_merged_database {
|
||||||
|
databases_seen += 1;
|
||||||
|
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||||
|
databases_seen,
|
||||||
|
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// If no more chunk remains in the chunk accumulator and the channel is disconected, break.
|
||||||
|
} else if status == crossbeam_channel::RecvTimeoutError::Disconnected {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(result) => {
|
||||||
|
let typed_chunk = match result? {
|
||||||
|
TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
} => {
|
||||||
|
let cloneable_chunk =
|
||||||
|
unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||||
|
let word_docids = word_docids.get_or_insert_with(|| {
|
||||||
|
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||||
|
});
|
||||||
|
word_docids.push(cloneable_chunk.into_cursor()?);
|
||||||
|
let cloneable_chunk =
|
||||||
|
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||||
|
let exact_word_docids =
|
||||||
|
exact_word_docids.get_or_insert_with(|| {
|
||||||
|
MergerBuilder::new(
|
||||||
|
merge_deladd_cbo_roaring_bitmaps as MergeFn,
|
||||||
|
)
|
||||||
|
});
|
||||||
|
exact_word_docids.push(cloneable_chunk.into_cursor()?);
|
||||||
|
let cloneable_chunk =
|
||||||
|
unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||||
|
let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
|
||||||
|
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||||
|
});
|
||||||
|
word_fid_docids.push(cloneable_chunk.into_cursor()?);
|
||||||
|
TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
TypedChunk::WordPositionDocids(chunk) => {
|
||||||
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||||
|
let word_position_docids =
|
||||||
|
word_position_docids.get_or_insert_with(|| {
|
||||||
|
MergerBuilder::new(
|
||||||
|
merge_deladd_cbo_roaring_bitmaps as MergeFn,
|
||||||
|
)
|
||||||
|
});
|
||||||
|
word_position_docids.push(cloneable_chunk.into_cursor()?);
|
||||||
|
TypedChunk::WordPositionDocids(chunk)
|
||||||
|
}
|
||||||
|
TypedChunk::VectorPoints {
|
||||||
|
expected_dimension,
|
||||||
|
remove_vectors,
|
||||||
|
embeddings,
|
||||||
|
manual_vectors,
|
||||||
|
embedder_name,
|
||||||
|
} => {
|
||||||
|
dimension.insert(embedder_name.clone(), expected_dimension);
|
||||||
|
TypedChunk::VectorPoints {
|
||||||
|
remove_vectors,
|
||||||
|
embeddings,
|
||||||
|
expected_dimension,
|
||||||
|
manual_vectors,
|
||||||
|
embedder_name,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
otherwise => otherwise,
|
||||||
|
};
|
||||||
|
|
||||||
let mut databases_seen = 0;
|
chunk_accumulator.insert(typed_chunk);
|
||||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
|
||||||
databases_seen,
|
|
||||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
|
||||||
});
|
|
||||||
|
|
||||||
let mut word_position_docids = None;
|
|
||||||
let mut word_fid_docids = None;
|
|
||||||
let mut word_docids = None;
|
|
||||||
let mut exact_word_docids = None;
|
|
||||||
|
|
||||||
let mut dimension = HashMap::new();
|
|
||||||
|
|
||||||
for result in lmdb_writer_rx {
|
|
||||||
if (self.should_abort)() {
|
|
||||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
|
||||||
}
|
|
||||||
|
|
||||||
let typed_chunk = match result? {
|
|
||||||
TypedChunk::WordDocids {
|
|
||||||
word_docids_reader,
|
|
||||||
exact_word_docids_reader,
|
|
||||||
word_fid_docids_reader,
|
|
||||||
} => {
|
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
|
||||||
word_docids = Some(cloneable_chunk);
|
|
||||||
let cloneable_chunk =
|
|
||||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
|
||||||
exact_word_docids = Some(cloneable_chunk);
|
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
|
||||||
word_fid_docids = Some(cloneable_chunk);
|
|
||||||
TypedChunk::WordDocids {
|
|
||||||
word_docids_reader,
|
|
||||||
exact_word_docids_reader,
|
|
||||||
word_fid_docids_reader,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TypedChunk::WordPositionDocids(chunk) => {
|
}
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
|
||||||
word_position_docids = Some(cloneable_chunk);
|
|
||||||
TypedChunk::WordPositionDocids(chunk)
|
|
||||||
}
|
|
||||||
TypedChunk::VectorPoints {
|
|
||||||
expected_dimension,
|
|
||||||
remove_vectors,
|
|
||||||
embeddings,
|
|
||||||
manual_vectors,
|
|
||||||
embedder_name,
|
|
||||||
} => {
|
|
||||||
dimension.insert(embedder_name.clone(), expected_dimension);
|
|
||||||
TypedChunk::VectorPoints {
|
|
||||||
remove_vectors,
|
|
||||||
embeddings,
|
|
||||||
expected_dimension,
|
|
||||||
manual_vectors,
|
|
||||||
embedder_name,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
otherwise => otherwise,
|
|
||||||
};
|
|
||||||
|
|
||||||
let (docids, is_merged_database) =
|
Ok(())
|
||||||
write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
|
})?;
|
||||||
if !docids.is_empty() {
|
|
||||||
final_documents_ids |= docids;
|
|
||||||
let documents_seen_count = final_documents_ids.len();
|
|
||||||
(self.progress)(UpdateIndexingStep::IndexDocuments {
|
|
||||||
documents_seen: documents_seen_count as usize,
|
|
||||||
total_documents: documents_count,
|
|
||||||
});
|
|
||||||
debug_span!("Seen", documents = documents_seen_count, total = documents_count);
|
|
||||||
}
|
|
||||||
if is_merged_database {
|
|
||||||
databases_seen += 1;
|
|
||||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
|
||||||
databases_seen,
|
|
||||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// We write the field distribution into the main database
|
// We write the field distribution into the main database
|
||||||
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
|
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
|
||||||
@@ -548,10 +602,10 @@ where
         }

         self.execute_prefix_databases(
-            word_docids,
-            exact_word_docids,
-            word_position_docids,
-            word_fid_docids,
+            word_docids.map(MergerBuilder::build),
+            exact_word_docids.map(MergerBuilder::build),
+            word_position_docids.map(MergerBuilder::build),
+            word_fid_docids.map(MergerBuilder::build),
         )?;

         Ok(number_of_documents)
@@ -565,10 +619,10 @@ where
     )]
     pub fn execute_prefix_databases(
         self,
-        word_docids: Option<grenad::Reader<CursorClonableMmap>>,
-        exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
-        word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
-        word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
+        word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
+        exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
+        word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
+        word_fid_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
     ) -> Result<()>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
@@ -751,7 +805,7 @@ where
 )]
 fn execute_word_prefix_docids(
     txn: &mut heed::RwTxn,
-    reader: grenad::Reader<Cursor<ClonableMmap>>,
+    merger: Merger<CursorClonableMmap, MergeFn>,
     word_docids_db: Database<Str, CboRoaringBitmapCodec>,
     word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
     indexer_config: &IndexerConfig,
@@ -761,13 +815,12 @@ fn execute_word_prefix_docids(
 ) -> Result<()> {
     puffin::profile_function!();

-    let cursor = reader.into_cursor()?;
     let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
     builder.chunk_compression_type = indexer_config.chunk_compression_type;
     builder.chunk_compression_level = indexer_config.chunk_compression_level;
     builder.max_nb_chunks = indexer_config.max_nb_chunks;
     builder.max_memory = indexer_config.max_memory;
-    builder.execute(cursor, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
+    builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
     Ok(())
 }
@@ -5,27 +5,60 @@ use std::io::{self, BufReader};

 use bytemuck::allocation::pod_collect_to_vec;
 use charabia::{Language, Script};
-use grenad::MergerBuilder;
+use grenad::{Merger, MergerBuilder};
 use heed::types::Bytes;
-use heed::{PutFlags, RwTxn};
+use heed::RwTxn;
 use obkv::{KvReader, KvWriter};
 use roaring::RoaringBitmap;

 use super::helpers::{
-    self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
-    valid_lmdb_key, CursorClonableMmap,
+    self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key,
+    CursorClonableMmap,
 };
-use super::{ClonableMmap, MergeFn};
+use super::MergeFn;
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
-use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
+use crate::update::index_documents::helpers::{
+    as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
+};
 use crate::{
     lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError,
 };

+/// This struct accumulates and groups the TypedChunks
+/// and is able to give the biggest accumulated group to index them all together
+/// with a merger.
+#[derive(Default)]
+pub(crate) struct ChunkAccumulator {
+    inner: Vec<Vec<TypedChunk>>,
+}
+
+impl ChunkAccumulator {
+    pub fn pop_longest(&mut self) -> Option<Vec<TypedChunk>> {
+        match self.inner.iter().max_by_key(|v| v.len()) {
+            Some(left) => {
+                let position = self.inner.iter().position(|right| left == right);
+                position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty())
+            }
+            None => None,
+        }
+    }
+
+    pub fn insert(&mut self, chunk: TypedChunk) {
+        match self.inner.iter().position(|right| Some(&chunk) == right.first()) {
+            Some(position) => {
+                let v = self.inner.get_mut(position).unwrap();
+                v.push(chunk);
+            }
+            None => self.inner.push(vec![chunk]),
+        }
+    }
+}
+
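Not part of the commit: a simplified, self-contained sketch of the grouping behaviour ChunkAccumulator provides, using string labels in place of TypedChunk (whose PartialEq impl further below compares variants only). The Accumulator type here is hypothetical.

// Illustrative sketch only: group items by equality and pop the largest group,
// mirroring ChunkAccumulator::insert / pop_longest.
struct Accumulator<T: PartialEq> {
    inner: Vec<Vec<T>>,
}

impl<T: PartialEq> Accumulator<T> {
    fn insert(&mut self, item: T) {
        match self.inner.iter().position(|group| group.first() == Some(&item)) {
            Some(position) => self.inner[position].push(item),
            None => self.inner.push(vec![item]),
        }
    }

    fn pop_longest(&mut self) -> Option<Vec<T>> {
        let position = (0..self.inner.len()).max_by_key(|&i| self.inner[i].len())?;
        Some(self.inner.remove(position))
    }
}

fn main() {
    let mut acc = Accumulator { inner: Vec::new() };
    for chunk in ["word_docids", "word_docids", "geo_points", "word_docids"] {
        acc.insert(chunk);
    }
    // The largest group (the three "word_docids" chunks) comes out first.
    assert_eq!(acc.pop_longest(), Some(vec!["word_docids"; 3]));
    assert_eq!(acc.pop_longest(), Some(vec!["geo_points"]));
    assert_eq!(acc.pop_longest(), None);
}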
 pub(crate) enum TypedChunk {
     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
     FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
@@ -38,7 +71,7 @@ pub(crate) enum TypedChunk {
     },
     WordPositionDocids(grenad::Reader<BufReader<File>>),
     WordPairProximityDocids(grenad::Reader<BufReader<File>>),
-    FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
+    FieldIdFacetStringDocids((grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)),
     FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
     FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
     FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
@@ -54,6 +87,34 @@ pub(crate) enum TypedChunk {
     ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
 }

+impl PartialEq for TypedChunk {
+    fn eq(&self, other: &Self) -> bool {
+        use TypedChunk::*;
+        match (self, other) {
+            (FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_))
+            | (FieldIdDocidFacetNumbers(_), FieldIdDocidFacetNumbers(_))
+            | (Documents(_), Documents(_))
+            | (FieldIdWordCountDocids(_), FieldIdWordCountDocids(_))
+            | (WordDocids { .. }, WordDocids { .. })
+            | (WordPositionDocids(_), WordPositionDocids(_))
+            | (WordPairProximityDocids(_), WordPairProximityDocids(_))
+            | (FieldIdFacetStringDocids(_), FieldIdFacetStringDocids(_))
+            | (FieldIdFacetNumberDocids(_), FieldIdFacetNumberDocids(_))
+            | (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
+            | (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
+            | (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
+            | (GeoPoints(_), GeoPoints(_))
+            | (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true,
+            (
+                VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
+                VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
+            ) => left == right && left_dim == right_dim,
+            _ => false,
+        }
+    }
+}
+impl Eq for TypedChunk {}
+
 impl TypedChunk {
     pub fn to_debug_string(&self) -> String {
         match self {
@@ -85,7 +146,7 @@ impl TypedChunk {
             TypedChunk::WordPairProximityDocids(grenad) => {
                 format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
             }
-            TypedChunk::FieldIdFacetStringDocids(grenad) => {
+            TypedChunk::FieldIdFacetStringDocids((grenad, _)) => {
                 format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len())
             }
             TypedChunk::FieldIdFacetNumberDocids(grenad) => {
|
|||||||
/// Return new documents seen.
|
/// Return new documents seen.
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||||
pub(crate) fn write_typed_chunk_into_index(
|
pub(crate) fn write_typed_chunk_into_index(
|
||||||
typed_chunk: TypedChunk,
|
typed_chunks: Vec<TypedChunk>,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index_is_empty: bool,
|
|
||||||
) -> Result<(RoaringBitmap, bool)> {
|
) -> Result<(RoaringBitmap, bool)> {
|
||||||
puffin::profile_function!(typed_chunk.to_debug_string());
|
puffin::profile_function!(typed_chunks[0].to_debug_string());
|
||||||
|
|
||||||
let mut is_merged_database = false;
|
let mut is_merged_database = false;
|
||||||
match typed_chunk {
|
match typed_chunks[0] {
|
||||||
TypedChunk::Documents(obkv_documents_iter) => {
|
TypedChunk::Documents(_) => {
|
||||||
let span = tracing::trace_span!(target: "indexing::write_db", "documents");
|
let span = tracing::trace_span!(target: "indexing::write_db", "documents");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
|
||||||
|
let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::Documents(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
let mut operations: Vec<DocumentOperation> = Default::default();
|
let mut operations: Vec<DocumentOperation> = Default::default();
|
||||||
|
|
||||||
let mut docids = index.documents_ids(wtxn)?;
|
let mut docids = index.documents_ids(wtxn)?;
|
||||||
let mut cursor = obkv_documents_iter.into_cursor()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
while let Some((key, reader)) = cursor.move_on_next()? {
|
while let Some((key, reader)) = iter.next()? {
|
||||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||||
let reader: KvReader<FieldId> = KvReader::new(reader);
|
let reader: KvReader<FieldId> = KvReader::new(reader);
|
||||||
|
|
||||||
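Not part of the commit: the document chunks above are merged with keep_latest_obkv, so when the same document key shows up in several chunks, the value from the most recently pushed chunk wins. A plain-Rust sketch of that last-writer-wins merge, with hypothetical helper names.

// Illustrative sketch only: last-writer-wins merge over several chunks of
// (key, value) pairs, the behaviour keep_latest_obkv gives the merger above.
use std::collections::BTreeMap;

fn merge_keep_latest(chunks: &[Vec<(&str, &str)>]) -> BTreeMap<String, String> {
    let mut merged = BTreeMap::new();
    for chunk in chunks {
        for (key, value) in chunk {
            // Later chunks overwrite earlier ones for the same key.
            merged.insert(key.to_string(), value.to_string());
        }
    }
    merged
}

fn main() {
    let first = vec![("doc-1", "v1"), ("doc-2", "v1")];
    let second = vec![("doc-1", "v2")];
    let merged = merge_keep_latest(&[first, second]);
    assert_eq!(merged["doc-1"], "v2"); // latest version kept
    assert_eq!(merged["doc-2"], "v1");
}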
@ -174,59 +244,91 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
external_documents_docids.apply(wtxn, operations)?;
|
external_documents_docids.apply(wtxn, operations)?;
|
||||||
index.put_documents_ids(wtxn, &docids)?;
|
index.put_documents_ids(wtxn, &docids)?;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
|
TypedChunk::FieldIdWordCountDocids(_) => {
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
|
tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
append_entries_into_database(
|
|
||||||
fid_word_count_docids_iter,
|
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
|
write_entries_into_database(
|
||||||
|
merger,
|
||||||
&index.field_id_word_count_docids,
|
&index.field_id_word_count_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::WordDocids {
|
TypedChunk::WordDocids { .. } => {
|
||||||
word_docids_reader,
|
|
||||||
exact_word_docids_reader,
|
|
||||||
word_fid_docids_reader,
|
|
||||||
} => {
|
|
||||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
|
let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
|
||||||
append_entries_into_database(
|
let mut word_docids_builder =
|
||||||
word_docids_iter.clone(),
|
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
let mut exact_word_docids_builder =
|
||||||
|
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
let mut word_fid_docids_builder =
|
||||||
|
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
} = typed_chunk
|
||||||
|
else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||||
|
let clonable_exact_word_docids =
|
||||||
|
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||||
|
|
||||||
|
word_docids_builder.push(word_docids_reader.into_cursor()?);
|
||||||
|
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
|
||||||
|
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
|
||||||
|
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
|
||||||
|
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
|
||||||
|
}
|
||||||
|
|
||||||
|
let word_docids_merger = word_docids_builder.build();
|
||||||
|
write_entries_into_database(
|
||||||
|
word_docids_merger,
|
||||||
&index.word_docids,
|
&index.word_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
let exact_word_docids_merger = exact_word_docids_builder.build();
|
||||||
append_entries_into_database(
|
write_entries_into_database(
|
||||||
exact_word_docids_iter.clone(),
|
exact_word_docids_merger,
|
||||||
&index.exact_word_docids,
|
&index.exact_word_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
|
let word_fid_docids_merger = word_fid_docids_builder.build();
|
||||||
append_entries_into_database(
|
write_entries_into_database(
|
||||||
word_fid_docids_iter,
|
word_fid_docids_merger,
|
||||||
&index.word_fid_docids,
|
&index.word_fid_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// create fst from word docids
|
// create fst from word docids
|
||||||
let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
|
let fst_merger = fst_merger_builder.build();
|
||||||
|
let fst = merge_word_docids_reader_into_fst(fst_merger)?;
|
||||||
let db_fst = index.words_fst(wtxn)?;
|
let db_fst = index.words_fst(wtxn)?;
|
||||||
|
|
||||||
// merge new fst with database fst
|
// merge new fst with database fst
|
||||||
@ -237,98 +339,202 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
index.put_words_fst(wtxn, &fst)?;
|
index.put_words_fst(wtxn, &fst)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::WordPositionDocids(word_position_docids_iter) => {
|
TypedChunk::WordPositionDocids(_) => {
|
||||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
|
let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
append_entries_into_database(
|
|
||||||
word_position_docids_iter,
|
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
|
write_entries_into_database(
|
||||||
|
merger,
|
||||||
&index.word_position_docids,
|
&index.word_position_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
TypedChunk::FieldIdFacetNumberDocids(_) => {
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
|
tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
|
||||||
|
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
let mut data_size = 0;
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
|
||||||
|
else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
data_size += facet_id_number_docids.len();
|
||||||
|
builder.push(facet_id_number_docids.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
|
let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size);
|
||||||
indexer.execute(wtxn)?;
|
indexer.execute(wtxn)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => {
|
TypedChunk::FieldIdFacetStringDocids(_) => {
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
|
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter);
|
|
||||||
|
let mut facet_id_string_builder =
|
||||||
|
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
let mut normalized_facet_id_string_builder =
|
||||||
|
MergerBuilder::new(merge_deladd_btreeset_string as MergeFn);
|
||||||
|
let mut data_size = 0;
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::FieldIdFacetStringDocids((
|
||||||
|
facet_id_string_docids,
|
||||||
|
normalized_facet_id_string_docids,
|
||||||
|
)) = typed_chunk
|
||||||
|
else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
data_size += facet_id_string_docids.len();
|
||||||
|
facet_id_string_builder.push(facet_id_string_docids.into_cursor()?);
|
||||||
|
normalized_facet_id_string_builder
|
||||||
|
.push(normalized_facet_id_string_docids.into_cursor()?);
|
||||||
|
}
|
||||||
|
let facet_id_string_merger = facet_id_string_builder.build();
|
||||||
|
let normalized_facet_id_string_merger = normalized_facet_id_string_builder.build();
|
||||||
|
|
||||||
|
let indexer = FacetsUpdate::new(
|
||||||
|
index,
|
||||||
|
FacetType::String,
|
||||||
|
facet_id_string_merger,
|
||||||
|
Some(normalized_facet_id_string_merger),
|
||||||
|
data_size,
|
||||||
|
);
|
||||||
indexer.execute(wtxn)?;
|
indexer.execute(wtxn)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
|
TypedChunk::FieldIdFacetExistsDocids(_) => {
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
|
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
append_entries_into_database(
|
|
||||||
facet_id_exists_docids,
|
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
|
write_entries_into_database(
|
||||||
|
merger,
|
||||||
&index.facet_id_exists_docids,
|
&index.facet_id_exists_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => {
|
TypedChunk::FieldIdFacetIsNullDocids(_) => {
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
|
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
append_entries_into_database(
|
|
||||||
facet_id_is_null_docids,
|
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
|
write_entries_into_database(
|
||||||
|
merger,
|
||||||
&index.facet_id_is_null_docids,
|
&index.facet_id_is_null_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
|
TypedChunk::FieldIdFacetIsEmptyDocids(_) => {
|
||||||
let span = tracing::trace_span!(target: "profile::indexing::write_db", "field_id_facet_is_empty_docids");
|
let span = tracing::trace_span!(target: "profile::indexing::write_db", "field_id_facet_is_empty_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
append_entries_into_database(
|
|
||||||
facet_id_is_empty_docids,
|
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
|
write_entries_into_database(
|
||||||
|
merger,
|
||||||
&index.facet_id_is_empty_docids,
|
&index.facet_id_is_empty_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
|
TypedChunk::WordPairProximityDocids(_) => {
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
|
tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
append_entries_into_database(
|
|
||||||
word_pair_proximity_docids_iter,
|
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
|
write_entries_into_database(
|
||||||
|
merger,
|
||||||
&index.word_pair_proximity_docids,
|
&index.word_pair_proximity_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
deladd_serialize_add_side,
|
||||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => {
|
TypedChunk::FieldIdDocidFacetNumbers(_) => {
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
|
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
|
||||||
|
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
let index_fid_docid_facet_numbers =
|
let index_fid_docid_facet_numbers =
|
||||||
index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
|
index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
|
||||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = iter.next()? {
|
||||||
let reader = KvReaderDelAdd::new(value);
|
let reader = KvReaderDelAdd::new(value);
|
||||||
if valid_lmdb_key(key) {
|
if valid_lmdb_key(key) {
|
||||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||||
@ -344,14 +550,25 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => {
|
TypedChunk::FieldIdDocidFacetStrings(_) => {
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
|
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
|
||||||
|
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
let index_fid_docid_facet_strings =
|
let index_fid_docid_facet_strings =
|
||||||
index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
|
index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
|
||||||
let mut cursor = fid_docid_facet_string.into_cursor()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = iter.next()? {
|
||||||
let reader = KvReaderDelAdd::new(value);
|
let reader = KvReaderDelAdd::new(value);
|
||||||
if valid_lmdb_key(key) {
|
if valid_lmdb_key(key) {
|
||||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||||
@ -367,14 +584,25 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TypedChunk::GeoPoints(geo_points) => {
|
TypedChunk::GeoPoints(_) => {
|
||||||
let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
|
let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
|
||||||
|
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::GeoPoints(chunk) = typed_chunk else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.push(chunk.into_cursor()?);
|
||||||
|
}
|
||||||
|
let merger = builder.build();
|
||||||
|
|
||||||
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
||||||
let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
||||||
|
|
||||||
let mut cursor = geo_points.into_cursor()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = iter.next()? {
|
||||||
// convert the key back to a u32 (4 bytes)
|
// convert the key back to a u32 (4 bytes)
|
||||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||||
|
|
||||||
@ -393,15 +621,38 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
index.put_geo_rtree(wtxn, &rtree)?;
|
index.put_geo_rtree(wtxn, &rtree)?;
|
||||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||||
}
|
}
|
||||||
TypedChunk::VectorPoints {
|
TypedChunk::VectorPoints { .. } => {
|
||||||
remove_vectors,
|
|
||||||
manual_vectors,
|
|
||||||
embeddings,
|
|
||||||
expected_dimension,
|
|
||||||
embedder_name,
|
|
||||||
} => {
|
|
||||||
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
|
||||||
|
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
|
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
|
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
|
let mut params = None;
|
||||||
|
for typed_chunk in typed_chunks {
|
||||||
|
let TypedChunk::VectorPoints {
|
||||||
|
remove_vectors,
|
||||||
|
manual_vectors,
|
||||||
|
embeddings,
|
||||||
|
expected_dimension,
|
||||||
|
embedder_name,
|
||||||
|
} = typed_chunk
|
||||||
|
else {
|
||||||
|
unreachable!();
|
||||||
|
};
|
||||||
|
|
||||||
|
params = Some((expected_dimension, embedder_name));
|
||||||
|
|
||||||
|
remove_vectors_builder.push(remove_vectors.into_cursor()?);
|
||||||
|
manual_vectors_builder.push(manual_vectors.into_cursor()?);
|
||||||
|
if let Some(embeddings) = embeddings {
|
||||||
|
embeddings_builder.push(embeddings.into_cursor()?);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// typed chunks has always at least 1 chunk.
|
||||||
|
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
||||||
|
|
||||||
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||||
)?;
|
)?;
|
||||||
@ -419,8 +670,9 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
let writers = writers?;
|
let writers = writers?;
|
||||||
|
|
||||||
// remove vectors for docids we want them removed
|
// remove vectors for docids we want them removed
|
||||||
let mut cursor = remove_vectors.into_cursor()?;
|
let merger = remove_vectors_builder.build();
|
||||||
while let Some((key, _)) = cursor.move_on_next()? {
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
|
while let Some((key, _)) = iter.next()? {
|
||||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||||
|
|
||||||
for writer in &writers {
|
for writer in &writers {
|
||||||
@ -432,40 +684,39 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// add generated embeddings
|
// add generated embeddings
|
||||||
if let Some(embeddings) = embeddings {
|
let merger = embeddings_builder.build();
|
||||||
let mut cursor = embeddings.into_cursor()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = iter.next()? {
|
||||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||||
let data = pod_collect_to_vec(value);
|
let data = pod_collect_to_vec(value);
|
||||||
// it is a code error to have embeddings and not expected_dimension
|
// it is a code error to have embeddings and not expected_dimension
|
||||||
let embeddings =
|
let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension)
|
||||||
crate::vector::Embeddings::from_inner(data, expected_dimension)
|
// code error if we somehow got the wrong dimension
|
||||||
// code error if we somehow got the wrong dimension
|
.unwrap();
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
if embeddings.embedding_count() > usize::from(u8::MAX) {
|
if embeddings.embedding_count() > usize::from(u8::MAX) {
|
||||||
let external_docid = if let Ok(Some(Ok(index))) = index
|
let external_docid = if let Ok(Some(Ok(index))) = index
|
||||||
.external_id_of(wtxn, std::iter::once(docid))
|
.external_id_of(wtxn, std::iter::once(docid))
|
||||||
.map(|it| it.into_iter().next())
|
.map(|it| it.into_iter().next())
|
||||||
{
|
{
|
||||||
index
|
index
|
||||||
} else {
|
} else {
|
||||||
format!("internal docid={docid}")
|
format!("internal docid={docid}")
|
||||||
};
|
};
|
||||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||||
external_docid,
|
external_docid,
|
||||||
embeddings.embedding_count(),
|
embeddings.embedding_count(),
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
||||||
writer.add_item(wtxn, docid, embedding)?;
|
writer.add_item(wtxn, docid, embedding)?;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// perform the manual diff
|
// perform the manual diff
|
||||||
let mut cursor = manual_vectors.into_cursor()?;
|
let merger = manual_vectors_builder.build();
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
|
while let Some((key, value)) = iter.next()? {
|
||||||
// convert the key back to a u32 (4 bytes)
|
// convert the key back to a u32 (4 bytes)
|
||||||
let (left, _index) = try_split_array_at(key).unwrap();
|
let (left, _index) = try_split_array_at(key).unwrap();
|
||||||
let docid = DocumentId::from_be_bytes(left);
|
let docid = DocumentId::from_be_bytes(left);
|
||||||
@@ -519,26 +770,30 @@ pub(crate) fn write_typed_chunk_into_index(

             tracing::debug!("Finished vector chunk for {}", embedder_name);
         }
-        TypedChunk::ScriptLanguageDocids(sl_map) => {
+        TypedChunk::ScriptLanguageDocids(_) => {
             let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids");
             let _entered = span.enter();
-            for (key, (deletion, addition)) in sl_map {
-                let mut db_key_exists = false;
-                let final_value = match index.script_language_docids.get(wtxn, &key)? {
-                    Some(db_values) => {
-                        db_key_exists = true;
-                        (db_values - deletion) | addition
-                    }
-                    None => addition,
-                };
-
-                if final_value.is_empty() {
-                    // If the database entry exists, delete it.
-                    if db_key_exists {
-                        index.script_language_docids.delete(wtxn, &key)?;
+            for typed_chunk in typed_chunks {
+                let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() };
+                for (key, (deletion, addition)) in sl_map {
+                    let mut db_key_exists = false;
+                    let final_value = match index.script_language_docids.get(wtxn, &key)? {
+                        Some(db_values) => {
+                            db_key_exists = true;
+                            (db_values - deletion) | addition
+                        }
+                        None => addition,
+                    };
+
+                    if final_value.is_empty() {
+                        // If the database entry exists, delete it.
+                        if db_key_exists {
+                            index.script_language_docids.delete(wtxn, &key)?;
+                        }
+                    } else {
+                        index.script_language_docids.put(wtxn, &key, &final_value)?;
                     }
-                } else {
-                    index.script_language_docids.put(wtxn, &key, &final_value)?;
                 }
             }
         }
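
The `(db_values - deletion) | addition` expression above is the del/add reconciliation used for these bitmaps: start from the docids already stored for the key, remove the deleted ones, then union the newly added ones. A small self-contained example with made-up docids:

    use roaring::RoaringBitmap;

    fn main() {
        // Made-up docids: 1..=4 are already stored, the incoming chunk deletes 2 and adds 9.
        let db_values: RoaringBitmap = [1u32, 2, 3, 4].into_iter().collect();
        let deletion: RoaringBitmap = [2u32].into_iter().collect();
        let addition: RoaringBitmap = [9u32].into_iter().collect();

        // Same formula as in the hunk above.
        let final_value = (db_values - deletion) | addition;

        let expected: RoaringBitmap = [1u32, 3, 4, 9].into_iter().collect();
        assert_eq!(final_value, expected);
        // An empty `final_value` means the entry is deleted from the database instead.
    }
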
@@ -557,13 +812,9 @@ fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
 }

 fn merge_word_docids_reader_into_fst(
-    word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
-    exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+    merger: Merger<CursorClonableMmap, MergeFn>,
 ) -> Result<fst::Set<Vec<u8>>> {
-    let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
-    merger_builder.push(word_docids_iter.into_cursor()?);
-    merger_builder.push(exact_word_docids_iter.into_cursor()?);
-    let mut iter = merger_builder.build().into_stream_merger_iter()?;
+    let mut iter = merger.into_stream_merger_iter()?;
     let mut builder = fst::SetBuilder::memory();

     while let Some((k, _)) = iter.next()? {
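
`merge_word_docids_reader_into_fst` now receives a single merger built by its caller instead of pushing the two readers into its own `MergerBuilder`, and it feeds the merged, lexicographically ordered keys straight into an `fst::SetBuilder`. A minimal sketch of that last step, with a hard-coded word list standing in for the merger output:

    use fst::SetBuilder;

    // Keys must be inserted in lexicographic order, which is exactly what the merger
    // iterator guarantees; the word list here is made up for illustration.
    fn build_words_fst(sorted_words: &[&str]) -> fst::Set<Vec<u8>> {
        let mut builder = SetBuilder::memory();
        for word in sorted_words {
            builder.insert(*word).unwrap();
        }
        builder.into_set()
    }

    fn main() {
        let set = build_words_fst(&["cat", "dog", "zebra"]);
        assert!(set.contains("dog"));
        assert!(!set.contains("bird"));
    }
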
@@ -577,10 +828,9 @@ fn merge_word_docids_reader_into_fst(
 /// merge_values function is used if an entry already exist in the database.
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
 fn write_entries_into_database<R, K, V, FS, FM>(
-    data: grenad::Reader<R>,
+    merger: Merger<R, MergeFn>,
     database: &heed::Database<K, V>,
     wtxn: &mut RwTxn,
-    index_is_empty: bool,
     serialize_value: FS,
     merge_values: FM,
 ) -> Result<()>
@@ -589,22 +839,17 @@ where
     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
     FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
 {
-    puffin::profile_function!(format!("number of entries: {}", data.len()));
+    puffin::profile_function!();

     let mut buffer = Vec::new();
     let database = database.remap_types::<Bytes, Bytes>();

-    let mut cursor = data.into_cursor()?;
-    while let Some((key, value)) = cursor.move_on_next()? {
+    let mut iter = merger.into_stream_merger_iter()?;
+    while let Some((key, value)) = iter.next()? {
         if valid_lmdb_key(key) {
             buffer.clear();
-            let value = if index_is_empty {
-                Some(serialize_value(value, &mut buffer)?)
-            } else {
-                match database.get(wtxn, key)? {
-                    Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
-                    None => Some(serialize_value(value, &mut buffer)?),
-                }
+            let value = match database.get(wtxn, key)? {
+                Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
+                None => Some(serialize_value(value, &mut buffer)?),
             };
             match value {
                 Some(value) => database.put(wtxn, key, value)?,
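
With the `index_is_empty` fast path gone, `write_entries_into_database` always looks the key up first and either merges the incoming value with the stored one or serializes it as a new entry. A rough stand-alone sketch of that upsert logic, with a `BTreeMap` standing in for the LMDB database and byte concatenation standing in for `merge_values`:

    use std::collections::BTreeMap;

    // Mirrors the new logic above: look the key up, merge with the previous value if
    // there is one, otherwise store the incoming value as-is.
    fn write_entry(db: &mut BTreeMap<Vec<u8>, Vec<u8>>, key: &[u8], value: &[u8]) {
        let merged = match db.get(key) {
            Some(prev_value) => merge_values(value, prev_value),
            None => Some(value.to_vec()),
        };
        match merged {
            Some(value) => {
                db.insert(key.to_vec(), value);
            }
            // A `None` from the merge means the entry must disappear from the database.
            None => {
                db.remove(key);
            }
        }
    }

    // Toy merge: concatenate both values; the real merge functions combine bitmaps, etc.
    fn merge_values(new: &[u8], prev: &[u8]) -> Option<Vec<u8>> {
        Some([prev, new].concat())
    }

    fn main() {
        let mut db = BTreeMap::new();
        write_entry(&mut db, b"word", b"123");
        write_entry(&mut db, b"word", b"456");
        assert_eq!(db[&b"word".to_vec()], b"123456".to_vec());
    }
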
@@ -614,62 +859,5 @@ where
             }
         }
     }
-
-    Ok(())
-}
-
-/// Write provided entries in database using serialize_value function.
-/// merge_values function is used if an entry already exist in the database.
-/// All provided entries must be ordered.
-/// If the index is not empty, write_entries_into_database is called instead.
-#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
-fn append_entries_into_database<R, K, V, FS, FM>(
-    data: grenad::Reader<R>,
-    database: &heed::Database<K, V>,
-    wtxn: &mut RwTxn,
-    index_is_empty: bool,
-    serialize_value: FS,
-    merge_values: FM,
-) -> Result<()>
-where
-    R: io::Read + io::Seek,
-    FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
-    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
-    K: for<'a> heed::BytesDecode<'a>,
-{
-    puffin::profile_function!(format!("number of entries: {}", data.len()));
-
-    if !index_is_empty {
-        return write_entries_into_database(
-            data,
-            database,
-            wtxn,
-            false,
-            serialize_value,
-            merge_values,
-        );
-    }
-
-    let mut buffer = Vec::new();
-    let mut database = database.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
-
-    let mut cursor = data.into_cursor()?;
-    while let Some((key, value)) = cursor.move_on_next()? {
-        if valid_lmdb_key(key) {
-            debug_assert!(
-                K::bytes_decode(key).is_ok(),
-                "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
-                key.len(),
-                &key
-            );
-            buffer.clear();
-            let value = serialize_value(value, &mut buffer)?;
-            unsafe {
-                // safety: We do not keep a reference to anything that lives inside the database
-                database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, value)?
-            };
-        }
-    }
-
     Ok(())
 }

@@ -47,7 +47,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
     )]
     pub fn execute(
         self,
-        mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
+        new_word_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
         new_prefix_fst_words: &[String],
         common_prefix_fst_words: &[&[String]],
         del_prefix_fst_words: &HashSet<Vec<u8>>,
@@ -68,7 +68,8 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
         if !common_prefix_fst_words.is_empty() {
             let mut current_prefixes: Option<&&[String]> = None;
             let mut prefixes_cache = HashMap::new();
-            while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
+            let mut new_word_docids_iter = new_word_docids.into_stream_merger_iter()?;
+            while let Some((word, data)) = new_word_docids_iter.next()? {
                 current_prefixes = match current_prefixes.take() {
                     Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes),
                     _otherwise => {
@@ -52,7 +52,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
     )]
     pub fn execute(
        self,
-        new_word_integer_docids: grenad::Reader<CursorClonableMmap>,
+        new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
         new_prefix_fst_words: &[String],
         common_prefix_fst_words: &[&[String]],
         del_prefix_fst_words: &HashSet<Vec<u8>>,
@@ -69,14 +69,14 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
             self.max_memory,
         );

-        let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?;
-
         if !common_prefix_fst_words.is_empty() {
             // We fetch all the new common prefixes between the previous and new prefix fst.
             let mut buffer = Vec::new();
             let mut current_prefixes: Option<&&[String]> = None;
             let mut prefixes_cache = HashMap::new();
-            while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? {
+            let mut new_word_integer_docids_iter =
+                new_word_integer_docids.into_stream_merger_iter()?;
+            while let Some((key, data)) = new_word_integer_docids_iter.next()? {
                 let (word, pos) =
                     StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?;

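
Both `execute` methods above keep the same prefix-caching loop: they walk the merged word docids in sorted order and accumulate, per prefix, the docids of every word that starts with that prefix. A simplified stand-alone sketch of that grouping step, using plain strings and integer docids instead of byte keys and serialized bitmaps:

    use std::collections::HashMap;

    // `words` is assumed to be sorted by word, as the merger iterator guarantees.
    fn group_docids_by_prefix(
        words: &[(&str, u32)],
        prefixes: &[&str],
    ) -> HashMap<String, Vec<u32>> {
        let mut prefixes_cache: HashMap<String, Vec<u32>> = HashMap::new();
        for (word, docid) in words {
            for prefix in prefixes {
                if word.starts_with(prefix) {
                    prefixes_cache.entry((*prefix).to_string()).or_default().push(*docid);
                }
            }
        }
        prefixes_cache
    }

    fn main() {
        let words = [("black", 1), ("blue", 2), ("green", 3)];
        let grouped = group_docids_by_prefix(&words, &["bl"]);
        assert_eq!(grouped["bl"], vec![1, 2]);
    }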