mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
Add FacetsUpdate type that wraps incremental and bulk indexing methods
This commit is contained in:
parent
3d145d7f48
commit
9b55e582cd
@ -1,10 +1,11 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
use std::cmp;
|
use std::cmp;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::num::NonZeroUsize;
|
use std::num::NonZeroUsize;
|
||||||
|
|
||||||
use grenad::CompressionType;
|
use grenad::CompressionType;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::{ByteSlice, DecodeIgnore};
|
||||||
use heed::{BytesEncode, Error, RoTxn};
|
use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn};
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
@ -14,21 +15,27 @@ use crate::facet::FacetType;
|
|||||||
use crate::heed_codec::facet::new::{
|
use crate::heed_codec::facet::new::{
|
||||||
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
|
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
|
||||||
};
|
};
|
||||||
use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader};
|
use crate::update::index_documents::{
|
||||||
use crate::{FieldId, Index, Result};
|
create_writer, valid_lmdb_key, write_into_lmdb_database, writer_into_reader,
|
||||||
|
};
|
||||||
|
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||||
|
|
||||||
pub struct FacetsUpdateBulk<'i> {
|
pub struct FacetsUpdateBulk<'i> {
|
||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
database: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
database: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
pub(crate) chunk_compression_type: CompressionType,
|
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
|
||||||
level_group_size: usize,
|
level_group_size: usize,
|
||||||
min_level_size: usize,
|
min_level_size: usize,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
|
// None if level 0 does not need to be updated
|
||||||
|
new_data: Option<grenad::Reader<File>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'i> FacetsUpdateBulk<'i> {
|
impl<'i> FacetsUpdateBulk<'i> {
|
||||||
pub fn new(index: &'i Index, facet_type: FacetType) -> FacetsUpdateBulk<'i> {
|
pub fn new(
|
||||||
|
index: &'i Index,
|
||||||
|
facet_type: FacetType,
|
||||||
|
new_data: grenad::Reader<File>,
|
||||||
|
) -> FacetsUpdateBulk<'i> {
|
||||||
FacetsUpdateBulk {
|
FacetsUpdateBulk {
|
||||||
index,
|
index,
|
||||||
database: match facet_type {
|
database: match facet_type {
|
||||||
@ -39,11 +46,31 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
|
index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
chunk_compression_type: CompressionType::None,
|
|
||||||
chunk_compression_level: None,
|
|
||||||
level_group_size: 4,
|
level_group_size: 4,
|
||||||
min_level_size: 5,
|
min_level_size: 5,
|
||||||
facet_type,
|
facet_type,
|
||||||
|
new_data: Some(new_data),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new_not_updating_level_0(
|
||||||
|
index: &'i Index,
|
||||||
|
facet_type: FacetType,
|
||||||
|
) -> FacetsUpdateBulk<'i> {
|
||||||
|
FacetsUpdateBulk {
|
||||||
|
index,
|
||||||
|
database: match facet_type {
|
||||||
|
FacetType::String => {
|
||||||
|
index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
|
||||||
|
}
|
||||||
|
FacetType::Number => {
|
||||||
|
index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
|
||||||
|
}
|
||||||
|
},
|
||||||
|
level_group_size: 4,
|
||||||
|
min_level_size: 5,
|
||||||
|
facet_type,
|
||||||
|
new_data: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -70,39 +97,84 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
||||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||||
|
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||||
|
|
||||||
// We get the faceted fields to be able to create the facet levels.
|
// We get the faceted fields to be able to create the facet levels.
|
||||||
let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone();
|
let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone();
|
||||||
|
|
||||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
|
||||||
|
|
||||||
for &field_id in faceted_fields.iter() {
|
for &field_id in faceted_fields.iter() {
|
||||||
self.clear_levels(wtxn, field_id)?;
|
self.clear_levels(wtxn, field_id)?;
|
||||||
}
|
}
|
||||||
|
self.update_level0(wtxn)?;
|
||||||
|
|
||||||
let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?;
|
// let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?;
|
||||||
|
|
||||||
for &field_id in faceted_fields.iter() {
|
for &field_id in faceted_fields.iter() {
|
||||||
let (level_readers, all_docids) =
|
let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?;
|
||||||
self.compute_levels_for_field_id(field_id, &nested_wtxn)?;
|
|
||||||
|
|
||||||
let put_docids_fn = match self.facet_type {
|
self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &all_docids)?;
|
||||||
FacetType::Number => Index::put_number_faceted_documents_ids,
|
|
||||||
FacetType::String => Index::put_string_faceted_documents_ids,
|
|
||||||
};
|
|
||||||
put_docids_fn(&self.index, &mut nested_wtxn, field_id, &all_docids)?;
|
|
||||||
|
|
||||||
for level_reader in level_readers {
|
for level_reader in level_readers {
|
||||||
// TODO: append instead of write with merge
|
let mut cursor = level_reader.into_cursor()?;
|
||||||
write_into_lmdb_database(
|
while let Some((k, v)) = cursor.move_on_next()? {
|
||||||
&mut nested_wtxn,
|
let key = FacetKeyCodec::<DecodeIgnore>::bytes_decode(k).unwrap();
|
||||||
*self.database.as_polymorph(),
|
let value = FacetGroupValueCodec::bytes_decode(v).unwrap();
|
||||||
level_reader,
|
println!("inserting {key:?} {value:?}");
|
||||||
|_, _| {
|
|
||||||
Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?
|
self.database.remap_types::<ByteSlice, ByteSlice>().put(wtxn, k, v)?;
|
||||||
},
|
}
|
||||||
)?;
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
|
||||||
|
let new_data = match self.new_data.take() {
|
||||||
|
Some(x) => x,
|
||||||
|
None => return Ok(()),
|
||||||
|
};
|
||||||
|
if self.database.is_empty(wtxn)? {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
let mut database = self.database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
|
||||||
|
let mut cursor = new_data.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
if valid_lmdb_key(key) {
|
||||||
|
buffer.clear();
|
||||||
|
// the group size for level 0
|
||||||
|
buffer.push(1);
|
||||||
|
// then we extend the buffer with the docids bitmap
|
||||||
|
buffer.extend_from_slice(value);
|
||||||
|
unsafe { database.append(key, &buffer)? };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
let database = self.database.remap_types::<ByteSlice, ByteSlice>();
|
||||||
|
|
||||||
|
let mut cursor = new_data.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
if valid_lmdb_key(key) {
|
||||||
|
buffer.clear();
|
||||||
|
// the group size for level 0
|
||||||
|
buffer.push(1);
|
||||||
|
// then we extend the buffer with the docids bitmap
|
||||||
|
match database.get(wtxn, key)? {
|
||||||
|
Some(prev_value) => {
|
||||||
|
let old_bitmap = &prev_value[1..];
|
||||||
|
CboRoaringBitmapCodec::merge_into(
|
||||||
|
&[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
|
||||||
|
&mut buffer,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
buffer.extend_from_slice(value);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
database.put(wtxn, key, &buffer)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,16 +186,14 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
field_id: FieldId,
|
field_id: FieldId,
|
||||||
txn: &RoTxn,
|
txn: &RoTxn,
|
||||||
) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
|
) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
|
||||||
let algo = FacetsUpdateBulkAlgorithm {
|
// TODO: first check whether there is anything in level 0
|
||||||
|
let algo = ComputeHigherLevels {
|
||||||
rtxn: txn,
|
rtxn: txn,
|
||||||
db: &self.database,
|
db: &self.database,
|
||||||
field_id,
|
field_id,
|
||||||
level_group_size: self.level_group_size,
|
level_group_size: self.level_group_size,
|
||||||
min_level_size: self.min_level_size,
|
min_level_size: self.min_level_size,
|
||||||
chunk_compression_type: self.chunk_compression_type,
|
|
||||||
chunk_compression_level: self.chunk_compression_level,
|
|
||||||
};
|
};
|
||||||
// TODO: first check whether there is anything in level 0
|
|
||||||
|
|
||||||
let mut all_docids = RoaringBitmap::new();
|
let mut all_docids = RoaringBitmap::new();
|
||||||
let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| {
|
let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| {
|
||||||
@ -138,16 +208,14 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct FacetsUpdateBulkAlgorithm<'t> {
|
struct ComputeHigherLevels<'t> {
|
||||||
rtxn: &'t heed::RoTxn<'t>,
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
chunk_compression_type: CompressionType,
|
|
||||||
chunk_compression_level: Option<u32>,
|
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
level_group_size: usize,
|
level_group_size: usize,
|
||||||
min_level_size: usize,
|
min_level_size: usize,
|
||||||
}
|
}
|
||||||
impl<'t> FacetsUpdateBulkAlgorithm<'t> {
|
impl<'t> ComputeHigherLevels<'t> {
|
||||||
fn read_level_0(
|
fn read_level_0(
|
||||||
&self,
|
&self,
|
||||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||||
@ -215,11 +283,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> {
|
|||||||
// once we have computed `level_group_size` elements, we give the left bound
|
// once we have computed `level_group_size` elements, we give the left bound
|
||||||
// of those elements, and their bitmaps, to the level above
|
// of those elements, and their bitmaps, to the level above
|
||||||
|
|
||||||
let mut cur_writer = create_writer(
|
let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
|
||||||
self.chunk_compression_type,
|
|
||||||
self.chunk_compression_level,
|
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
let mut cur_writer_len = 0;
|
let mut cur_writer_len = 0;
|
||||||
|
|
||||||
let mut group_sizes = vec![];
|
let mut group_sizes = vec![];
|
||||||
@ -259,7 +323,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
})?;
|
})?;
|
||||||
// don't forget to insert the leftover elements into the writer as well
|
// don't forget to insert the leftover elements into the writer as well
|
||||||
if !bitmaps.is_empty() && cur_writer_len >= self.level_group_size * self.min_level_size {
|
if !bitmaps.is_empty() && cur_writer_len >= self.min_level_size {
|
||||||
let left_bound = left_bounds.first().unwrap();
|
let left_bound = left_bounds.first().unwrap();
|
||||||
handle_group(&bitmaps, left_bound)?;
|
handle_group(&bitmaps, left_bound)?;
|
||||||
for ((bitmap, left_bound), group_size) in
|
for ((bitmap, left_bound), group_size) in
|
||||||
@ -274,7 +338,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> {
|
|||||||
cur_writer_len += 1;
|
cur_writer_len += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if cur_writer_len > self.level_group_size * self.min_level_size {
|
if cur_writer_len > self.min_level_size {
|
||||||
sub_writers.push(writer_into_reader(cur_writer)?);
|
sub_writers.push(writer_into_reader(cur_writer)?);
|
||||||
}
|
}
|
||||||
return Ok(sub_writers);
|
return Ok(sub_writers);
|
||||||
@ -315,9 +379,9 @@ mod tests {
|
|||||||
documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone());
|
documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone());
|
||||||
}
|
}
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
let documents = documents_batch_reader_from_objects(documents);
|
||||||
|
dbg!();
|
||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents).unwrap();
|
||||||
|
dbg!();
|
||||||
db_snap!(index, facet_id_f64_docids, name);
|
db_snap!(index, facet_id_f64_docids, name);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,2 +1,90 @@
|
|||||||
|
use std::{collections::HashMap, fs::File};
|
||||||
|
|
||||||
|
use grenad::CompressionType;
|
||||||
|
use heed::BytesDecode;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
facet::FacetType,
|
||||||
|
heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice},
|
||||||
|
CboRoaringBitmapCodec, FieldId, Index, Result,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::{FacetsUpdateBulk, FacetsUpdateIncremental};
|
||||||
|
|
||||||
pub mod bulk;
|
pub mod bulk;
|
||||||
pub mod incremental;
|
pub mod incremental;
|
||||||
|
|
||||||
|
pub struct FacetsUpdate<'i> {
|
||||||
|
index: &'i Index,
|
||||||
|
database: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
level_group_size: u8,
|
||||||
|
max_level_group_size: u8,
|
||||||
|
min_level_size: u8,
|
||||||
|
facet_type: FacetType,
|
||||||
|
new_data: grenad::Reader<File>,
|
||||||
|
}
|
||||||
|
impl<'i> FacetsUpdate<'i> {
|
||||||
|
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
|
||||||
|
let database = match facet_type {
|
||||||
|
FacetType::String => {
|
||||||
|
index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
|
||||||
|
}
|
||||||
|
FacetType::Number => {
|
||||||
|
index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Self {
|
||||||
|
index,
|
||||||
|
database,
|
||||||
|
level_group_size: 4,
|
||||||
|
max_level_group_size: 8,
|
||||||
|
min_level_size: 5,
|
||||||
|
facet_type,
|
||||||
|
new_data,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// /// The number of elements from the level below that are represented by a single element in the level above
|
||||||
|
// ///
|
||||||
|
// /// This setting is always greater than or equal to 2.
|
||||||
|
// pub fn level_group_size(&mut self, value: u8) -> &mut Self {
|
||||||
|
// self.level_group_size = std::cmp::max(value, 2);
|
||||||
|
// self
|
||||||
|
// }
|
||||||
|
|
||||||
|
// /// The minimum number of elements that a level is allowed to have.
|
||||||
|
// pub fn min_level_size(&mut self, value: u8) -> &mut Self {
|
||||||
|
// self.min_level_size = std::cmp::max(value, 1);
|
||||||
|
// self
|
||||||
|
// }
|
||||||
|
|
||||||
|
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||||
|
if self.database.is_empty(wtxn)? {
|
||||||
|
let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data);
|
||||||
|
bulk_update.execute(wtxn)?;
|
||||||
|
} else {
|
||||||
|
let indexer = FacetsUpdateIncremental::new(self.database);
|
||||||
|
|
||||||
|
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||||
|
|
||||||
|
let mut cursor = self.new_data.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let key =
|
||||||
|
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
|
||||||
|
let docids =
|
||||||
|
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
||||||
|
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
||||||
|
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (field_id, new_docids) in new_faceted_docids {
|
||||||
|
let mut docids =
|
||||||
|
self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
|
||||||
|
docids |= new_docids;
|
||||||
|
self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/facet/bulk.rs
|
source: milli/src/update/facet/bulk.rs
|
||||||
---
|
---
|
||||||
947949d1a5c9c4e895c89fba46cbba68
|
07718df52f8463335fb8fefcd3ae01f4
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/facet/bulk.rs
|
source: milli/src/update/facet/bulk.rs
|
||||||
---
|
---
|
||||||
947949d1a5c9c4e895c89fba46cbba68
|
07718df52f8463335fb8fefcd3ae01f4
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/facet/bulk.rs
|
source: milli/src/update/facet/bulk.rs
|
||||||
---
|
---
|
||||||
947949d1a5c9c4e895c89fba46cbba68
|
07718df52f8463335fb8fefcd3ae01f4
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/facet/bulk.rs
|
source: milli/src/update/facet/bulk.rs
|
||||||
---
|
---
|
||||||
947949d1a5c9c4e895c89fba46cbba68
|
07718df52f8463335fb8fefcd3ae01f4
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/facet/bulk.rs
|
source: milli/src/update/facet/bulk.rs
|
||||||
---
|
---
|
||||||
947949d1a5c9c4e895c89fba46cbba68
|
07718df52f8463335fb8fefcd3ae01f4
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/facet/bulk.rs
|
source: milli/src/update/facet/bulk.rs
|
||||||
---
|
---
|
||||||
947949d1a5c9c4e895c89fba46cbba68
|
07718df52f8463335fb8fefcd3ae01f4
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/facet/bulk.rs
|
source: milli/src/update/facet/bulk.rs
|
||||||
---
|
---
|
||||||
5ce8009d3eb023e4b9c0a6e7fa4e6262
|
3e6a91b3c54c614a4787224ac4278ed3
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/facet/bulk.rs
|
source: milli/src/update/facet/bulk.rs
|
||||||
---
|
---
|
||||||
5ce8009d3eb023e4b9c0a6e7fa4e6262
|
3e6a91b3c54c614a4787224ac4278ed3
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
@ -14,12 +13,12 @@ use super::helpers::{
|
|||||||
valid_lmdb_key, CursorClonableMmap,
|
valid_lmdb_key, CursorClonableMmap,
|
||||||
};
|
};
|
||||||
use super::{ClonableMmap, MergeFn};
|
use super::{ClonableMmap, MergeFn};
|
||||||
use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
|
use crate::facet::FacetType;
|
||||||
|
use crate::update::facet::FacetsUpdate;
|
||||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||||
use crate::update::FacetsUpdateIncremental;
|
|
||||||
use crate::{
|
use crate::{
|
||||||
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint,
|
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
|
||||||
Index, Result,
|
Result,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub(crate) enum TypedChunk {
|
pub(crate) enum TypedChunk {
|
||||||
@ -138,78 +137,14 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => {
|
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
||||||
// merge cbo roaring bitmaps is not the correct merger because the data in the DB
|
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
||||||
// is FacetGroupValue and not RoaringBitmap
|
indexer.execute(wtxn)?;
|
||||||
// so I need to create my own merging function
|
|
||||||
|
|
||||||
// facet_id_string_docids is encoded as:
|
|
||||||
// key: FacetKeyCodec<StrRefCodec>
|
|
||||||
// value: CboRoaringBitmapCodec
|
|
||||||
// basically
|
|
||||||
|
|
||||||
// TODO: a condition saying "if I have more than 1/50th of the DB to add,
|
|
||||||
// then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
|
|
||||||
// it is a ratio I determine empirically
|
|
||||||
|
|
||||||
// for now I only do it incrementally, to see if things work
|
|
||||||
let indexer = FacetsUpdateIncremental::new(
|
|
||||||
index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
|
||||||
|
|
||||||
let mut cursor = facet_id_f64_docids_iter.into_cursor()?;
|
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
|
||||||
let key =
|
|
||||||
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
|
|
||||||
let docids =
|
|
||||||
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
|
||||||
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
|
||||||
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
|
||||||
}
|
|
||||||
for (field_id, new_docids) in new_faceted_docids {
|
|
||||||
let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?;
|
|
||||||
docids |= new_docids;
|
|
||||||
index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
|
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => {
|
||||||
// merge cbo roaring bitmaps is not the correct merger because the data in the DB
|
let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter);
|
||||||
// is FacetGroupValue and not RoaringBitmap
|
indexer.execute(wtxn)?;
|
||||||
// so I need to create my own merging function
|
|
||||||
|
|
||||||
// facet_id_string_docids is encoded as:
|
|
||||||
// key: FacetKeyCodec<StrRefCodec>
|
|
||||||
// value: CboRoaringBitmapCodec
|
|
||||||
// basically
|
|
||||||
|
|
||||||
// TODO: a condition saying "if I have more than 1/50th of the DB to add,
|
|
||||||
// then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
|
|
||||||
// it is a ratio I determine empirically
|
|
||||||
|
|
||||||
// for now I only do it incrementally, to see if things work
|
|
||||||
let indexer = FacetsUpdateIncremental::new(
|
|
||||||
index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
|
|
||||||
);
|
|
||||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
|
||||||
|
|
||||||
let mut cursor = facet_id_string_docids.into_cursor()?;
|
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
|
||||||
let key =
|
|
||||||
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
|
|
||||||
let docids =
|
|
||||||
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
|
||||||
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
|
||||||
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
|
||||||
}
|
|
||||||
for (field_id, new_docids) in new_faceted_docids {
|
|
||||||
let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?;
|
|
||||||
docids |= new_docids;
|
|
||||||
index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?;
|
|
||||||
}
|
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
|
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
|
||||||
|
Loading…
Reference in New Issue
Block a user