Fix some tests but not all of them

This commit is contained in:
Clément Renault 2024-11-18 17:39:55 +01:00
parent 670aff5553
commit aba8a0e9e0
No known key found for this signature in database
GPG key ID: F250A4C4E3AE5F5F
20 changed files with 1211 additions and 881 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,86 +0,0 @@
use heed::types::Bytes;
use heed::{Database, RoTxn};
use obkv::KvReaderU16;
use roaring::RoaringBitmap;
use crate::{all_obkv_to_json, DocumentId, FieldsIdsMap, Object, ObkvCodec, Result, BEU32};
pub struct ImmutableObkvs<'t> {
ids: RoaringBitmap,
fields_ids_map: FieldsIdsMap,
slices: Vec<&'t [u8]>,
}
impl<'t> ImmutableObkvs<'t> {
/// Creates the structure by fetching all the OBKVs
/// and keeping the transaction making the pointers valid.
pub fn new(
rtxn: &'t RoTxn,
documents_database: Database<BEU32, ObkvCodec>,
fields_ids_map: FieldsIdsMap,
subset: RoaringBitmap,
) -> heed::Result<Self> {
let mut slices = Vec::new();
let documents_database = documents_database.remap_data_type::<Bytes>();
for docid in &subset {
let slice = documents_database.get(rtxn, &docid)?.unwrap();
slices.push(slice);
}
Ok(ImmutableObkvs { ids: subset, fields_ids_map, slices })
}
/// Returns the OBKVs identified by the given ID.
pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<&'t KvReaderU16>> {
match self
.ids
.rank(docid)
.checked_sub(1)
.and_then(|offset| self.slices.get(offset as usize))
{
Some(&bytes) => Ok(Some(bytes.into())),
None => Ok(None),
}
}
/// Returns the owned rhai::Map identified by the given ID.
pub fn rhai_map(&self, docid: DocumentId) -> Result<Option<rhai::Map>> {
let obkv = match self.obkv(docid) {
Ok(Some(obkv)) => obkv,
Ok(None) => return Ok(None),
Err(e) => return Err(e.into()),
};
let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
let map: Result<rhai::Map> = all_keys
.iter()
.copied()
.flat_map(|id| obkv.get(id).map(|value| (id, value)))
.map(|(id, value)| {
let name = self.fields_ids_map.name(id).ok_or(
crate::error::FieldIdMapMissingEntry::FieldId {
field_id: id,
process: "all_obkv_to_rhaimap",
},
)?;
let value = serde_json::from_slice(value)
.map_err(crate::error::InternalError::SerdeJson)?;
Ok((name.into(), value))
})
.collect();
map.map(Some)
}
pub fn json_map(&self, docid: DocumentId) -> Result<Option<Object>> {
let obkv = match self.obkv(docid) {
Ok(Some(obkv)) => obkv,
Ok(None) => return Ok(None),
Err(e) => return Err(e.into()),
};
all_obkv_to_json(obkv, &self.fields_ids_map).map(Some)
}
}
unsafe impl Sync for ImmutableObkvs<'_> {}

View file

@ -48,23 +48,8 @@ pub struct TransformOutput {
/// containing all those documents.
pub struct Transform<'a, 'i> {
pub index: &'i Index,
fields_ids_map: FieldsIdsMap,
indexer_settings: &'a IndexerConfig,
pub index_documents_method: IndexDocumentsMethod,
available_documents_ids: AvailableIds,
// Both grenad follows the same format:
// key | value
// u32 | 1 byte for the Operation byte, the rest is the obkv of the document stored
original_sorter: grenad::Sorter<EitherObkvMerge>,
flattened_sorter: grenad::Sorter<EitherObkvMerge>,
replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
// To increase the cache locality and decrease the heap usage we use compact smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize,
}
/// This enum is specific to the grenad sorter stored in the transform.
@ -75,29 +60,6 @@ pub enum Operation {
Deletion,
}
/// Create a mapping between the field ids found in the document batch and the one that were
/// already present in the index.
///
/// If new fields are present in the addition, they are added to the index field ids map.
fn create_fields_mapping(
index_field_map: &mut FieldsIdsMap,
batch_field_map: &DocumentsBatchIndex,
) -> Result<HashMap<FieldId, FieldId>> {
batch_field_map
.iter()
// we sort by id here to ensure a deterministic mapping of the fields, that preserves
// the original ordering.
.sorted_by_key(|(&id, _)| id)
.map(|(field, name)| match index_field_map.id(name) {
Some(id) => Ok((*field, id)),
None => index_field_map
.insert(name)
.ok_or(Error::UserError(UserError::AttributeLimitReached))
.map(|id| (*field, id)),
})
.collect()
}
impl<'a, 'i> Transform<'a, 'i> {
pub fn new(
wtxn: &mut heed::RwTxn<'_>,
@ -138,19 +100,7 @@ impl<'a, 'i> Transform<'a, 'i> {
);
let documents_ids = index.documents_ids(wtxn)?;
Ok(Transform {
index,
fields_ids_map: index.fields_ids_map(wtxn)?,
indexer_settings,
available_documents_ids: AvailableIds::new(&documents_ids),
original_sorter,
flattened_sorter,
index_documents_method,
replaced_documents_ids: RoaringBitmap::new(),
new_documents_ids: RoaringBitmap::new(),
new_external_documents_ids_builder: FxHashMap::default(),
documents_count: 0,
})
Ok(Transform { index, indexer_settings, index_documents_method })
}
// Flatten a document from the fields ids map contained in self and insert the new