From bc210bdc006046bd70940a453104bb6256ae43ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 15:17:51 +0200 Subject: [PATCH] Introduce a dedicated function to write proximity entries in database --- .../src/update/index_documents/typed_chunk.rs | 79 ++++++++++++++++--- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index fcd8cfc17..4021dea08 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -7,7 +7,7 @@ use bytemuck::allocation::pod_collect_to_vec; use charabia::{Language, Script}; use grenad::{Merger, MergerBuilder}; use heed::types::Bytes; -use heed::RwTxn; +use heed::{BytesDecode, RwTxn}; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; @@ -20,13 +20,16 @@ use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; +use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{ as_cloneable_grenad, keep_latest_obkv, try_split_array_at, }; +use crate::update::settings::InnerIndexSettingsDiff; use crate::{ - lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError, + lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, + Result, SerializationError, U8StrStrCodec, }; /// This struct accumulates and group the TypedChunks @@ -122,7 +125,6 @@ impl TypedChunk { /// Return new documents seen. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] pub(crate) fn write_typed_chunk_into_index( - typed_chunks: Vec, wtxn: &mut RwTxn, index: &Index, settings_diff: &InnerIndexSettingsDiff, @@ -487,13 +489,22 @@ pub(crate) fn write_typed_chunk_into_index( } let merger = builder.build(); - write_entries_into_database( - merger, - &index.word_pair_proximity_docids, - wtxn, - deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - )?; + if settings_diff.only_additional_fields().is_some() { + write_proximity_entries_into_database_additional_searchables( + merger, + &index.word_pair_proximity_docids, + wtxn, + )?; + } else { + write_entries_into_database( + merger, + &index.word_pair_proximity_docids, + wtxn, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + )?; + } + is_merged_database = true; } TypedChunk::FieldIdDocidFacetNumbers(_) => { @@ -832,3 +843,51 @@ where } Ok(()) } + +/// Akin to the `write_entries_into_database` function but specialized +/// for the case when we only index additional searchable fields only. +#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] +fn write_proximity_entries_into_database_additional_searchables( + merger: Merger, + database: &heed::Database, + wtxn: &mut RwTxn, +) -> Result<()> +where + R: io::Read + io::Seek, +{ + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { + if valid_lmdb_key(key) { + let (proximity_to_insert, word1, word2) = + U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?; + let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) { + Some(value) => { + CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)? + } + None => continue, + }; + + let mut data_to_remove = RoaringBitmap::new(); + for prox in 1..(MAX_DISTANCE as u8) { + let key = (prox, word1, word2); + let database_value = database.get(wtxn, &key)?.unwrap_or_default(); + let value = if prox == proximity_to_insert { + // Proximity that should be changed. + // Union values and remove lower proximity data + (&database_value | &data_to_insert) - &data_to_remove + } else { + // Remove lower proximity data + &database_value - &data_to_remove + }; + + // add the current data in data_to_remove for the next proximities + data_to_remove |= &value; + + if database_value != value { + database.put(wtxn, &key, &value)?; + } + } + } + } + Ok(()) +}