From f06355b0bbb9733bfac2554be6f8d6c14e40edca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 22 Nov 2020 11:28:35 +0100 Subject: [PATCH 1/6] Display the time it takes to merge user documents ids --- src/update/index_documents/transform.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs index 9847998a5..1fd4a4264 100644 --- a/src/update/index_documents/transform.rs +++ b/src/update/index_documents/transform.rs @@ -3,10 +3,12 @@ use std::convert::TryFrom; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::iter::Peekable; +use std::time::Instant; use anyhow::{anyhow, Context}; use fst::{IntoStreamer, Streamer}; use grenad::CompressionType; +use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; @@ -457,19 +459,22 @@ impl Transform<'_, '_> { // We create the union between the existing users ids documents ids with the new ones. let new_users_ids_documents_ids = new_users_ids_documents_ids_builder.into_map(); - let union_ = fst::map::OpBuilder::new() + let union_op = fst::map::OpBuilder::new() .add(&users_ids_documents_ids) .add(&new_users_ids_documents_ids) .r#union(); // We stream and merge the new users ids documents ids map with the existing one. + let before_docids_merging = Instant::now(); let mut users_ids_documents_ids_builder = fst::MapBuilder::memory(); - let mut iter = union_.into_stream(); + let mut iter = union_op.into_stream(); while let Some((user_id, vals)) = iter.next() { assert_eq!(vals.len(), 1, "there must be exactly one document id"); users_ids_documents_ids_builder.insert(user_id, vals[0].value)?; } + info!("Documents users ids merging took {:.02?}", before_docids_merging.elapsed()); + Ok(TransformOutput { primary_key, fields_ids_map, From eded5558b2ecf858e515f47916d2f5a5613fe2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 22 Nov 2020 11:54:04 +0100 Subject: [PATCH 2/6] Rename the users ids documents ids into external documents ids --- http-ui/src/main.rs | 4 +- src/index.rs | 16 ++--- src/update/clear_documents.rs | 2 +- src/update/delete_documents.rs | 54 ++++++++--------- src/update/index_documents/mod.rs | 6 +- src/update/index_documents/transform.rs | 80 ++++++++++++------------- 6 files changed, 81 insertions(+), 81 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 094b2fb79..b730344f2 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -605,14 +605,14 @@ async fn main() -> anyhow::Result<()> { let index = index_cloned.clone(); let rtxn = index.read_txn().unwrap(); - let users_ids_documents_ids = index.users_ids_documents_ids(&rtxn).unwrap(); + let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let displayed_fields = match index.displayed_fields(&rtxn).unwrap() { Some(fields) => Cow::Borrowed(fields), None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()), }; - match users_ids_documents_ids.get(&id) { + match external_documents_ids.get(&id) { Some(document_id) => { let document_id = document_id as u32; let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); diff --git a/src/index.rs b/src/index.rs index 68d7dfe5f..8da7940a7 100644 --- a/src/index.rs +++ b/src/index.rs @@ -22,7 +22,7 @@ pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; -pub const USERS_IDS_DOCUMENTS_IDS_KEY: &str = "users-ids-documents-ids"; +pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; #[derive(Clone)] @@ -119,18 +119,18 @@ impl Index { self.main.get::<_, Str, OwnedType>(rtxn, PRIMARY_KEY_KEY) } - /* users ids documents ids */ + /* external documents ids */ - /// Writes the users ids documents ids, a user id is a byte slice (i.e. `[u8]`) + /// Writes the external documents ids, it is a byte slice (i.e. `[u8]`) /// and refers to an internal id (i.e. `u32`). - pub fn put_users_ids_documents_ids>(&self, wtxn: &mut RwTxn, fst: &fst::Map) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS_KEY, fst.as_fst().as_bytes()) + pub fn put_external_documents_ids>(&self, wtxn: &mut RwTxn, fst: &fst::Map) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>(wtxn, EXTERNAL_DOCUMENTS_IDS_KEY, fst.as_fst().as_bytes()) } - /// Returns the user ids documents ids map which associate the user ids (i.e. `[u8]`) + /// Returns the external documents ids map which associate the external ids (i.e. `[u8]`) /// with the internal ids (i.e. `u32`). - pub fn users_ids_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, USERS_IDS_DOCUMENTS_IDS_KEY)? { + pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, EXTERNAL_DOCUMENTS_IDS_KEY)? { Some(bytes) => Ok(fst::Map::new(bytes)?.map_data(Cow::Borrowed)?), None => Ok(fst::Map::default().map_data(Cow::Owned)?), } diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index c49ae9104..0e89d43b7 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -27,7 +27,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; - self.index.put_users_ids_documents_ids(self.wtxn, &fst::Map::default())?; + self.index.put_external_documents_ids(self.wtxn, &fst::Map::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; // Clear the other databases. diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index d68bca81c..5ccce35f6 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -10,7 +10,7 @@ use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - users_ids_documents_ids: fst::Map>, + external_documents_ids: fst::Map>, documents_ids: RoaringBitmap, } @@ -20,14 +20,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index: &'i Index, ) -> anyhow::Result> { - let users_ids_documents_ids = index - .users_ids_documents_ids(wtxn)? + let external_documents_ids = index + .external_documents_ids(wtxn)? .map_data(Cow::into_owned)?; Ok(DeleteDocuments { wtxn, index, - users_ids_documents_ids, + external_documents_ids, documents_ids: RoaringBitmap::new(), }) } @@ -40,8 +40,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.documents_ids.union_with(docids); } - pub fn delete_user_id(&mut self, user_id: &str) -> Option { - let docid = self.users_ids_documents_ids.get(user_id).map(|id| u32::try_from(id).unwrap())?; + pub fn delete_external_id(&mut self, external_id: &str) -> Option { + let docid = self.external_documents_ids.get(external_id).map(|id| u32::try_from(id).unwrap())?; self.delete_document(docid); Some(docid) } @@ -80,9 +80,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { documents, } = self.index; - // Retrieve the words and the users ids contained in the documents. + // Retrieve the words and the external documents ids contained in the documents. let mut words = Vec::new(); - let mut users_ids = Vec::new(); + let mut external_ids = Vec::new(); for docid in &self.documents_ids { // We create an iterator to be able to get the content and delete the document // content itself. It's faster to acquire a cursor to get and delete, @@ -91,8 +91,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; if let Some((_key, obkv)) = iter.next().transpose()? { if let Some(content) = obkv.get(id_field) { - let user_id: SmallString32 = serde_json::from_slice(content).unwrap(); - users_ids.push(user_id); + let external_id: SmallString32 = serde_json::from_slice(content).unwrap(); + external_ids.push(external_id); } iter.del_current()?; } @@ -109,30 +109,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } - // We create the FST map of the users ids that we must delete. - users_ids.sort_unstable(); - let users_ids_to_delete = fst::Set::from_iter(users_ids.iter().map(AsRef::as_ref))?; - let users_ids_to_delete = fst::Map::from(users_ids_to_delete.into_fst()); + // We create the FST map of the external ids that we must delete. + external_ids.sort_unstable(); + let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?; + let external_ids_to_delete = fst::Map::from(external_ids_to_delete.into_fst()); - let new_users_ids_documents_ids = { - // We acquire the current users ids documents ids map and create - // a difference operation between the current and to-delete users ids. - let users_ids_documents_ids = self.index.users_ids_documents_ids(self.wtxn)?; - let difference = users_ids_documents_ids.op().add(&users_ids_to_delete).difference(); + let new_external_documents_ids = { + // We acquire the current external documents ids map and create + // a difference operation between the current and to-delete external ids. + let external_documents_ids = self.index.external_documents_ids(self.wtxn)?; + let difference = external_documents_ids.op().add(&external_ids_to_delete).difference(); - // We stream the new users ids that does no more contains the to-delete users ids. + // We stream the new external ids that does no more contains the to-delete external ids. let mut iter = difference.into_stream(); - let mut new_users_ids_documents_ids_builder = fst::MapBuilder::memory(); - while let Some((userid, docids)) = iter.next() { - new_users_ids_documents_ids_builder.insert(userid, docids[0].value)?; + let mut new_external_documents_ids_builder = fst::MapBuilder::memory(); + while let Some((external_id, docids)) = iter.next() { + new_external_documents_ids_builder.insert(external_id, docids[0].value)?; } // We create an FST map from the above builder. - new_users_ids_documents_ids_builder.into_map() + new_external_documents_ids_builder.into_map() }; - // We write the new users ids into the main database. - self.index.put_users_ids_documents_ids(self.wtxn, &new_users_ids_documents_ids)?; + // We write the new external ids into the main database. + self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; // Maybe we can improve the get performance of the words // if we sort the words first, keeping the LMDB pages in cache. @@ -169,7 +169,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let words_fst = self.index.words_fst(self.wtxn)?; let difference = words_fst.op().add(&words_to_delete).difference(); - // We stream the new users ids that does no more contains the to-delete users ids. + // We stream the new external ids that does no more contains the to-delete external ids. let mut new_words_fst_builder = fst::SetBuilder::memory(); new_words_fst_builder.extend_stream(difference.into_stream())?; diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 13b725e19..fe51c6b2b 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -287,7 +287,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let TransformOutput { primary_key, fields_ids_map, - users_ids_documents_ids, + external_documents_ids, new_documents_ids, replaced_documents_ids, documents_count, @@ -472,8 +472,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, primary_key)?; - // We write the users_ids_documents_ids into the main database. - self.index.put_users_ids_documents_ids(self.wtxn, &users_ids_documents_ids)?; + // We write the external documents ids into the main database. + self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; // We merge the new documents ids with the existing ones. documents_ids.union_with(&new_documents_ids); diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs index 1fd4a4264..3c6acd1a9 100644 --- a/src/update/index_documents/transform.rs +++ b/src/update/index_documents/transform.rs @@ -20,14 +20,14 @@ use super::{create_writer, create_sorter, IndexDocumentsMethod}; pub struct TransformOutput { pub primary_key: u8, pub fields_ids_map: FieldsIdsMap, - pub users_ids_documents_ids: fst::Map>, + pub external_documents_ids: fst::Map>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, pub documents_count: usize, pub documents_file: File, } -/// Extract the users ids, deduplicate and compute the new internal documents ids +/// Extract the external ids, deduplicate and compute the new internal documents ids /// and fields ids, writing all the documents under their internal ids into a final file. /// /// Outputs the new `FieldsIdsMap`, the new `UsersIdsDocumentsIds` map, the new documents ids, @@ -74,7 +74,7 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap(); + let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let primary_key = self.index.primary_key(self.rtxn)?; // Deserialize the whole batch of documents in memory. @@ -116,7 +116,7 @@ impl Transform<'_, '_> { return Ok(TransformOutput { primary_key, fields_ids_map, - users_ids_documents_ids: fst::Map::default(), + external_documents_ids: fst::Map::default(), new_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(), documents_count: 0, @@ -172,7 +172,7 @@ impl Transform<'_, '_> { // We retrieve the user id from the document based on the primary key name, // if the document id isn't present we generate a uuid. - let user_id = match document.get(&primary_key_name) { + let external_id = match document.get(&primary_key_name) { Some(value) => match value { Value::String(string) => Cow::Borrowed(string.as_str()), Value::Number(number) => Cow::Owned(number.to_string()), @@ -200,19 +200,19 @@ impl Transform<'_, '_> { } else if field_id == primary_key { // We validate the document id [a-zA-Z0-9\-_]. - let user_id = match validate_document_id(&user_id) { + let external_id = match validate_document_id(&external_id) { Some(valid) => valid, - None => return Err(anyhow!("invalid document id: {:?}", user_id)), + None => return Err(anyhow!("invalid document id: {:?}", external_id)), }; // We serialize the document id. - serde_json::to_writer(&mut json_buffer, &user_id)?; + serde_json::to_writer(&mut json_buffer, &external_id)?; writer.insert(field_id, &json_buffer)?; } } // We use the extracted/generated user id as the key for this document. - sorter.insert(user_id.as_bytes(), &obkv_buffer)?; + sorter.insert(external_id.as_bytes(), &obkv_buffer)?; documents_count += 1; } @@ -227,7 +227,7 @@ impl Transform<'_, '_> { primary_key, fields_ids_map, documents_count, - users_ids_documents_ids, + external_documents_ids, progress_callback, ) } @@ -238,7 +238,7 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap(); + let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let mut csv = csv::Reader::from_reader(reader); let headers = csv.headers()?; @@ -252,7 +252,7 @@ impl Transform<'_, '_> { } // Extract the position of the primary key in the current headers, None if not found. - let user_id_pos = match primary_key { + let external_id_pos = match primary_key { Some(primary_key) => { // Te primary key have is known so we must find the position in the CSV headers. let name = fields_ids_map.name(primary_key).expect("found the primary key name"); @@ -263,7 +263,7 @@ impl Transform<'_, '_> { // Returns the field id in the fileds ids map, create an "id" field // in case it is not in the current headers. - let primary_key_field_id = match user_id_pos { + let primary_key_field_id = match external_id_pos { Some(pos) => fields_ids_map.id(&headers[pos]).expect("found the primary key"), None => { if !self.autogenerate_docids { @@ -294,7 +294,7 @@ impl Transform<'_, '_> { ); // We write into the sorter to merge and deduplicate the documents - // based on the users ids. + // based on the external ids. let mut json_buffer = Vec::new(); let mut obkv_buffer = Vec::new(); let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; @@ -312,13 +312,13 @@ impl Transform<'_, '_> { } // We extract the user id if we know where it is or generate an UUID V4 otherwise. - let user_id = match user_id_pos { + let external_id = match external_id_pos { Some(pos) => { - let user_id = &record[pos]; + let external_id = &record[pos]; // We validate the document id [a-zA-Z0-9\-_]. - match validate_document_id(&user_id) { + match validate_document_id(&external_id) { Some(valid) => valid, - None => return Err(anyhow!("invalid document id: {:?}", user_id)), + None => return Err(anyhow!("invalid document id: {:?}", external_id)), } }, None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), @@ -328,7 +328,7 @@ impl Transform<'_, '_> { // we return the generated document id instead of the record field. let iter = fields_ids.iter() .map(|(fi, i)| { - let field = if *fi == primary_key_field_id { user_id } else { &record[*i] }; + let field = if *fi == primary_key_field_id { external_id } else { &record[*i] }; (fi, field) }); @@ -341,7 +341,7 @@ impl Transform<'_, '_> { } // We use the extracted/generated user id as the key for this document. - sorter.insert(user_id, &obkv_buffer)?; + sorter.insert(external_id, &obkv_buffer)?; documents_count += 1; } @@ -356,7 +356,7 @@ impl Transform<'_, '_> { primary_key_field_id, fields_ids_map, documents_count, - users_ids_documents_ids, + external_documents_ids, progress_callback, ) } @@ -370,7 +370,7 @@ impl Transform<'_, '_> { primary_key: u8, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, - users_ids_documents_ids: fst::Map>, + external_documents_ids: fst::Map>, progress_callback: F, ) -> anyhow::Result where @@ -388,7 +388,7 @@ impl Transform<'_, '_> { self.max_nb_chunks, self.max_memory, ); - let mut new_users_ids_documents_ids_builder = fst::MapBuilder::memory(); + let mut new_external_documents_ids_builder = fst::MapBuilder::memory(); let mut replaced_documents_ids = RoaringBitmap::new(); let mut new_documents_ids = RoaringBitmap::new(); let mut obkv_buffer = Vec::new(); @@ -396,7 +396,7 @@ impl Transform<'_, '_> { // While we write into final file we get or generate the internal documents ids. let mut documents_count = 0; let mut iter = sorter.into_iter()?; - while let Some((user_id, update_obkv)) = iter.next()? { + while let Some((external_id, update_obkv)) = iter.next()? { if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { @@ -405,9 +405,9 @@ impl Transform<'_, '_> { }); } - let (docid, obkv) = match users_ids_documents_ids.get(user_id) { + let (docid, obkv) = match external_documents_ids.get(external_id) { Some(docid) => { - // If we find the user id in the current users ids documents ids map + // If we find the user id in the current external documents ids map // we use it and insert it in the list of replaced documents. let docid = u32::try_from(docid).expect("valid document id"); replaced_documents_ids.insert(docid); @@ -427,11 +427,11 @@ impl Transform<'_, '_> { } }, None => { - // If this user id is new we add it to the users ids documents ids map + // If this user id is new we add it to the external documents ids map // for new ids and into the list of new documents. let new_docid = available_documents_ids.next() .context("no more available documents ids")?; - new_users_ids_documents_ids_builder.insert(user_id, new_docid as u64)?; + new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; new_documents_ids.insert(new_docid); (new_docid, update_obkv) }, @@ -457,28 +457,28 @@ impl Transform<'_, '_> { let mut documents_file = writer.into_inner()?; documents_file.seek(SeekFrom::Start(0))?; - // We create the union between the existing users ids documents ids with the new ones. - let new_users_ids_documents_ids = new_users_ids_documents_ids_builder.into_map(); + // We create the union between the existing external documents ids with the new ones. + let new_external_documents_ids = new_external_documents_ids_builder.into_map(); let union_op = fst::map::OpBuilder::new() - .add(&users_ids_documents_ids) - .add(&new_users_ids_documents_ids) + .add(&external_documents_ids) + .add(&new_external_documents_ids) .r#union(); - // We stream and merge the new users ids documents ids map with the existing one. + // We stream and merge the new external documents ids map with the existing one. let before_docids_merging = Instant::now(); - let mut users_ids_documents_ids_builder = fst::MapBuilder::memory(); + let mut external_documents_ids_builder = fst::MapBuilder::memory(); let mut iter = union_op.into_stream(); - while let Some((user_id, vals)) = iter.next() { + while let Some((external_id, vals)) = iter.next() { assert_eq!(vals.len(), 1, "there must be exactly one document id"); - users_ids_documents_ids_builder.insert(user_id, vals[0].value)?; + external_documents_ids_builder.insert(external_id, vals[0].value)?; } - info!("Documents users ids merging took {:.02?}", before_docids_merging.elapsed()); + info!("Documents external merging took {:.02?}", before_docids_merging.elapsed()); Ok(TransformOutput { primary_key, fields_ids_map, - users_ids_documents_ids: users_ids_documents_ids_builder.into_map(), + external_documents_ids: external_documents_ids_builder.into_map(), new_documents_ids, replaced_documents_ids, documents_count, @@ -496,7 +496,7 @@ impl Transform<'_, '_> { ) -> anyhow::Result { let current_fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn)?; + let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_count = documents_ids.len() as usize; @@ -531,7 +531,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map, - users_ids_documents_ids: users_ids_documents_ids.map_data(Cow::into_owned)?, + external_documents_ids: external_documents_ids.map_data(Cow::into_owned)?, new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), documents_count, From 415c0b86bac80ebbfa63511edacd90b1fc2b3cfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 22 Nov 2020 14:48:42 +0100 Subject: [PATCH 3/6] Introduce the ExternalDocumentsIds struct --- src/external_documents_ids.rs | 149 ++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 2 files changed, 150 insertions(+) create mode 100644 src/external_documents_ids.rs diff --git a/src/external_documents_ids.rs b/src/external_documents_ids.rs new file mode 100644 index 000000000..f8765b57e --- /dev/null +++ b/src/external_documents_ids.rs @@ -0,0 +1,149 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use fst::{Streamer, IntoStreamer}; + +pub struct ExternalDocumentsIds<'a> { + pub(crate) hard: fst::Map>, + pub(crate) soft: fst::Map>, +} + +impl<'a> ExternalDocumentsIds<'a> { + pub fn new(hard: fst::Map>, soft: fst::Map>) -> ExternalDocumentsIds<'a> { + ExternalDocumentsIds { hard, soft } + } + + pub fn get>(&self, external_id: A) -> Option { + let external_id = external_id.as_ref(); + match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { + // u64 MAX means deleted in the soft fst map + Some(id) if id != u64::MAX => Some(id.try_into().unwrap()), + _otherwise => None + } + } + + pub fn delete_ids>(&mut self, other: fst::Set) -> fst::Result<()> { + let other = fst::Map::from(other.into_fst()); + let union_op = self.soft.op().add(&other).r#union(); + + let mut iter = union_op.into_stream(); + let mut new_soft_builder = fst::MapBuilder::memory(); + while let Some((external_id, docids)) = iter.next() { + if docids.iter().any(|v| v.index == 1) { + // If the `other` set returns a value here it means + // that it must be marked as deleted. + new_soft_builder.insert(external_id, u64::MAX)?; + } else { + new_soft_builder.insert(external_id, docids[0].value)?; + } + } + + drop(iter); + + // We save this new map as the new soft map. + self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; + self.merge_soft_into_hard() + } + + pub fn insert_ids>(&mut self, other: &fst::Map) -> fst::Result<()> { + let union_op = self.soft.op().add(other).r#union(); + + let mut new_soft_builder = fst::MapBuilder::memory(); + let mut iter = union_op.into_stream(); + while let Some((external_id, docids)) = iter.next() { + let id = docids.last().unwrap().value; + new_soft_builder.insert(external_id, id)?; + } + + drop(iter); + + // We save the new map as the new soft map. + self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; + self.merge_soft_into_hard() + } + + fn merge_soft_into_hard(&mut self) -> fst::Result<()> { + if self.soft.len() >= self.hard.len() / 2 { + let union_op = self.hard.op().add(&self.soft).r#union(); + + let mut iter = union_op.into_stream(); + let mut new_hard_builder = fst::MapBuilder::memory(); + while let Some((external_id, docids)) = iter.next() { + if docids.len() == 2 { + if docids[1].value != u64::MAX { + new_hard_builder.insert(external_id, docids[1].value)?; + } + } else { + new_hard_builder.insert(external_id, docids[0].value)?; + } + } + + drop(iter); + + self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; + self.soft = fst::Map::default().map_data(Cow::Owned)?; + } + + Ok(()) + } +} + +impl Default for ExternalDocumentsIds<'static> { + fn default() -> Self { + ExternalDocumentsIds { + hard: fst::Map::default().map_data(Cow::Owned).unwrap(), + soft: fst::Map::default().map_data(Cow::Owned).unwrap(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_insert_delete_ids() { + let mut external_documents_ids = ExternalDocumentsIds::default(); + + let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap(); + external_documents_ids.insert_ids(&new_ids).unwrap(); + + assert_eq!(external_documents_ids.get("a"), Some(1)); + assert_eq!(external_documents_ids.get("b"), Some(2)); + assert_eq!(external_documents_ids.get("c"), Some(3)); + assert_eq!(external_documents_ids.get("d"), Some(4)); + + let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap(); + external_documents_ids.insert_ids(&new_ids).unwrap(); + + assert_eq!(external_documents_ids.get("a"), Some(1)); + assert_eq!(external_documents_ids.get("b"), Some(2)); + assert_eq!(external_documents_ids.get("c"), Some(3)); + assert_eq!(external_documents_ids.get("d"), Some(4)); + assert_eq!(external_documents_ids.get("e"), Some(5)); + assert_eq!(external_documents_ids.get("f"), Some(6)); + assert_eq!(external_documents_ids.get("g"), Some(7)); + + let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap(); + external_documents_ids.delete_ids(del_ids).unwrap(); + + assert_eq!(external_documents_ids.get("a"), None); + assert_eq!(external_documents_ids.get("b"), Some(2)); + assert_eq!(external_documents_ids.get("c"), None); + assert_eq!(external_documents_ids.get("d"), Some(4)); + assert_eq!(external_documents_ids.get("e"), Some(5)); + assert_eq!(external_documents_ids.get("f"), None); + assert_eq!(external_documents_ids.get("g"), Some(7)); + + let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap(); + external_documents_ids.insert_ids(&new_ids).unwrap(); + + assert_eq!(external_documents_ids.get("a"), Some(5)); + assert_eq!(external_documents_ids.get("b"), Some(6)); + assert_eq!(external_documents_ids.get("c"), None); + assert_eq!(external_documents_ids.get("d"), Some(4)); + assert_eq!(external_documents_ids.get("e"), Some(5)); + assert_eq!(external_documents_ids.get("f"), None); + assert_eq!(external_documents_ids.get("g"), Some(7)); + assert_eq!(external_documents_ids.get("h"), Some(8)); + } +} diff --git a/src/lib.rs b/src/lib.rs index 808c54a4a..fd0156bfa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ mod criterion; +mod external_documents_ids; mod fields_ids_map; mod index; mod mdfs; From fe82516f9f9314d5615bdcc3e2981a97802a2a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 22 Nov 2020 17:28:41 +0100 Subject: [PATCH 4/6] Use the ExternalDocumentsIds in the Index struct --- src/external_documents_ids.rs | 9 ++++++++- src/index.rs | 37 +++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/external_documents_ids.rs b/src/external_documents_ids.rs index f8765b57e..7c81cdde8 100644 --- a/src/external_documents_ids.rs +++ b/src/external_documents_ids.rs @@ -12,7 +12,14 @@ impl<'a> ExternalDocumentsIds<'a> { ExternalDocumentsIds { hard, soft } } - pub fn get>(&self, external_id: A) -> Option { + pub fn into_static(self) -> ExternalDocumentsIds<'static> { + ExternalDocumentsIds { + hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), + soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), + } + } + + pub fn get>(&self, external_id: A) -> Option { let external_id = external_id.as_ref(); match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { // u64 MAX means deleted in the soft fst map diff --git a/src/index.rs b/src/index.rs index 8da7940a7..76ddd4fef 100644 --- a/src/index.rs +++ b/src/index.rs @@ -7,6 +7,7 @@ use heed::types::*; use heed::{PolyDatabase, Database, RwTxn, RoTxn}; use roaring::RoaringBitmap; +use crate::external_documents_ids::ExternalDocumentsIds; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::Search; @@ -22,7 +23,8 @@ pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; -pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids"; +pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; +pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; #[derive(Clone)] @@ -121,18 +123,33 @@ impl Index { /* external documents ids */ - /// Writes the external documents ids, it is a byte slice (i.e. `[u8]`) - /// and refers to an internal id (i.e. `u32`). - pub fn put_external_documents_ids>(&self, wtxn: &mut RwTxn, fst: &fst::Map) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, EXTERNAL_DOCUMENTS_IDS_KEY, fst.as_fst().as_bytes()) + /// Writes the external documents ids and internal ids (i.e. `u32`). + pub fn put_external_documents_ids<'a>( + &self, + wtxn: &mut RwTxn, + external_documents_ids: &ExternalDocumentsIds<'a>, + ) -> heed::Result<()> + { + let ExternalDocumentsIds { hard, soft } = external_documents_ids; + let hard = hard.as_fst().as_bytes(); + let soft = soft.as_fst().as_bytes(); + self.main.put::<_, Str, ByteSlice>(wtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; + self.main.put::<_, Str, ByteSlice>(wtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; + Ok(()) } - /// Returns the external documents ids map which associate the external ids (i.e. `[u8]`) + /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). - pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, EXTERNAL_DOCUMENTS_IDS_KEY)? { - Some(bytes) => Ok(fst::Map::new(bytes)?.map_data(Cow::Borrowed)?), - None => Ok(fst::Map::default().map_data(Cow::Owned)?), + pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result> { + let hard = self.main.get::<_, Str, ByteSlice>(rtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let soft = self.main.get::<_, Str, ByteSlice>(rtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; + match hard.zip(soft) { + Some((hard, soft)) => { + let hard = fst::Map::new(hard)?.map_data(Cow::Borrowed)?; + let soft = fst::Map::new(soft)?.map_data(Cow::Borrowed)?; + Ok(ExternalDocumentsIds::new(hard, soft)) + }, + None => Ok(ExternalDocumentsIds::default()), } } From 27f3ef5f7a0017e2e40bd88bf4eed40857e91070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 22 Nov 2020 17:53:33 +0100 Subject: [PATCH 5/6] Use the new ExternalDocumentsIds struct in the engine --- src/index.rs | 20 +++++++------- src/lib.rs | 1 + src/update/clear_documents.rs | 4 +-- src/update/delete_documents.rs | 35 +++++++------------------ src/update/index_documents/transform.rs | 30 +++++++-------------- 5 files changed, 32 insertions(+), 58 deletions(-) diff --git a/src/index.rs b/src/index.rs index 76ddd4fef..ccaba4ca6 100644 --- a/src/index.rs +++ b/src/index.rs @@ -7,11 +7,10 @@ use heed::types::*; use heed::{PolyDatabase, Database, RwTxn, RoTxn}; use roaring::RoaringBitmap; -use crate::external_documents_ids::ExternalDocumentsIds; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::Search; -use crate::{BEU32, DocumentId}; +use crate::{BEU32, DocumentId, ExternalDocumentsIds}; use crate::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, @@ -143,14 +142,15 @@ impl Index { pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result> { let hard = self.main.get::<_, Str, ByteSlice>(rtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; let soft = self.main.get::<_, Str, ByteSlice>(rtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; - match hard.zip(soft) { - Some((hard, soft)) => { - let hard = fst::Map::new(hard)?.map_data(Cow::Borrowed)?; - let soft = fst::Map::new(soft)?.map_data(Cow::Borrowed)?; - Ok(ExternalDocumentsIds::new(hard, soft)) - }, - None => Ok(ExternalDocumentsIds::default()), - } + let hard = match hard { + Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, + None => fst::Map::default().map_data(Cow::Owned)?, + }; + let soft = match soft { + Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, + None => fst::Map::default().map_data(Cow::Owned)?, + }; + Ok(ExternalDocumentsIds::new(hard, soft)) } /* fields ids map */ diff --git a/src/lib.rs b/src/lib.rs index fd0156bfa..12a24a59c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; +pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::index::Index; pub use self::search::{Search, SearchResult}; diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index 0e89d43b7..447dca8b4 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -1,5 +1,5 @@ use roaring::RoaringBitmap; -use crate::Index; +use crate::{ExternalDocumentsIds, Index}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -27,7 +27,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; - self.index.put_external_documents_ids(self.wtxn, &fst::Map::default())?; + self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; // Clear the other databases. diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index 5ccce35f6..1913ac033 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -1,16 +1,13 @@ -use std::borrow::Cow; -use std::convert::TryFrom; - -use fst::{IntoStreamer, Streamer}; +use fst::IntoStreamer; use roaring::RoaringBitmap; -use crate::{Index, BEU32, SmallString32}; +use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - external_documents_ids: fst::Map>, + external_documents_ids: ExternalDocumentsIds<'static>, documents_ids: RoaringBitmap, } @@ -22,7 +19,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { { let external_documents_ids = index .external_documents_ids(wtxn)? - .map_data(Cow::into_owned)?; + .into_static(); Ok(DeleteDocuments { wtxn, @@ -41,7 +38,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } pub fn delete_external_id(&mut self, external_id: &str) -> Option { - let docid = self.external_documents_ids.get(external_id).map(|id| u32::try_from(id).unwrap())?; + let docid = self.external_documents_ids.get(external_id)?; self.delete_document(docid); Some(docid) } @@ -112,26 +109,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We create the FST map of the external ids that we must delete. external_ids.sort_unstable(); let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?; - let external_ids_to_delete = fst::Map::from(external_ids_to_delete.into_fst()); - let new_external_documents_ids = { - // We acquire the current external documents ids map and create - // a difference operation between the current and to-delete external ids. - let external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - let difference = external_documents_ids.op().add(&external_ids_to_delete).difference(); - - // We stream the new external ids that does no more contains the to-delete external ids. - let mut iter = difference.into_stream(); - let mut new_external_documents_ids_builder = fst::MapBuilder::memory(); - while let Some((external_id, docids)) = iter.next() { - new_external_documents_ids_builder.insert(external_id, docids[0].value)?; - } - - // We create an FST map from the above builder. - new_external_documents_ids_builder.into_map() - }; + // We acquire the current external documents ids map... + let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; + // ...and remove the to-delete external ids. + new_external_documents_ids.delete_ids(external_ids_to_delete)?; // We write the new external ids into the main database. + let new_external_documents_ids = new_external_documents_ids.into_static(); self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; // Maybe we can improve the get performance of the words diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs index 3c6acd1a9..54619ed4f 100644 --- a/src/update/index_documents/transform.rs +++ b/src/update/index_documents/transform.rs @@ -6,13 +6,12 @@ use std::iter::Peekable; use std::time::Instant; use anyhow::{anyhow, Context}; -use fst::{IntoStreamer, Streamer}; use grenad::CompressionType; use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use crate::{BEU32, MergeFn, Index, FieldsIdsMap}; +use crate::{BEU32, MergeFn, Index, FieldsIdsMap, ExternalDocumentsIds}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; @@ -20,7 +19,7 @@ use super::{create_writer, create_sorter, IndexDocumentsMethod}; pub struct TransformOutput { pub primary_key: u8, pub fields_ids_map: FieldsIdsMap, - pub external_documents_ids: fst::Map>, + pub external_documents_ids: ExternalDocumentsIds<'static>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, pub documents_count: usize, @@ -116,7 +115,7 @@ impl Transform<'_, '_> { return Ok(TransformOutput { primary_key, fields_ids_map, - external_documents_ids: fst::Map::default(), + external_documents_ids: ExternalDocumentsIds::default(), new_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(), documents_count: 0, @@ -370,7 +369,7 @@ impl Transform<'_, '_> { primary_key: u8, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, - external_documents_ids: fst::Map>, + mut external_documents_ids: ExternalDocumentsIds<'_>, progress_callback: F, ) -> anyhow::Result where @@ -457,28 +456,17 @@ impl Transform<'_, '_> { let mut documents_file = writer.into_inner()?; documents_file.seek(SeekFrom::Start(0))?; - // We create the union between the existing external documents ids with the new ones. - let new_external_documents_ids = new_external_documents_ids_builder.into_map(); - let union_op = fst::map::OpBuilder::new() - .add(&external_documents_ids) - .add(&new_external_documents_ids) - .r#union(); - - // We stream and merge the new external documents ids map with the existing one. let before_docids_merging = Instant::now(); - let mut external_documents_ids_builder = fst::MapBuilder::memory(); - let mut iter = union_op.into_stream(); - while let Some((external_id, vals)) = iter.next() { - assert_eq!(vals.len(), 1, "there must be exactly one document id"); - external_documents_ids_builder.insert(external_id, vals[0].value)?; - } + // We merge the new external ids with existing external documents ids. + let new_external_documents_ids = new_external_documents_ids_builder.into_map(); + external_documents_ids.insert_ids(&new_external_documents_ids)?; info!("Documents external merging took {:.02?}", before_docids_merging.elapsed()); Ok(TransformOutput { primary_key, fields_ids_map, - external_documents_ids: external_documents_ids_builder.into_map(), + external_documents_ids: external_documents_ids.into_static(), new_documents_ids, replaced_documents_ids, documents_count, @@ -531,7 +519,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map, - external_documents_ids: external_documents_ids.map_data(Cow::into_owned)?, + external_documents_ids: external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), documents_count, From 05c95dfdc6cf991fb91bb740bb850d5105e396fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 22 Nov 2020 18:21:22 +0100 Subject: [PATCH 6/6] Introduce an infos subcommand that patches the external documents ids --- src/subcommand/infos.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index aa5cd3d7b..0c7fa36c6 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -13,6 +13,7 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const DOCUMENTS_DB_NAME: &str = "documents"; +const USERS_IDS_DOCUMENTS_IDS: &[u8] = b"users-ids-documents-ids"; const ALL_DATABASE_NAMES: &[&str] = &[ MAIN_DB_NAME, @@ -137,6 +138,10 @@ enum Command { #[structopt(short, long, default_value = "words.fst")] output: PathBuf, }, + + /// A command that patches the old external ids + /// into the new external ids format. + PatchToNewExternalIds, } pub fn run(opt: Opt) -> anyhow::Result<()> { @@ -171,9 +176,32 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) }, ExportWordsFst { output } => export_words_fst(&index, &rtxn, output), + PatchToNewExternalIds => { + drop(rtxn); + let mut wtxn = index.write_txn()?; + let result = patch_to_new_external_ids(&index, &mut wtxn); + wtxn.commit()?; + result + } } } +fn patch_to_new_external_ids(index: &Index, wtxn: &mut heed::RwTxn) -> anyhow::Result<()> { + use heed::types::ByteSlice; + + if let Some(documents_ids) = index.main.get::<_, ByteSlice, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)? { + let documents_ids = documents_ids.to_owned(); + index.main.put::<_, ByteSlice, ByteSlice>( + wtxn, + crate::index::HARD_EXTERNAL_DOCUMENTS_IDS_KEY.as_bytes(), + &documents_ids, + )?; + index.main.delete::<_, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)?; + } + + Ok(()) +} + fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { use std::collections::BinaryHeap; use std::cmp::Reverse;