Fix a documents indexing bug and add a test

2025-04-18 07:57:59 +02:00 · 2020-10-30 12:14:25 +01:00 · 2020-10-30 12:14:25 +01:00 · 7cc1a358f5
commit 7cc1a358f5
parent 99da69c85f
2 changed files with 59 additions and 6 deletions
--- a/src/update/delete_documents.rs
+++ b/src/update/delete_documents.rs
@ -47,7 +47,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
    }
    pub fn execute(self) -> anyhow::Result<usize> {
-        // We retrieve remove the deleted documents ids and write them into the database.
+        // We retrieve the current documents ids that are in the database.
        let mut documents_ids = self.index.documents_ids(self.wtxn)?;
        // We can and must stop removing documents in a database that is empty.
@ -55,8 +55,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            return Ok(0);
        }
        // We remove the documents ids that we want to delete
        // from the documents in the database and write them back.
        let current_documents_ids_len = documents_ids.len();
-        documents_ids.intersect_with(&self.documents_ids);
+        documents_ids.difference_with(&self.documents_ids);
        self.index.put_documents_ids(self.wtxn, &documents_ids)?;
        // We can execute a ClearDocuments operation when the number of documents
@ -80,7 +82,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        // Retrieve the words and the users ids contained in the documents.
        let mut words = Vec::new();
        let mut users_ids = Vec::new();
-        for docid in &documents_ids {
+        for docid in &self.documents_ids {
            // We create an iterator to be able to get the content and delete the document
            // content itself. It's faster to acquire a cursor to get and delete,
            // as we avoid traversing the LMDB B-Tree two times but only once.
@ -144,7 +146,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
            if let Some((key, mut docids)) = iter.next().transpose()? {
                if key == word.as_ref() {
-                    docids.difference_with(&mut documents_ids);
+                    docids.difference_with(&self.documents_ids);
                    if docids.is_empty() {
                        iter.del_current()?;
                        *must_remove = true;
@ -181,7 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        let mut iter = word_pair_proximity_docids.iter_mut(self.wtxn)?;
        while let Some(result) = iter.next() {
            let ((w1, w2, prox), mut docids) = result?;
-            docids.difference_with(&documents_ids);
+            docids.difference_with(&self.documents_ids);
            if docids.is_empty() {
                iter.del_current()?;
            } else {
@ -189,6 +191,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            }
        }
-        Ok(documents_ids.len() as usize)
+        Ok(self.documents_ids.len() as usize)
    }
 }
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@ -484,3 +484,54 @@ impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> {
        Ok(())
    }
 }
 #[cfg(test)]
 mod tests {
    use heed::EnvOpenOptions;

    use super::*;

    /// Sending documents whose ids are already present in the index must
    /// leave the number of stored documents unchanged.
    #[test]
    fn simple_replacement() {
        // Open a fresh index inside a temporary directory.
        let tmp_dir = tempfile::tempdir().unwrap();
        let mut env_options = EnvOpenOptions::new();
        env_options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(env_options, &tmp_dir).unwrap();

        // Indexes the given CSV payload in its own write transaction, commits
        // it, then reports the document count seen by a new read transaction.
        let index_then_count = |csv: &[u8]| {
            let mut wtxn = index.write_txn().unwrap();
            IndexDocuments::new(&mut wtxn, &index).execute(csv, |_, _| ()).unwrap();
            wtxn.commit().unwrap();

            let rtxn = index.read_txn().unwrap();
            index.number_of_documents(&rtxn).unwrap()
        };

        // First round: three brand new documents with ids 1 to 3,
        // so the index must now contain 3 documents.
        let initial = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
        assert_eq!(index_then_count(initial), 3);

        // Second round: a single document reusing id 1. It takes the place of
        // the stored one, so there must still be 3 documents.
        let single = &b"id,name\n1,updated kevin\n"[..];
        assert_eq!(index_then_count(single), 3);

        // Third round: all three ids are sent again. Every document is
        // replaced, so there must still be 3 documents.
        let all = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..];
        assert_eq!(index_then_count(all), 3);
    }
 }