Make documents additions accept only the last duplicate document

2025-07-04 20:37:15 +02:00 · 2019-10-28 20:40:33 +01:00 · 2019-10-28 20:40:33 +01:00 · ac12a4b9c9
commit ac12a4b9c9
parent af96050944
2 changed files with 8 additions and 16 deletions
--- a/meilidb-core/src/error.rs
+++ b/meilidb-core/src/error.rs
@ -12,7 +12,6 @@ pub enum Error {
    SchemaMissing,
    WordIndexMissing,
    MissingDocumentId,
-    DuplicateDocument,
    Zlmdb(heed::Error),
    Fst(fst::Error),
    SerdeJson(SerdeJsonError),
@ -80,7 +79,6 @@ impl fmt::Display for Error {
            SchemaMissing => write!(f, "this index does not have a schema"),
            WordIndexMissing => write!(f, "this index does not have a word index"),
            MissingDocumentId => write!(f, "document id is missing"),
-            DuplicateDocument => write!(f, "update contains documents with the same id"),
            Zlmdb(e) => write!(f, "heed error; {}", e),
            Fst(e) => write!(f, "fst error; {}", e),
            SerdeJson(e) => write!(f, "serde json error; {}", e),
--- a/meilidb-core/src/update/documents_addition.rs
+++ b/meilidb-core/src/update/documents_addition.rs
@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;

 use fst::{set::OpBuilder, SetBuilder};
 use sdset::{duo::Union, SetOperation};
@ -86,7 +86,7 @@ pub fn apply_documents_addition(
    docs_words_store: store::DocsWords,
    addition: Vec<serde_json::Value>,
 ) -> MResult<()> {
-    let mut documents_ids = HashSet::new();
+    let mut documents_additions = HashMap::new();
    let mut indexer = RawIndexer::new();

    let schema = match main_store.schema(writer)? {
@ -97,19 +97,18 @@ pub fn apply_documents_addition(
    let identifier = schema.identifier_name();

    // 1. store documents ids for future deletion
-    for document in addition.iter() {
+    for document in addition {
        let document_id = match extract_document_id(identifier, &document)? {
            Some(id) => id,
            None => return Err(Error::MissingDocumentId),
        };

-        if !documents_ids.insert(document_id) {
-            return Err(Error::DuplicateDocument);
-        }
+        documents_additions.insert(document_id, document);
    }

    // 2. remove the documents posting lists
-    let number_of_inserted_documents = documents_ids.len();
+    let number_of_inserted_documents = documents_additions.len();
+    let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
    apply_documents_deletion(
        writer,
        main_store,
@ -117,7 +116,7 @@ pub fn apply_documents_addition(
        documents_fields_counts_store,
        postings_lists_store,
        docs_words_store,
-        documents_ids.into_iter().collect(),
+        documents_ids,
    )?;

    let mut ranked_map = match main_store.ranked_map(writer)? {
@ -126,12 +125,7 @@ pub fn apply_documents_addition(
    };

    // 3. index the documents fields in the stores
-    for document in addition {
-        let document_id = match extract_document_id(identifier, &document)? {
-            Some(id) => id,
-            None => return Err(Error::MissingDocumentId),
-        };
-
+    for (document_id, document) in documents_additions {
        let serializer = Serializer {
            txn: writer,
            schema: &schema,