convert the field_distribution to a BTreeMap and avoid counting the same documents twice

Tamo 2021-06-17 17:05:34 +02:00
parent 969adaefdf
commit d08cfda796
4 changed files with 70 additions and 12 deletions
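
field_distribution tracks, for every field name, how many documents in the index contain that field. Before this change, replacing or re-adding a document added its fields to those counts again without first subtracting the counts of its previous version, so the same document could be counted twice. Below is a minimal standalone sketch of the corrected bookkeeping; the `Document` alias and the `replace_document` helper are made up for illustration and are not part of milli's indexing code.

```rust
use std::collections::BTreeMap;

// Same shape as the new `FieldsDistribution` alias introduced in this commit.
type FieldsDistribution = BTreeMap<String, u64>;

// Hypothetical, highly simplified document: just the names of its fields.
type Document = Vec<String>;

/// Replace the stored version of a document: first subtract the fields of the
/// previous version, then count the fields of the new one. Skipping the first
/// step is the double-counting bug this commit fixes.
fn replace_document(
    distribution: &mut FieldsDistribution,
    previous_version: Option<&Document>,
    new_version: &Document,
) {
    if let Some(previous) = previous_version {
        for field in previous {
            if let Some(count) = distribution.get_mut(field) {
                *count = count.saturating_sub(1);
            }
        }
        // Fields whose count dropped to zero no longer belong in the distribution.
        distribution.retain(|_, count| *count > 0);
    }
    for field in new_version {
        *distribution.entry(field.clone()).or_insert(0) += 1;
    }
}

fn main() {
    let mut distribution = FieldsDistribution::new();
    let doc = vec!["id".to_string(), "name".to_string()];

    replace_document(&mut distribution, None, &doc);
    // Re-indexing the exact same document must not inflate the counts.
    replace_document(&mut distribution, Some(&doc), &doc);
    assert_eq!(distribution.get("id"), Some(&1));
    assert_eq!(distribution.get("name"), Some(&1));
}
```

Adding the same documents a second time therefore leaves the distribution unchanged, which is exactly what the new tests below assert.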


@@ -791,7 +791,7 @@ pub(crate) mod tests {
     use std::ops::Deref;
 
     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::btreemap;
     use tempfile::TempDir;
 
     use crate::update::{IndexDocuments, UpdateFormat};
@@ -845,11 +845,54 @@ pub(crate) mod tests {
         let field_distribution = index.field_distribution(&rtxn).unwrap();
         assert_eq!(
             field_distribution,
-            hashmap! {
+            btreemap! {
                 "id".to_string() => 2,
                 "name".to_string() => 2,
                 "age".to_string() => 1,
             }
         );
+
+        // we add all the documents a second time. we are supposed to get the same
+        // field_distribution in the end
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "age".to_string() => 1,
+            }
+        );
+
+        // then we update a document by removing one field and another by adding one field
+        let content = &br#"[
+            { "id": 1, "name": "kevin", "has_dog": true },
+            { "id": 2, "name": "bob" }
+        ]"#[..];
+
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "has_dog".to_string() => 1,
+            }
+        );
     }
 }


@@ -14,7 +14,7 @@ pub mod tree_level;
 pub mod update;
 
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::hash::BuildHasherDefault;
 use std::result::Result as StdResult;
 
@@ -50,7 +50,7 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u8;
 pub type Position = u32;
-pub type FieldsDistribution = HashMap<String, u64>;
+pub type FieldsDistribution = BTreeMap<String, u64>;
 
 type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
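
One practical consequence of switching the `FieldsDistribution` alias from `HashMap` to `BTreeMap` is that iterating the distribution (and therefore serializing or comparing it, as the tests above do) happens in sorted key order and is deterministic across runs, whereas a `HashMap` order depends on the randomized hasher. The commit does not spell out its motivation, so treat this as an observation rather than the stated rationale. A small illustration:

```rust
use std::collections::BTreeMap;

type FieldsDistribution = BTreeMap<String, u64>;

fn main() {
    let mut distribution = FieldsDistribution::new();
    distribution.insert("name".to_string(), 2);
    distribution.insert("id".to_string(), 2);
    distribution.insert("age".to_string(), 1);

    // A BTreeMap always yields its keys in sorted order: age, id, name.
    // A HashMap would yield them in an order that can change from run to run.
    let keys: Vec<&str> = distribution.keys().map(String::as_str).collect();
    assert_eq!(keys, ["age", "id", "name"]);
}
```

The tests accordingly switch from maplit's hashmap! macro to its btreemap! macro when building the expected value, the literal-construction counterpart of this type change.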


@@ -1,4 +1,4 @@
-use std::collections::hash_map::Entry;
+use std::collections::btree_map::Entry;
 use std::collections::HashMap;
 
 use chrono::Utc;


@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::btree_map::Entry;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@@ -419,11 +420,6 @@ impl Transform<'_, '_> {
                 // we use it and insert it in the list of replaced documents.
                 replaced_documents_ids.insert(docid);
 
-                // Depending on the update indexing method we will merge
-                // the document update with the current document or not.
-                match self.index_documents_method {
-                    IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
-                    IndexDocumentsMethod::UpdateDocuments => {
                 let key = BEU32::new(docid);
                 let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
                     InternalError::DatabaseMissingEntry {
@@ -431,6 +427,25 @@ impl Transform<'_, '_> {
                         key: None,
                     },
                 )?;
+
+                // we remove all the fields that were already counted
+                for (field_id, _) in base_obkv.iter() {
+                    let field_name = fields_ids_map.name(field_id).unwrap();
+                    if let Entry::Occupied(mut entry) =
+                        field_distribution.entry(field_name.to_string())
+                    {
+                        match entry.get().checked_sub(1) {
+                            Some(0) | None => entry.remove(),
+                            Some(count) => entry.insert(count),
+                        };
+                    }
+                }
+
+                // Depending on the update indexing method we will merge
+                // the document update with the current document or not.
+                match self.index_documents_method {
+                    IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
+                    IndexDocumentsMethod::UpdateDocuments => {
                         let update_obkv = obkv::KvReader::new(update_obkv);
                         merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
                         (docid, obkv_buffer.as_slice())
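
The decrement loop added above relies on the btree_map::Entry API together with checked_sub so that a field whose count reaches zero (or would underflow) is removed from the map instead of lingering with a count of 0. Here is the same pattern lifted out of the diff into a small standalone helper; `decrement_field` is not a milli function, it only restates the diff's logic so it can be read and exercised in isolation:

```rust
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

type FieldsDistribution = BTreeMap<String, u64>;

/// Decrement the count of `field_name`, removing the entry entirely when the
/// count reaches zero (or when the subtraction would underflow).
fn decrement_field(distribution: &mut FieldsDistribution, field_name: &str) {
    if let Entry::Occupied(mut entry) = distribution.entry(field_name.to_string()) {
        match entry.get().checked_sub(1) {
            // `checked_sub` returns None on underflow; in both cases the field
            // no longer belongs in the distribution.
            Some(0) | None => {
                entry.remove();
            }
            Some(count) => {
                entry.insert(count);
            }
        }
    }
}

fn main() {
    let mut distribution: FieldsDistribution =
        [("id".to_string(), 2), ("age".to_string(), 1)].into_iter().collect();

    decrement_field(&mut distribution, "age");
    assert!(!distribution.contains_key("age")); // dropped once it hits zero

    decrement_field(&mut distribution, "id");
    assert_eq!(distribution.get("id"), Some(&1));
}
```

In the diff, this decrement runs for every field of the document's previous version (base_obkv) before the replacement or merged version is counted again, so the re-count starts from a clean slate instead of piling onto the old totals.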