document update

mpostma 2020-05-05 22:28:46 +02:00
parent 270c7b0288
commit e07fe017c1
9 changed files with 56 additions and 85 deletions

Cargo.lock (generated)

@@ -1685,6 +1685,7 @@ dependencies = [
 "serde",
 "serde_json",
 "toml",
+"zerocopy",
 ]

 [[package]]


@@ -197,7 +197,7 @@ impl fmt::Display for FacetError {
             InvalidFormat(found) => write!(f, "invalid facet: {}, facets should be \"facetName:facetValue\"", found),
             AttributeNotFound(attr) => write!(f, "unknown {:?} attribute", attr),
             AttributeNotSet { found, expected } => write!(f, "`{}` is not set as a faceted attribute. available facet attributes: {}", found, expected.join(", ")),
-            InvalidDocumentAttribute(attr) => write!(f, "invalid document attribute {}, accepted types: string and [string]", attr),
+            InvalidDocumentAttribute(attr) => write!(f, "invalid document attribute {}, accepted types: String and [String]", attr),
        }
    }
}
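
Note: the corrected message spells out the rule this error enforces. A hedged sketch of document attributes that pass or fail faceting (serde_json used purely for illustration):

use serde_json::json;

fn main() {
    // Accepted for a faceted attribute: a String or an array of Strings.
    let single = json!({ "id": 1, "genre": "comedy" });
    let multi  = json!({ "id": 2, "genre": ["comedy", "romance"] });
    // Anything else (here a number) would trigger InvalidDocumentAttribute.
    let bad    = json!({ "id": 3, "genre": 42 });

    assert!(single["genre"].is_string());
    assert!(multi["genre"].is_array());
    assert!(!bad["genre"].is_string() && !bad["genre"].is_array());
}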


@@ -70,7 +70,7 @@ impl FacetFilter {
                }
                return Ok(Self(filter));
            }
-            bad_value => Err(FacetError::unexpected_token(&["String"], bad_value)),
+            bad_value => Err(FacetError::unexpected_token(&["Array"], bad_value)),
        }
    }
}
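
For context, a hedged sketch of the shape this parser now reports as expected: the top-level facet filter must be an Array whose items are "facetName:facetValue" strings or arrays of such strings (assuming MeiliSearch's documented facetFilters semantics, where inner arrays are OR groups and outer items are AND-ed):

use serde_json::json;

fn main() {
    // The outer value must be an array, hence the token fix from "String" to "Array".
    let filters = json!(["genre:comedy", ["director:Anderson", "director:Coen"]]);
    assert!(filters.is_array());

    // A bare string at the top level would hit the unexpected_token branch above.
    let bad = json!("genre:comedy");
    assert!(!bad.is_array());
}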


@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::sync::Arc;
 use std::collections::HashMap;


@@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
 use crate::database::{MainT, UpdateT};
 use crate::database::{UpdateEvent, UpdateEventsEmitter};
+use crate::facets;
 use crate::raw_indexer::RawIndexer;
 use crate::serde::{extract_document_id, serialize_value_with_id, Deserializer, Serializer};
 use crate::store;
@@ -103,10 +104,11 @@ pub fn push_documents_addition<D: serde::Serialize>(
     Ok(last_update_id)
 }

-pub fn apply_documents_addition<'a, 'b>(
+pub fn apply_addition<'a, 'b>(
     writer: &'a mut heed::RwTxn<'b, MainT>,
     index: &store::Index,
     addition: Vec<IndexMap<String, serde_json::Value>>,
+    partial: bool
 ) -> MResult<()> {
     let mut documents_additions = HashMap::new();
@@ -118,12 +120,30 @@ pub fn apply_documents_addition<'a, 'b>(
     let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?;

     // 1. store documents ids for future deletion
-    for document in addition {
+    for mut document in addition {
         let document_id = match extract_document_id(&primary_key, &document)? {
             Some(id) => id,
             None => return Err(Error::MissingDocumentId),
         };

+        if partial {
+            let mut deserializer = Deserializer {
+                document_id,
+                reader: writer,
+                documents_fields: index.documents_fields,
+                schema: &schema,
+                fields: None,
+            };
+
+            // retrieve the old document and
+            // update the new one with missing keys found in the old one
+            let result = Option::<HashMap<String, serde_json::Value>>::deserialize(&mut deserializer)?;
+            if let Some(old_document) = result {
+                for (key, value) in old_document {
+                    document.entry(key).or_insert(value);
+                }
+            }
+        }
+
         documents_additions.insert(document_id, document);
     }
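
The backfill above relies on IndexMap's entry API: keys already present in the partial update win, and anything missing is copied over from the stored document. A standalone sketch of that merge rule (indexmap and serde_json assumed as dependencies):

use indexmap::IndexMap;
use serde_json::{json, Value};

fn main() {
    // The incoming partial update: only `title` is being changed.
    let mut document: IndexMap<String, Value> = IndexMap::new();
    document.insert("id".into(), json!(1));
    document.insert("title".into(), json!("new title"));

    // The previously stored version of the same document.
    let old_document: IndexMap<String, Value> = vec![
        ("id".to_string(), json!(1)),
        ("title".to_string(), json!("old title")),
        ("genre".to_string(), json!("drama")),
    ].into_iter().collect();

    for (key, value) in old_document {
        document.entry(key).or_insert(value);
    }

    assert_eq!(document["title"], json!("new title")); // the update wins
    assert_eq!(document["genre"], json!("drama"));     // backfilled from the old doc
}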
@@ -143,6 +163,11 @@ pub fn apply_documents_addition<'a, 'b>(
     };

     // 3. index the documents fields in the stores
+    if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
+        let facet_map = facets::facet_map_from_docs(&schema, &documents_additions, attributes_for_facetting.as_ref())?;
+        index.facets.add(writer, facet_map)?;
+    }
+
     let mut indexer = RawIndexer::new(stop_words);

     for (document_id, document) in documents_additions {
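
facet_map_from_docs groups the incoming documents' faceted values before they are handed to the facets store. Conceptually (a hedged simplification, not the crate's real types), a facet map associates each (field, value) pair with the ids of the matching documents:

use std::collections::{BTreeMap, BTreeSet};

fn main() {
    // Hypothetical simplification of a facet map: (field, value) -> document ids.
    let mut facet_map: BTreeMap<(String, String), BTreeSet<u64>> = BTreeMap::new();
    facet_map.entry(("genre".into(), "comedy".into())).or_default().insert(7);
    facet_map.entry(("genre".into(), "comedy".into())).or_default().insert(42);

    // A filter like "genre:comedy" then reduces to a single key lookup.
    let ids = &facet_map[&("genre".to_string(), "comedy".to_string())];
    assert_eq!(ids.len(), 2);
}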
@@ -177,85 +202,15 @@ pub fn apply_documents_partial_addition<'a, 'b>(
     index: &store::Index,
     addition: Vec<IndexMap<String, serde_json::Value>>,
 ) -> MResult<()> {
-    let mut documents_additions = HashMap::new();
-
-    let mut schema = match index.main.schema(writer)? {
-        Some(schema) => schema,
-        None => return Err(Error::SchemaMissing),
-    };
-
-    let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?;
-
-    // 1. store documents ids for future deletion
-    for mut document in addition {
-        let document_id = match extract_document_id(&primary_key, &document)? {
-            Some(id) => id,
-            None => return Err(Error::MissingDocumentId),
-        };
-
-        let mut deserializer = Deserializer {
-            document_id,
-            reader: writer,
-            documents_fields: index.documents_fields,
-            schema: &schema,
-            fields: None,
-        };
-
-        // retrieve the old document and
-        // update the new one with missing keys found in the old one
-        let result = Option::<HashMap<String, serde_json::Value>>::deserialize(&mut deserializer)?;
-        if let Some(old_document) = result {
-            for (key, value) in old_document {
-                document.entry(key).or_insert(value);
-            }
-        }
-
-        documents_additions.insert(document_id, document);
-    }
-
-    // 2. remove the documents posting lists
-    let number_of_inserted_documents = documents_additions.len();
-    let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
-    apply_documents_deletion(writer, index, documents_ids)?;
-
-    let mut ranked_map = match index.main.ranked_map(writer)? {
-        Some(ranked_map) => ranked_map,
-        None => RankedMap::default(),
-    };
-
-    let stop_words = match index.main.stop_words_fst(writer)? {
-        Some(stop_words) => stop_words,
-        None => fst::Set::default(),
-    };
-
-    // 3. index the documents fields in the stores
-    let mut indexer = RawIndexer::new(stop_words);
-
-    for (document_id, document) in documents_additions {
-        let serializer = Serializer {
-            txn: writer,
-            schema: &mut schema,
-            document_store: index.documents_fields,
-            document_fields_counts: index.documents_fields_counts,
-            indexer: &mut indexer,
-            ranked_map: &mut ranked_map,
-            document_id,
-        };
-
-        document.serialize(serializer)?;
-    }
-
-    write_documents_addition_index(
-        writer,
-        index,
-        &ranked_map,
-        number_of_inserted_documents,
-        indexer,
-    )?;
-
-    index.main.put_schema(writer, &schema)?;
-
-    Ok(())
+    apply_addition(writer, index, addition, true)
+}
+
+pub fn apply_documents_addition<'a, 'b>(
+    writer: &'a mut heed::RwTxn<'b, MainT>,
+    index: &store::Index,
+    addition: Vec<IndexMap<String, serde_json::Value>>,
+) -> MResult<()> {
+    apply_addition(writer, index, addition, false)
 }

 pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
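
The two public entry points now share one worker, with `partial` toggling only the merge-with-old-document step. A minimal standalone sketch of the pattern (simplified signatures, not the crate's real ones):

fn apply_addition(docs: Vec<String>, partial: bool) -> Result<(), String> {
    for doc in docs {
        if partial {
            // merge `doc` with the previously stored version here
        }
        // ...index `doc` as in steps 1-3 above...
        let _ = doc;
    }
    Ok(())
}

pub fn apply_documents_addition(docs: Vec<String>) -> Result<(), String> {
    apply_addition(docs, false) // full replacement of each document
}

pub fn apply_documents_partial_addition(docs: Vec<String>) -> Result<(), String> {
    apply_addition(docs, true) // merge with the stored document
}

fn main() {
    assert!(apply_documents_addition(vec!["a".into()]).is_ok());
    assert!(apply_documents_partial_addition(vec!["b".into()]).is_ok());
}

A bool flag keeps the diff small; an enum would read more explicitly at call sites if a third addition mode ever appeared.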
@@ -277,6 +232,7 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
     index.main.put_words_fst(writer, &fst::Set::default())?;
     index.main.put_ranked_map(writer, &ranked_map)?;
     index.main.put_number_of_documents(writer, |_| 0)?;
+    index.facets.clear(writer)?;
     index.postings_lists.clear(writer)?;
     index.docs_words.clear(writer)?;
@@ -289,6 +245,11 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
     let mut indexer = RawIndexer::new(stop_words);
     let mut ram_store = HashMap::new();

+    if let Some(ref attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
+        let facet_map = facets::facet_map_from_docids(writer, &index, &documents_ids_to_reindex, &attributes_for_facetting)?;
+        index.facets.add(writer, facet_map)?;
+    }
+    // ^-- https://github.com/meilisearch/MeiliSearch/pull/631#issuecomment-626624470 --v
+
     for document_id in documents_ids_to_reindex {
         for result in index.documents_fields.document_fields(writer, document_id)? {
             let (field_id, bytes) = result?;


@@ -6,6 +6,7 @@ use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
 use crate::database::{MainT, UpdateT};
 use crate::database::{UpdateEvent, UpdateEventsEmitter};
+use crate::facets;
 use crate::serde::extract_document_id;
 use crate::store;
 use crate::update::{next_update_id, compute_short_prefixes, Update};
@@ -88,8 +89,6 @@ pub fn apply_documents_deletion(
     index: &store::Index,
     deletion: Vec<DocumentId>,
 ) -> MResult<()> {
-    let idset = SetBuf::from_dirty(deletion);
-
     let schema = match index.main.schema(writer)? {
         Some(schema) => schema,
         None => return Err(Error::SchemaMissing),
@@ -100,9 +99,16 @@ pub fn apply_documents_deletion(
         None => RankedMap::default(),
     };

+    // facet filters deletion
+    if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
+        let facet_map = facets::facet_map_from_docids(writer, &index, &deletion, &attributes_for_facetting)?;
+        index.facets.remove(writer, facet_map)?;
+    }
+
     // collect the ranked attributes according to the schema
     let ranked_fields = schema.ranked();

+    let idset = SetBuf::from_dirty(deletion);
     let mut words_document_ids = HashMap::new();
     for id in idset {
         // remove all the ranked attributes from the ranked_map
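
SetBuf::from_dirty comes from the sdset crate imported at the top of this file: it sorts and deduplicates the Vec so the ids form a proper set before the set operations below. Its construction also has to move after the facet pass, which still needs to borrow `deletion`. A small sketch of the normalization:

use sdset::SetBuf;

fn main() {
    // Duplicate and out-of-order ids are fine; from_dirty normalizes them.
    let ids: Vec<u64> = vec![3, 1, 2, 3, 1];
    let idset = SetBuf::from_dirty(ids);
    assert_eq!(idset.as_slice(), &[1, 2, 3]);
}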


@@ -11,3 +11,4 @@ indexmap = { version = "1.3.2", features = ["serde-1"] }
 serde = { version = "1.0.105", features = ["derive"] }
 serde_json = { version = "1.0.50", features = ["preserve_order"] }
 toml = { version = "0.5.6", features = ["preserve_order"] }
+zerocopy = "0.3.0"


@@ -6,6 +6,7 @@ pub use error::{Error, SResult};
 pub use fields_map::FieldsMap;
 pub use schema::Schema;
 use serde::{Deserialize, Serialize};
+use zerocopy::{AsBytes, FromBytes};

 #[derive(Serialize, Deserialize, Debug, Copy, Clone, Default, PartialOrd, Ord, PartialEq, Eq, Hash)]
 pub struct IndexedPos(pub u16);