From b377003192fd00925b6ea1b50b78c780c7d4ac70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2019 14:07:10 +0200 Subject: [PATCH] Compute and store the number of words in documents fields --- meilidb-core/src/serde/indexer.rs | 26 +++++++++---------- meilidb-core/src/serde/serializer.rs | 15 +++++++++-- meilidb-core/src/update/documents_addition.rs | 11 +++++++- meilidb-core/src/update/documents_deletion.rs | 2 ++ meilidb-core/src/update/mod.rs | 2 ++ 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/meilidb-core/src/serde/indexer.rs b/meilidb-core/src/serde/indexer.rs index 69a7ddecf..514b97951 100644 --- a/meilidb-core/src/serde/indexer.rs +++ b/meilidb-core/src/serde/indexer.rs @@ -13,7 +13,7 @@ pub struct Indexer<'a> { } impl<'a> ser::Serializer for Indexer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; type SerializeSeq = SeqIndexer<'a>; type SerializeTuple = TupleIndexer<'a>; @@ -83,8 +83,8 @@ impl<'a> ser::Serializer for Indexer<'a> { } fn serialize_str(self, text: &str) -> Result { - self.indexer.index_text(self.document_id, self.attribute, text); - Ok(()) + let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text); + Ok(Some(number_of_words)) } fn serialize_bytes(self, _v: &[u8]) -> Result { @@ -99,8 +99,8 @@ impl<'a> ser::Serializer for Indexer<'a> { where T: ser::Serialize, { let text = value.serialize(ConvertToString)?; - self.indexer.index_text(self.document_id, self.attribute, &text); - Ok(()) + let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text); + Ok(Some(number_of_words)) } fn serialize_unit(self) -> Result { @@ -225,7 +225,7 @@ pub struct SeqIndexer<'a> { } impl<'a> ser::SerializeSeq for SeqIndexer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> @@ -239,7 +239,7 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - Ok(()) + Ok(None) } } @@ -251,7 +251,7 @@ pub struct MapIndexer<'a> { } impl<'a> ser::SerializeMap for MapIndexer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> @@ -273,7 +273,7 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - Ok(()) + Ok(None) } } @@ -285,7 +285,7 @@ pub struct StructSerializer<'a> { } impl<'a> ser::SerializeStruct for StructSerializer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; fn serialize_field( @@ -305,7 +305,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - Ok(()) + Ok(None) } } @@ -317,7 +317,7 @@ pub struct TupleIndexer<'a> { } impl<'a> ser::SerializeTuple for TupleIndexer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> @@ -331,6 +331,6 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - Ok(()) + Ok(None) } } diff --git a/meilidb-core/src/serde/serializer.rs b/meilidb-core/src/serde/serializer.rs index 57e436c5a..de09c57bc 100644 --- a/meilidb-core/src/serde/serializer.rs +++ b/meilidb-core/src/serde/serializer.rs @@ -1,4 +1,5 @@ -use meilidb_schema::Schema; +use std::collections::HashMap; +use meilidb_schema::{Schema, SchemaAttr}; use serde::ser; use crate::{DocumentId, RankedMap}; @@ -10,6 +11,7 @@ use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer}; pub struct Serializer<'a> { pub schema: &'a Schema, pub document_store: &'a mut RamDocumentStore, + pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, pub indexer: &'a mut RawIndexer, pub ranked_map: &'a mut RankedMap, pub document_id: DocumentId, @@ -135,6 +137,7 @@ impl<'a> ser::Serializer for Serializer<'a> { schema: self.schema, document_id: self.document_id, document_store: self.document_store, + document_fields_counts: self.document_fields_counts, indexer: self.indexer, ranked_map: self.ranked_map, current_key_name: None, @@ -151,6 +154,7 @@ impl<'a> ser::Serializer for Serializer<'a> { schema: self.schema, document_id: self.document_id, document_store: self.document_store, + document_fields_counts: self.document_fields_counts, indexer: self.indexer, ranked_map: self.ranked_map, }) @@ -172,6 +176,7 @@ pub struct MapSerializer<'a> { schema: &'a Schema, document_id: DocumentId, document_store: &'a mut RamDocumentStore, + document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, indexer: &'a mut RawIndexer, ranked_map: &'a mut RankedMap, current_key_name: Option, @@ -209,6 +214,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { self.schema, self.document_id, self.document_store, + self.document_fields_counts, self.indexer, self.ranked_map, &key, @@ -225,6 +231,7 @@ pub struct StructSerializer<'a> { schema: &'a Schema, document_id: DocumentId, document_store: &'a mut RamDocumentStore, + document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, indexer: &'a mut RawIndexer, ranked_map: &'a mut RankedMap, } @@ -244,6 +251,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { self.schema, self.document_id, self.document_store, + self.document_fields_counts, self.indexer, self.ranked_map, key, @@ -260,6 +268,7 @@ fn serialize_value( schema: &Schema, document_id: DocumentId, document_store: &mut RamDocumentStore, + documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>, indexer: &mut RawIndexer, ranked_map: &mut RankedMap, key: &str, @@ -275,7 +284,9 @@ where T: ser::Serialize, if props.is_indexed() { let indexer = Indexer { attribute, indexer, document_id }; - value.serialize(indexer)?; + if let Some(number_of_words) = value.serialize(indexer)? { + documents_fields_counts.insert((document_id, attribute), number_of_words as u64); + } } if props.is_ranked() { diff --git a/meilidb-core/src/update/documents_addition.rs b/meilidb-core/src/update/documents_addition.rs index a9420a528..cb662617c 100644 --- a/meilidb-core/src/update/documents_addition.rs +++ b/meilidb-core/src/update/documents_addition.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use fst::{SetBuilder, set::OpBuilder}; use sdset::{SetOperation, duo::Union}; @@ -82,6 +82,7 @@ pub fn apply_documents_addition( writer: &mut rkv::Writer, main_store: store::Main, documents_fields_store: store::DocumentsFields, + documents_fields_counts_store: store::DocumentsFieldsCounts, postings_lists_store: store::PostingsLists, docs_words_store: store::DocsWords, mut ranked_map: RankedMap, @@ -90,6 +91,7 @@ pub fn apply_documents_addition( { let mut document_ids = HashSet::new(); let mut document_store = RamDocumentStore::new(); + let mut document_fields_counts = HashMap::new(); let mut indexer = RawIndexer::new(); let schema = match main_store.schema(writer)? { @@ -112,6 +114,7 @@ pub fn apply_documents_addition( let serializer = Serializer { schema: &schema, document_store: &mut document_store, + document_fields_counts: &mut document_fields_counts, indexer: &mut indexer, ranked_map: &mut ranked_map, document_id, @@ -126,6 +129,7 @@ pub fn apply_documents_addition( writer, main_store, documents_fields_store, + documents_fields_counts_store, postings_lists_store, docs_words_store, ranked_map.clone(), @@ -137,6 +141,11 @@ pub fn apply_documents_addition( documents_fields_store.put_document_field(writer, id, attr, &value)?; } + // 3. insert new document attributes counts + for ((id, attr), count) in document_fields_counts { + documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?; + } + let indexed = indexer.build(); let mut delta_words_builder = SetBuilder::memory(); diff --git a/meilidb-core/src/update/documents_deletion.rs b/meilidb-core/src/update/documents_deletion.rs index 765eb52f2..b56cebeb6 100644 --- a/meilidb-core/src/update/documents_deletion.rs +++ b/meilidb-core/src/update/documents_deletion.rs @@ -86,6 +86,7 @@ pub fn apply_documents_deletion( writer: &mut rkv::Writer, main_store: store::Main, documents_fields_store: store::DocumentsFields, + documents_fields_counts_store: store::DocumentsFieldsCounts, postings_lists_store: store::PostingsLists, docs_words_store: store::DocsWords, mut ranked_map: RankedMap, @@ -140,6 +141,7 @@ pub fn apply_documents_deletion( } for id in document_ids { + documents_fields_counts_store.del_all_document_fields_counts(writer, id)?; if documents_fields_store.del_all_document_fields(writer, id)? != 0 { deleted_documents.insert(id); } diff --git a/meilidb-core/src/update/mod.rs b/meilidb-core/src/update/mod.rs index 2493a1c94..7d4e349c0 100644 --- a/meilidb-core/src/update/mod.rs +++ b/meilidb-core/src/update/mod.rs @@ -138,6 +138,7 @@ pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult MResult