mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 14:54:27 +01:00
Compute and store the number of words in documents fields
This commit is contained in:
parent
a7e40a78c1
commit
b377003192
@ -13,7 +13,7 @@ pub struct Indexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::Serializer for Indexer<'a> {
|
impl<'a> ser::Serializer for Indexer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
type SerializeSeq = SeqIndexer<'a>;
|
type SerializeSeq = SeqIndexer<'a>;
|
||||||
type SerializeTuple = TupleIndexer<'a>;
|
type SerializeTuple = TupleIndexer<'a>;
|
||||||
@ -83,8 +83,8 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
|
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
|
||||||
self.indexer.index_text(self.document_id, self.attribute, text);
|
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text);
|
||||||
Ok(())
|
Ok(Some(number_of_words))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||||
@ -99,8 +99,8 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
|||||||
where T: ser::Serialize,
|
where T: ser::Serialize,
|
||||||
{
|
{
|
||||||
let text = value.serialize(ConvertToString)?;
|
let text = value.serialize(ConvertToString)?;
|
||||||
self.indexer.index_text(self.document_id, self.attribute, &text);
|
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text);
|
||||||
Ok(())
|
Ok(Some(number_of_words))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||||
@ -225,7 +225,7 @@ pub struct SeqIndexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||||
@ -239,7 +239,7 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
|||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
let texts = self.texts.iter().map(String::as_str);
|
let texts = self.texts.iter().map(String::as_str);
|
||||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||||
Ok(())
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -251,7 +251,7 @@ pub struct MapIndexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||||
@ -273,7 +273,7 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
|||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
let texts = self.texts.iter().map(String::as_str);
|
let texts = self.texts.iter().map(String::as_str);
|
||||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||||
Ok(())
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -285,7 +285,7 @@ pub struct StructSerializer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
fn serialize_field<T: ?Sized>(
|
fn serialize_field<T: ?Sized>(
|
||||||
@ -305,7 +305,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
|||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
let texts = self.texts.iter().map(String::as_str);
|
let texts = self.texts.iter().map(String::as_str);
|
||||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||||
Ok(())
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -317,7 +317,7 @@ pub struct TupleIndexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
||||||
type Ok = ();
|
type Ok = Option<usize>;
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||||
@ -331,6 +331,6 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
|||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
let texts = self.texts.iter().map(String::as_str);
|
let texts = self.texts.iter().map(String::as_str);
|
||||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||||
Ok(())
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use meilidb_schema::Schema;
|
use std::collections::HashMap;
|
||||||
|
use meilidb_schema::{Schema, SchemaAttr};
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
|
|
||||||
use crate::{DocumentId, RankedMap};
|
use crate::{DocumentId, RankedMap};
|
||||||
@ -10,6 +11,7 @@ use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};
|
|||||||
pub struct Serializer<'a> {
|
pub struct Serializer<'a> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
||||||
pub document_store: &'a mut RamDocumentStore,
|
pub document_store: &'a mut RamDocumentStore,
|
||||||
|
pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||||
pub indexer: &'a mut RawIndexer,
|
pub indexer: &'a mut RawIndexer,
|
||||||
pub ranked_map: &'a mut RankedMap,
|
pub ranked_map: &'a mut RankedMap,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
@ -135,6 +137,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||||||
schema: self.schema,
|
schema: self.schema,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
document_store: self.document_store,
|
document_store: self.document_store,
|
||||||
|
document_fields_counts: self.document_fields_counts,
|
||||||
indexer: self.indexer,
|
indexer: self.indexer,
|
||||||
ranked_map: self.ranked_map,
|
ranked_map: self.ranked_map,
|
||||||
current_key_name: None,
|
current_key_name: None,
|
||||||
@ -151,6 +154,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||||||
schema: self.schema,
|
schema: self.schema,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
document_store: self.document_store,
|
document_store: self.document_store,
|
||||||
|
document_fields_counts: self.document_fields_counts,
|
||||||
indexer: self.indexer,
|
indexer: self.indexer,
|
||||||
ranked_map: self.ranked_map,
|
ranked_map: self.ranked_map,
|
||||||
})
|
})
|
||||||
@ -172,6 +176,7 @@ pub struct MapSerializer<'a> {
|
|||||||
schema: &'a Schema,
|
schema: &'a Schema,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
document_store: &'a mut RamDocumentStore,
|
document_store: &'a mut RamDocumentStore,
|
||||||
|
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||||
indexer: &'a mut RawIndexer,
|
indexer: &'a mut RawIndexer,
|
||||||
ranked_map: &'a mut RankedMap,
|
ranked_map: &'a mut RankedMap,
|
||||||
current_key_name: Option<String>,
|
current_key_name: Option<String>,
|
||||||
@ -209,6 +214,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
|
|||||||
self.schema,
|
self.schema,
|
||||||
self.document_id,
|
self.document_id,
|
||||||
self.document_store,
|
self.document_store,
|
||||||
|
self.document_fields_counts,
|
||||||
self.indexer,
|
self.indexer,
|
||||||
self.ranked_map,
|
self.ranked_map,
|
||||||
&key,
|
&key,
|
||||||
@ -225,6 +231,7 @@ pub struct StructSerializer<'a> {
|
|||||||
schema: &'a Schema,
|
schema: &'a Schema,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
document_store: &'a mut RamDocumentStore,
|
document_store: &'a mut RamDocumentStore,
|
||||||
|
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||||
indexer: &'a mut RawIndexer,
|
indexer: &'a mut RawIndexer,
|
||||||
ranked_map: &'a mut RankedMap,
|
ranked_map: &'a mut RankedMap,
|
||||||
}
|
}
|
||||||
@ -244,6 +251,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
|||||||
self.schema,
|
self.schema,
|
||||||
self.document_id,
|
self.document_id,
|
||||||
self.document_store,
|
self.document_store,
|
||||||
|
self.document_fields_counts,
|
||||||
self.indexer,
|
self.indexer,
|
||||||
self.ranked_map,
|
self.ranked_map,
|
||||||
key,
|
key,
|
||||||
@ -260,6 +268,7 @@ fn serialize_value<T: ?Sized>(
|
|||||||
schema: &Schema,
|
schema: &Schema,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
document_store: &mut RamDocumentStore,
|
document_store: &mut RamDocumentStore,
|
||||||
|
documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||||
indexer: &mut RawIndexer,
|
indexer: &mut RawIndexer,
|
||||||
ranked_map: &mut RankedMap,
|
ranked_map: &mut RankedMap,
|
||||||
key: &str,
|
key: &str,
|
||||||
@ -275,7 +284,9 @@ where T: ser::Serialize,
|
|||||||
|
|
||||||
if props.is_indexed() {
|
if props.is_indexed() {
|
||||||
let indexer = Indexer { attribute, indexer, document_id };
|
let indexer = Indexer { attribute, indexer, document_id };
|
||||||
value.serialize(indexer)?;
|
if let Some(number_of_words) = value.serialize(indexer)? {
|
||||||
|
documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if props.is_ranked() {
|
if props.is_ranked() {
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
use fst::{SetBuilder, set::OpBuilder};
|
use fst::{SetBuilder, set::OpBuilder};
|
||||||
use sdset::{SetOperation, duo::Union};
|
use sdset::{SetOperation, duo::Union};
|
||||||
@ -82,6 +82,7 @@ pub fn apply_documents_addition(
|
|||||||
writer: &mut rkv::Writer,
|
writer: &mut rkv::Writer,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
documents_fields_store: store::DocumentsFields,
|
documents_fields_store: store::DocumentsFields,
|
||||||
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
docs_words_store: store::DocsWords,
|
docs_words_store: store::DocsWords,
|
||||||
mut ranked_map: RankedMap,
|
mut ranked_map: RankedMap,
|
||||||
@ -90,6 +91,7 @@ pub fn apply_documents_addition(
|
|||||||
{
|
{
|
||||||
let mut document_ids = HashSet::new();
|
let mut document_ids = HashSet::new();
|
||||||
let mut document_store = RamDocumentStore::new();
|
let mut document_store = RamDocumentStore::new();
|
||||||
|
let mut document_fields_counts = HashMap::new();
|
||||||
let mut indexer = RawIndexer::new();
|
let mut indexer = RawIndexer::new();
|
||||||
|
|
||||||
let schema = match main_store.schema(writer)? {
|
let schema = match main_store.schema(writer)? {
|
||||||
@ -112,6 +114,7 @@ pub fn apply_documents_addition(
|
|||||||
let serializer = Serializer {
|
let serializer = Serializer {
|
||||||
schema: &schema,
|
schema: &schema,
|
||||||
document_store: &mut document_store,
|
document_store: &mut document_store,
|
||||||
|
document_fields_counts: &mut document_fields_counts,
|
||||||
indexer: &mut indexer,
|
indexer: &mut indexer,
|
||||||
ranked_map: &mut ranked_map,
|
ranked_map: &mut ranked_map,
|
||||||
document_id,
|
document_id,
|
||||||
@ -126,6 +129,7 @@ pub fn apply_documents_addition(
|
|||||||
writer,
|
writer,
|
||||||
main_store,
|
main_store,
|
||||||
documents_fields_store,
|
documents_fields_store,
|
||||||
|
documents_fields_counts_store,
|
||||||
postings_lists_store,
|
postings_lists_store,
|
||||||
docs_words_store,
|
docs_words_store,
|
||||||
ranked_map.clone(),
|
ranked_map.clone(),
|
||||||
@ -137,6 +141,11 @@ pub fn apply_documents_addition(
|
|||||||
documents_fields_store.put_document_field(writer, id, attr, &value)?;
|
documents_fields_store.put_document_field(writer, id, attr, &value)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 3. insert new document attributes counts
|
||||||
|
for ((id, attr), count) in document_fields_counts {
|
||||||
|
documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?;
|
||||||
|
}
|
||||||
|
|
||||||
let indexed = indexer.build();
|
let indexed = indexer.build();
|
||||||
let mut delta_words_builder = SetBuilder::memory();
|
let mut delta_words_builder = SetBuilder::memory();
|
||||||
|
|
||||||
|
@ -86,6 +86,7 @@ pub fn apply_documents_deletion(
|
|||||||
writer: &mut rkv::Writer,
|
writer: &mut rkv::Writer,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
documents_fields_store: store::DocumentsFields,
|
documents_fields_store: store::DocumentsFields,
|
||||||
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
docs_words_store: store::DocsWords,
|
docs_words_store: store::DocsWords,
|
||||||
mut ranked_map: RankedMap,
|
mut ranked_map: RankedMap,
|
||||||
@ -140,6 +141,7 @@ pub fn apply_documents_deletion(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for id in document_ids {
|
for id in document_ids {
|
||||||
|
documents_fields_counts_store.del_all_document_fields_counts(writer, id)?;
|
||||||
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
||||||
deleted_documents.insert(id);
|
deleted_documents.insert(id);
|
||||||
}
|
}
|
||||||
|
@ -138,6 +138,7 @@ pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult<Opt
|
|||||||
writer,
|
writer,
|
||||||
index.main,
|
index.main,
|
||||||
index.documents_fields,
|
index.documents_fields,
|
||||||
|
index.documents_fields_counts,
|
||||||
index.postings_lists,
|
index.postings_lists,
|
||||||
index.docs_words,
|
index.docs_words,
|
||||||
ranked_map,
|
ranked_map,
|
||||||
@ -160,6 +161,7 @@ pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult<Opt
|
|||||||
writer,
|
writer,
|
||||||
index.main,
|
index.main,
|
||||||
index.documents_fields,
|
index.documents_fields,
|
||||||
|
index.documents_fields_counts,
|
||||||
index.postings_lists,
|
index.postings_lists,
|
||||||
index.docs_words,
|
index.docs_words,
|
||||||
ranked_map,
|
ranked_map,
|
||||||
|
Loading…
Reference in New Issue
Block a user