diff --git a/meilidb/src/database/mod.rs b/meilidb/src/database/mod.rs
index 08ca6cd7f..16ba148b7 100644
--- a/meilidb/src/database/mod.rs
+++ b/meilidb/src/database/mod.rs
@@ -430,7 +430,6 @@ mod tests {
     use std::error::Error;
 
     use serde_derive::{Serialize, Deserialize};
-    use meilidb_tokenizer::DefaultBuilder;
 
     use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
 
@@ -478,11 +477,10 @@ mod tests {
             timestamp: 7654321,
         };
 
-        let tokenizer_builder = DefaultBuilder::new();
         let mut builder = database.start_update(meilidb_index_name)?;
 
-        let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
-        let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
+        let docid0 = builder.update_document(&doc0, &stop_words)?;
+        let docid1 = builder.update_document(&doc1, &stop_words)?;
 
         let view = database.commit_update(builder)?;
 
@@ -549,16 +547,14 @@ mod tests {
             timestamp: 7654321,
         };
 
-        let tokenizer_builder = DefaultBuilder::new();
-
         let mut builder = database.start_update(meilidb_index_name)?;
-        let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
-        let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
+        let docid0 = builder.update_document(&doc0, &stop_words)?;
+        let docid1 = builder.update_document(&doc1, &stop_words)?;
         database.commit_update(builder)?;
 
         let mut builder = database.start_update(meilidb_index_name)?;
-        let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
-        let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
+        let docid2 = builder.update_document(&doc2, &stop_words)?;
+        let docid3 = builder.update_document(&doc3, &stop_words)?;
         let view = database.commit_update(builder)?;
 
         let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
@@ -640,7 +636,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
 
        let mut rng = XorShiftRng::seed_from_u64(42);
@@ -650,7 +645,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         database.commit_update(builder)?;
@@ -688,7 +683,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
 
         let mut rng = XorShiftRng::seed_from_u64(42);
@@ -698,7 +692,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         database.commit_update(builder)?;
@@ -737,7 +731,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
 
         let mut rng = XorShiftRng::seed_from_u64(42);
@@ -747,7 +740,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         database.commit_update(builder)?;
@@ -785,7 +778,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
 
         let mut rng = XorShiftRng::seed_from_u64(42);
@@ -795,7 +787,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         let view = database.commit_update(builder)?;
@@ -833,7 +825,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
 
         let mut rng = XorShiftRng::seed_from_u64(42);
@@ -843,7 +834,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         let view = database.commit_update(builder)?;
@@ -882,7 +873,6 @@ mod bench {
             description: String,
        }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
 
         let mut rng = XorShiftRng::seed_from_u64(42);
@@ -892,7 +882,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         let view = database.commit_update(builder)?;
diff --git a/meilidb/src/database/serde/indexer_serializer.rs b/meilidb/src/database/serde/indexer_serializer.rs
index f718111dc..ae5a0e4cb 100644
--- a/meilidb/src/database/serde/indexer_serializer.rs
+++ b/meilidb/src/database/serde/indexer_serializer.rs
@@ -3,23 +3,20 @@ use std::collections::HashSet;
 use serde::Serialize;
 use serde::ser;
 use meilidb_core::{DocumentId, DocIndex};
-use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk};
+use meilidb_tokenizer::{Tokenizer, Token, is_cjk};
 
 use crate::database::update::DocumentUpdate;
 use crate::database::serde::SerializerError;
 use crate::database::schema::SchemaAttr;
 
-pub struct IndexerSerializer<'a, 'b, B> {
-    pub tokenizer_builder: &'a B,
+pub struct IndexerSerializer<'a, 'b> {
     pub update: &'a mut DocumentUpdate<'b>,
     pub document_id: DocumentId,
     pub attribute: SchemaAttr,
     pub stop_words: &'a HashSet<String>,
 }
 
-impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::Serializer for IndexerSerializer<'a, 'b> {
     type Ok = ();
     type Error = SerializerError;
     type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
@@ -49,7 +46,7 @@ where B: TokenizerBuilder
     }
 
     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
-        for token in self.tokenizer_builder.build(v) {
+        for token in Tokenizer::new(v) {
             let Token { word, word_index, char_index } = token;
             let document_id = self.document_id;
 
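Review note: the hunk above is the behavioral core of the patch. `serialize_str` no longer calls `build(v)` on an injected `TokenizerBuilder`; it constructs the tokenizer inline with `Tokenizer::new(v)`. A minimal sketch of the new loop shape, assuming only what the hunk itself shows about meilidb_tokenizer (an iterable `Tokenizer` yielding `Token { word, word_index, char_index }`; the sample string is illustrative):

    use meilidb_tokenizer::{Tokenizer, Token};

    fn main() {
        // Tokenizer::new builds the token stream directly from the &str,
        // so no builder value needs to be threaded through the serializers.
        for token in Tokenizer::new("the quick brown fox") {
            let Token { word, word_index, char_index } = token;
            println!("{} (word {}, char {})", word, word_index, char_index);
        }
    }
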
diff --git a/meilidb/src/database/serde/serializer.rs b/meilidb/src/database/serde/serializer.rs
index 8b3a05b46..e1be310ed 100644
--- a/meilidb/src/database/serde/serializer.rs
+++ b/meilidb/src/database/serde/serializer.rs
@@ -2,7 +2,6 @@ use std::collections::HashSet;
 
 use serde::Serialize;
 use serde::ser;
-use meilidb_tokenizer::TokenizerBuilder;
 
 use crate::database::serde::indexer_serializer::IndexerSerializer;
 use crate::database::serde::key_to_string::KeyToStringSerializer;
@@ -12,25 +11,22 @@ use crate::database::serde::SerializerError;
 use crate::database::schema::Schema;
 use meilidb_core::DocumentId;
 
-pub struct Serializer<'a, 'b, B> {
+pub struct Serializer<'a, 'b> {
     pub schema: &'a Schema,
     pub update: &'a mut DocumentUpdate<'b>,
     pub document_id: DocumentId,
-    pub tokenizer_builder: &'a B,
     pub stop_words: &'a HashSet<String>,
 }
 
-impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
     type Ok = ();
     type Error = SerializerError;
     type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
-    type SerializeMap = MapSerializer<'a, 'b, B>;
-    type SerializeStruct = StructSerializer<'a, 'b, B>;
+    type SerializeMap = MapSerializer<'a, 'b>;
+    type SerializeStruct = StructSerializer<'a, 'b>;
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
 
     forward_to_unserializable_type! {
@@ -142,7 +138,6 @@ where B: TokenizerBuilder
             schema: self.schema,
             document_id: self.document_id,
             update: self.update,
-            tokenizer_builder: self.tokenizer_builder,
             stop_words: self.stop_words,
             current_key_name: None,
         })
@@ -158,7 +153,6 @@ where B: TokenizerBuilder
             schema: self.schema,
             document_id: self.document_id,
             update: self.update,
-            tokenizer_builder: self.tokenizer_builder,
             stop_words: self.stop_words,
         })
     }
@@ -175,18 +169,15 @@ where B: TokenizerBuilder
     }
 }
 
-pub struct MapSerializer<'a, 'b, B> {
+pub struct MapSerializer<'a, 'b> {
     pub schema: &'a Schema,
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate<'b>,
-    pub tokenizer_builder: &'a B,
     pub stop_words: &'a HashSet<String>,
     pub current_key_name: Option<String>,
 }
 
-impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
     type Ok = ();
     type Error = SerializerError;
 
@@ -223,7 +214,6 @@ where B: TokenizerBuilder
         if props.is_indexed() {
             let serializer = IndexerSerializer {
                 update: self.update,
-                tokenizer_builder: self.tokenizer_builder,
                 document_id: self.document_id,
                 attribute: attr,
                 stop_words: self.stop_words,
@@ -244,17 +234,14 @@ where B: TokenizerBuilder
     }
 }
 
-pub struct StructSerializer<'a, 'b, B> {
+pub struct StructSerializer<'a, 'b> {
     pub schema: &'a Schema,
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate<'b>,
-    pub tokenizer_builder: &'a B,
     pub stop_words: &'a HashSet<String>,
 }
 
-impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
     type Ok = ();
     type Error = SerializerError;
 
@@ -274,7 +261,6 @@ where B: TokenizerBuilder
         if props.is_indexed() {
             let serializer = IndexerSerializer {
                 update: self.update,
-                tokenizer_builder: self.tokenizer_builder,
                 document_id: self.document_id,
                 attribute: attr,
                 stop_words: self.stop_words,
diff --git a/meilidb/src/database/update/mod.rs b/meilidb/src/database/update/mod.rs
index 720b7aaf3..f34cf6a8e 100644
--- a/meilidb/src/database/update/mod.rs
+++ b/meilidb/src/database/update/mod.rs
@@ -8,7 +8,6 @@ use serde::Serialize;
 use meilidb_core::write_to_bytes::WriteToBytes;
 use meilidb_core::data::DocIds;
 use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
-use meilidb_tokenizer::TokenizerBuilder;
 
 use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
 use crate::database::serde::serializer::Serializer;
@@ -36,21 +35,18 @@ impl Update {
         Update { schema, raw_builder: RawUpdateBuilder::new() }
     }
 
-    pub fn update_document<T, B>(
+    pub fn update_document<T>(
         &mut self,
         document: T,
-        tokenizer_builder: &B,
         stop_words: &HashSet<String>,
     ) -> Result<DocumentId, SerializerError>
     where T: Serialize,
-          B: TokenizerBuilder,
     {
         let document_id = self.schema.document_id(&document)?;
 
         let serializer = Serializer {
             schema: &self.schema,
             document_id: document_id,
-            tokenizer_builder: tokenizer_builder,
             update: &mut self.raw_builder.document_update(document_id)?,
             stop_words: stop_words,
         };
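Review note: the net API change. `Update::update_document` drops both the `B: TokenizerBuilder` bound and the builder argument, so callers now pass only the document and the stop-word set, as the updated tests and benches above show. A sketch of an updated caller, assuming an open `database`, a registered index name, and a `Serialize`-able `document` (the names here are illustrative, not from this patch):

    use std::collections::HashSet;

    // Stop words still travel with every call; only the tokenizer is gone.
    let stop_words: HashSet<String> = HashSet::new();

    let mut builder = database.start_update(index_name)?;
    let docid = builder.update_document(&document, &stop_words)?;
    let view = database.commit_update(builder)?;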