feat: Use the new Tokenizer

Clément Renault 2019-02-26 14:49:50 +01:00
parent 397522f277
commit 87f9528791
4 changed files with 25 additions and 56 deletions
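
In short, callers of `update_document` no longer thread a tokenizer builder through; the tokenizer is instantiated inside the serializer itself. A minimal before/after sketch of a call site, reusing the names that appear in the test hunks below (`database`, `meilidb_index_name`, `doc0`, `stop_words`; the surrounding setup and the enclosing `Result`-returning function are assumed):

```rust
// Before this commit: a tokenizer builder had to be created and passed along.
// let tokenizer_builder = DefaultBuilder::new();
// let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;

// After this commit: only the document and the stop words are passed.
let mut builder = database.start_update(meilidb_index_name)?;
let docid0 = builder.update_document(&doc0, &stop_words)?;
let view = database.commit_update(builder)?;
```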

View File

@@ -430,7 +430,6 @@ mod tests {
 use std::error::Error;
 use serde_derive::{Serialize, Deserialize};
-use meilidb_tokenizer::DefaultBuilder;
 use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
@@ -478,11 +477,10 @@ mod tests {
 timestamp: 7654321,
 };
-let tokenizer_builder = DefaultBuilder::new();
 let mut builder = database.start_update(meilidb_index_name)?;
-let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
-let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
+let docid0 = builder.update_document(&doc0, &stop_words)?;
+let docid1 = builder.update_document(&doc1, &stop_words)?;
 let view = database.commit_update(builder)?;
@@ -549,16 +547,14 @@ mod tests {
 timestamp: 7654321,
 };
-let tokenizer_builder = DefaultBuilder::new();
 let mut builder = database.start_update(meilidb_index_name)?;
-let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
-let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
+let docid0 = builder.update_document(&doc0, &stop_words)?;
+let docid1 = builder.update_document(&doc1, &stop_words)?;
 database.commit_update(builder)?;
 let mut builder = database.start_update(meilidb_index_name)?;
-let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
-let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
+let docid2 = builder.update_document(&doc2, &stop_words)?;
+let docid3 = builder.update_document(&doc3, &stop_words)?;
 let view = database.commit_update(builder)?;
 let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
@@ -640,7 +636,6 @@ mod bench {
 description: String,
 }
-let tokenizer_builder = DefaultBuilder;
 let mut builder = database.start_update(index_name)?;
 let mut rng = XorShiftRng::seed_from_u64(42);
@@ -650,7 +645,7 @@ mod bench {
 title: random_sentences(rng.gen_range(1, 8), &mut rng),
 description: random_sentences(rng.gen_range(20, 200), &mut rng),
 };
-builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+builder.update_document(&document, &stop_words)?;
 }
 database.commit_update(builder)?;
@@ -688,7 +683,6 @@ mod bench {
 description: String,
 }
-let tokenizer_builder = DefaultBuilder;
 let mut builder = database.start_update(index_name)?;
 let mut rng = XorShiftRng::seed_from_u64(42);
@@ -698,7 +692,7 @@ mod bench {
 title: random_sentences(rng.gen_range(1, 8), &mut rng),
 description: random_sentences(rng.gen_range(20, 200), &mut rng),
 };
-builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+builder.update_document(&document, &stop_words)?;
 }
 database.commit_update(builder)?;
@@ -737,7 +731,6 @@ mod bench {
 description: String,
 }
-let tokenizer_builder = DefaultBuilder;
 let mut builder = database.start_update(index_name)?;
 let mut rng = XorShiftRng::seed_from_u64(42);
@@ -747,7 +740,7 @@ mod bench {
 title: random_sentences(rng.gen_range(1, 8), &mut rng),
 description: random_sentences(rng.gen_range(20, 200), &mut rng),
 };
-builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+builder.update_document(&document, &stop_words)?;
 }
 database.commit_update(builder)?;
@@ -785,7 +778,6 @@ mod bench {
 description: String,
 }
-let tokenizer_builder = DefaultBuilder;
 let mut builder = database.start_update(index_name)?;
 let mut rng = XorShiftRng::seed_from_u64(42);
@@ -795,7 +787,7 @@ mod bench {
 title: random_sentences(rng.gen_range(1, 8), &mut rng),
 description: random_sentences(rng.gen_range(20, 200), &mut rng),
 };
-builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+builder.update_document(&document, &stop_words)?;
 }
 let view = database.commit_update(builder)?;
@@ -833,7 +825,6 @@ mod bench {
 description: String,
 }
-let tokenizer_builder = DefaultBuilder;
 let mut builder = database.start_update(index_name)?;
 let mut rng = XorShiftRng::seed_from_u64(42);
@@ -843,7 +834,7 @@ mod bench {
 title: random_sentences(rng.gen_range(1, 8), &mut rng),
 description: random_sentences(rng.gen_range(20, 200), &mut rng),
 };
-builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+builder.update_document(&document, &stop_words)?;
 }
 let view = database.commit_update(builder)?;
@@ -882,7 +873,6 @@ mod bench {
 description: String,
 }
-let tokenizer_builder = DefaultBuilder;
 let mut builder = database.start_update(index_name)?;
 let mut rng = XorShiftRng::seed_from_u64(42);
@@ -892,7 +882,7 @@ mod bench {
 title: random_sentences(rng.gen_range(1, 8), &mut rng),
 description: random_sentences(rng.gen_range(20, 200), &mut rng),
 };
-builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+builder.update_document(&document, &stop_words)?;
 }
 let view = database.commit_update(builder)?;

View File

@@ -3,23 +3,20 @@ use std::collections::HashSet;
 use serde::Serialize;
 use serde::ser;
 use meilidb_core::{DocumentId, DocIndex};
-use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk};
+use meilidb_tokenizer::{Tokenizer, Token, is_cjk};
 use crate::database::update::DocumentUpdate;
 use crate::database::serde::SerializerError;
 use crate::database::schema::SchemaAttr;
-pub struct IndexerSerializer<'a, 'b, B> {
-pub tokenizer_builder: &'a B,
+pub struct IndexerSerializer<'a, 'b> {
 pub update: &'a mut DocumentUpdate<'b>,
 pub document_id: DocumentId,
 pub attribute: SchemaAttr,
 pub stop_words: &'a HashSet<String>,
 }
-impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::Serializer for IndexerSerializer<'a, 'b> {
 type Ok = ();
 type Error = SerializerError;
 type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
@@ -49,7 +46,7 @@ where B: TokenizerBuilder
 }
 fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
-for token in self.tokenizer_builder.build(v) {
+for token in Tokenizer::new(v) {
 let Token { word, word_index, char_index } = token;
 let document_id = self.document_id;
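
The core of the change sits in `serialize_str`: instead of calling `self.tokenizer_builder.build(v)`, the serializer now builds a `Tokenizer` directly over the text. A standalone sketch of that loop, assuming (as the diff suggests) that `Tokenizer` is an iterator over `Token { word, word_index, char_index }`, that `word` is a string slice, and that the `stop_words` field is used to skip tokens:

```rust
use std::collections::HashSet;
use meilidb_tokenizer::{Token, Tokenizer};

// Illustrative only: mirrors what IndexerSerializer::serialize_str now does
// with a text field, minus the DocIndex bookkeeping.
fn tokenize_field(text: &str, stop_words: &HashSet<String>) {
    for token in Tokenizer::new(text) {
        let Token { word, word_index, char_index } = token;
        // Stop words are presumably skipped, which is what the field is for.
        if stop_words.contains(word) {
            continue;
        }
        println!("indexing {:?} (word {}, char {})", word, word_index, char_index);
    }
}
```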

View File

@@ -2,7 +2,6 @@ use std::collections::HashSet;
 use serde::Serialize;
 use serde::ser;
-use meilidb_tokenizer::TokenizerBuilder;
 use crate::database::serde::indexer_serializer::IndexerSerializer;
 use crate::database::serde::key_to_string::KeyToStringSerializer;
@@ -12,25 +11,22 @@ use crate::database::serde::SerializerError;
 use crate::database::schema::Schema;
 use meilidb_core::DocumentId;
-pub struct Serializer<'a, 'b, B> {
+pub struct Serializer<'a, 'b> {
 pub schema: &'a Schema,
 pub update: &'a mut DocumentUpdate<'b>,
 pub document_id: DocumentId,
-pub tokenizer_builder: &'a B,
 pub stop_words: &'a HashSet<String>,
 }
-impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
 type Ok = ();
 type Error = SerializerError;
 type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
 type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
 type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
 type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
-type SerializeMap = MapSerializer<'a, 'b, B>;
-type SerializeStruct = StructSerializer<'a, 'b, B>;
+type SerializeMap = MapSerializer<'a, 'b>;
+type SerializeStruct = StructSerializer<'a, 'b>;
 type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
 forward_to_unserializable_type! {
@@ -142,7 +138,6 @@ where B: TokenizerBuilder
 schema: self.schema,
 document_id: self.document_id,
 update: self.update,
-tokenizer_builder: self.tokenizer_builder,
 stop_words: self.stop_words,
 current_key_name: None,
 })
@@ -158,7 +153,6 @@ where B: TokenizerBuilder
 schema: self.schema,
 document_id: self.document_id,
 update: self.update,
-tokenizer_builder: self.tokenizer_builder,
 stop_words: self.stop_words,
 })
 }
@@ -175,18 +169,15 @@ where B: TokenizerBuilder
 }
 }
-pub struct MapSerializer<'a, 'b, B> {
+pub struct MapSerializer<'a, 'b> {
 pub schema: &'a Schema,
 pub document_id: DocumentId,
 pub update: &'a mut DocumentUpdate<'b>,
-pub tokenizer_builder: &'a B,
 pub stop_words: &'a HashSet<String>,
 pub current_key_name: Option<String>,
 }
-impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
 type Ok = ();
 type Error = SerializerError;
@@ -223,7 +214,6 @@ where B: TokenizerBuilder
 if props.is_indexed() {
 let serializer = IndexerSerializer {
 update: self.update,
-tokenizer_builder: self.tokenizer_builder,
 document_id: self.document_id,
 attribute: attr,
 stop_words: self.stop_words,
@@ -244,17 +234,14 @@ where B: TokenizerBuilder
 }
 }
-pub struct StructSerializer<'a, 'b, B> {
+pub struct StructSerializer<'a, 'b> {
 pub schema: &'a Schema,
 pub document_id: DocumentId,
 pub update: &'a mut DocumentUpdate<'b>,
-pub tokenizer_builder: &'a B,
 pub stop_words: &'a HashSet<String>,
 }
-impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
 type Ok = ();
 type Error = SerializerError;
@@ -274,7 +261,6 @@ where B: TokenizerBuilder
 if props.is_indexed() {
 let serializer = IndexerSerializer {
 update: self.update,
-tokenizer_builder: self.tokenizer_builder,
 document_id: self.document_id,
 attribute: attr,
 stop_words: self.stop_words,
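
Collapsing the hunks above, the struct-level change in this file is the removal of the `B: TokenizerBuilder` type parameter and the `tokenizer_builder` field from `Serializer` (and likewise from `MapSerializer` and `StructSerializer`). A before/after sketch reconstructed from the diff, assuming the types imported by the file (`Schema`, `DocumentUpdate`, `DocumentId`, `HashSet`) are in scope:

```rust
// Before: generic over a tokenizer builder that had to be stored and forwarded.
// pub struct Serializer<'a, 'b, B> {
//     pub schema: &'a Schema,
//     pub update: &'a mut DocumentUpdate<'b>,
//     pub document_id: DocumentId,
//     pub tokenizer_builder: &'a B,
//     pub stop_words: &'a HashSet<String>,
// }

// After: no type parameter and no field; the tokenizer is created on demand
// inside IndexerSerializer::serialize_str.
pub struct Serializer<'a, 'b> {
    pub schema: &'a Schema,
    pub update: &'a mut DocumentUpdate<'b>,
    pub document_id: DocumentId,
    pub stop_words: &'a HashSet<String>,
}
```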

View File

@@ -8,7 +8,6 @@ use serde::Serialize;
 use meilidb_core::write_to_bytes::WriteToBytes;
 use meilidb_core::data::DocIds;
 use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
-use meilidb_tokenizer::TokenizerBuilder;
 use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
 use crate::database::serde::serializer::Serializer;
@@ -36,21 +35,18 @@ impl Update {
 Update { schema, raw_builder: RawUpdateBuilder::new() }
 }
-pub fn update_document<T, B>(
+pub fn update_document<T>(
 &mut self,
 document: T,
-tokenizer_builder: &B,
 stop_words: &HashSet<String>,
 ) -> Result<DocumentId, SerializerError>
 where T: Serialize,
-B: TokenizerBuilder,
 {
 let document_id = self.schema.document_id(&document)?;
 let serializer = Serializer {
 schema: &self.schema,
 document_id: document_id,
-tokenizer_builder: tokenizer_builder,
 update: &mut self.raw_builder.document_update(document_id)?,
 stop_words: stop_words,
 };
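
Putting the last hunk back together, the public entry point now takes only the document and the stop words. A sketch of the "after" version of the visible part of the method, assuming the surrounding types from this file (`Update`, `Serializer`, `SerializerError`, `DocumentId`); the tail of the method lies outside the hunk and is stubbed here rather than guessed at:

```rust
impl Update {
    // Reassembled from the hunk above: no tokenizer-builder parameter and
    // no `B: TokenizerBuilder` bound.
    pub fn update_document<T>(
        &mut self,
        document: T,
        stop_words: &HashSet<String>,
    ) -> Result<DocumentId, SerializerError>
    where T: Serialize,
    {
        let document_id = self.schema.document_id(&document)?;
        let serializer = Serializer {
            schema: &self.schema,
            document_id: document_id,
            update: &mut self.raw_builder.document_update(document_id)?,
            stop_words: stop_words,
        };
        // The remainder of the method (driving `serializer` and returning the
        // document id) is unchanged by this commit and not shown in the hunk.
        todo!()
    }
}
```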