mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-27 07:14:26 +01:00
feat: Use the new Tokenizer
This commit is contained in:
parent
397522f277
commit
87f9528791
@ -430,7 +430,6 @@ mod tests {
|
|||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
|
||||||
use serde_derive::{Serialize, Deserialize};
|
use serde_derive::{Serialize, Deserialize};
|
||||||
use meilidb_tokenizer::DefaultBuilder;
|
|
||||||
|
|
||||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||||
|
|
||||||
@ -478,11 +477,10 @@ mod tests {
|
|||||||
timestamp: 7654321,
|
timestamp: 7654321,
|
||||||
};
|
};
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder::new();
|
|
||||||
let mut builder = database.start_update(meilidb_index_name)?;
|
let mut builder = database.start_update(meilidb_index_name)?;
|
||||||
|
|
||||||
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
let docid0 = builder.update_document(&doc0, &stop_words)?;
|
||||||
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
let docid1 = builder.update_document(&doc1, &stop_words)?;
|
||||||
|
|
||||||
let view = database.commit_update(builder)?;
|
let view = database.commit_update(builder)?;
|
||||||
|
|
||||||
@ -549,16 +547,14 @@ mod tests {
|
|||||||
timestamp: 7654321,
|
timestamp: 7654321,
|
||||||
};
|
};
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder::new();
|
|
||||||
|
|
||||||
let mut builder = database.start_update(meilidb_index_name)?;
|
let mut builder = database.start_update(meilidb_index_name)?;
|
||||||
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
let docid0 = builder.update_document(&doc0, &stop_words)?;
|
||||||
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
let docid1 = builder.update_document(&doc1, &stop_words)?;
|
||||||
database.commit_update(builder)?;
|
database.commit_update(builder)?;
|
||||||
|
|
||||||
let mut builder = database.start_update(meilidb_index_name)?;
|
let mut builder = database.start_update(meilidb_index_name)?;
|
||||||
let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
|
let docid2 = builder.update_document(&doc2, &stop_words)?;
|
||||||
let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
|
let docid3 = builder.update_document(&doc3, &stop_words)?;
|
||||||
let view = database.commit_update(builder)?;
|
let view = database.commit_update(builder)?;
|
||||||
|
|
||||||
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
||||||
@ -640,7 +636,6 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
|
||||||
let mut builder = database.start_update(index_name)?;
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
@ -650,7 +645,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
database.commit_update(builder)?;
|
database.commit_update(builder)?;
|
||||||
@ -688,7 +683,6 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
|
||||||
let mut builder = database.start_update(index_name)?;
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
@ -698,7 +692,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
database.commit_update(builder)?;
|
database.commit_update(builder)?;
|
||||||
@ -737,7 +731,6 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
|
||||||
let mut builder = database.start_update(index_name)?;
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
@ -747,7 +740,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
database.commit_update(builder)?;
|
database.commit_update(builder)?;
|
||||||
@ -785,7 +778,6 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
|
||||||
let mut builder = database.start_update(index_name)?;
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
@ -795,7 +787,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let view = database.commit_update(builder)?;
|
let view = database.commit_update(builder)?;
|
||||||
@ -833,7 +825,6 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
|
||||||
let mut builder = database.start_update(index_name)?;
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
@ -843,7 +834,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let view = database.commit_update(builder)?;
|
let view = database.commit_update(builder)?;
|
||||||
@ -882,7 +873,6 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
|
||||||
let mut builder = database.start_update(index_name)?;
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
@ -892,7 +882,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let view = database.commit_update(builder)?;
|
let view = database.commit_update(builder)?;
|
||||||
|
@ -3,23 +3,20 @@ use std::collections::HashSet;
|
|||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
use meilidb_core::{DocumentId, DocIndex};
|
use meilidb_core::{DocumentId, DocIndex};
|
||||||
use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk};
|
use meilidb_tokenizer::{Tokenizer, Token, is_cjk};
|
||||||
|
|
||||||
use crate::database::update::DocumentUpdate;
|
use crate::database::update::DocumentUpdate;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::database::schema::SchemaAttr;
|
use crate::database::schema::SchemaAttr;
|
||||||
|
|
||||||
pub struct IndexerSerializer<'a, 'b, B> {
|
pub struct IndexerSerializer<'a, 'b> {
|
||||||
pub tokenizer_builder: &'a B,
|
|
||||||
pub update: &'a mut DocumentUpdate<'b>,
|
pub update: &'a mut DocumentUpdate<'b>,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub attribute: SchemaAttr,
|
pub attribute: SchemaAttr,
|
||||||
pub stop_words: &'a HashSet<String>,
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
|
impl<'a, 'b> ser::Serializer for IndexerSerializer<'a, 'b> {
|
||||||
where B: TokenizerBuilder
|
|
||||||
{
|
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
@ -49,7 +46,7 @@ where B: TokenizerBuilder
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||||
for token in self.tokenizer_builder.build(v) {
|
for token in Tokenizer::new(v) {
|
||||||
let Token { word, word_index, char_index } = token;
|
let Token { word, word_index, char_index } = token;
|
||||||
let document_id = self.document_id;
|
let document_id = self.document_id;
|
||||||
|
|
||||||
|
@ -2,7 +2,6 @@ use std::collections::HashSet;
|
|||||||
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
use meilidb_tokenizer::TokenizerBuilder;
|
|
||||||
|
|
||||||
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
||||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||||
@ -12,25 +11,22 @@ use crate::database::serde::SerializerError;
|
|||||||
use crate::database::schema::Schema;
|
use crate::database::schema::Schema;
|
||||||
use meilidb_core::DocumentId;
|
use meilidb_core::DocumentId;
|
||||||
|
|
||||||
pub struct Serializer<'a, 'b, B> {
|
pub struct Serializer<'a, 'b> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
||||||
pub update: &'a mut DocumentUpdate<'b>,
|
pub update: &'a mut DocumentUpdate<'b>,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub tokenizer_builder: &'a B,
|
|
||||||
pub stop_words: &'a HashSet<String>,
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
|
impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
|
||||||
where B: TokenizerBuilder
|
|
||||||
{
|
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeMap = MapSerializer<'a, 'b, B>;
|
type SerializeMap = MapSerializer<'a, 'b>;
|
||||||
type SerializeStruct = StructSerializer<'a, 'b, B>;
|
type SerializeStruct = StructSerializer<'a, 'b>;
|
||||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
|
|
||||||
forward_to_unserializable_type! {
|
forward_to_unserializable_type! {
|
||||||
@ -142,7 +138,6 @@ where B: TokenizerBuilder
|
|||||||
schema: self.schema,
|
schema: self.schema,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
update: self.update,
|
update: self.update,
|
||||||
tokenizer_builder: self.tokenizer_builder,
|
|
||||||
stop_words: self.stop_words,
|
stop_words: self.stop_words,
|
||||||
current_key_name: None,
|
current_key_name: None,
|
||||||
})
|
})
|
||||||
@ -158,7 +153,6 @@ where B: TokenizerBuilder
|
|||||||
schema: self.schema,
|
schema: self.schema,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
update: self.update,
|
update: self.update,
|
||||||
tokenizer_builder: self.tokenizer_builder,
|
|
||||||
stop_words: self.stop_words,
|
stop_words: self.stop_words,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -175,18 +169,15 @@ where B: TokenizerBuilder
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct MapSerializer<'a, 'b, B> {
|
pub struct MapSerializer<'a, 'b> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub update: &'a mut DocumentUpdate<'b>,
|
pub update: &'a mut DocumentUpdate<'b>,
|
||||||
pub tokenizer_builder: &'a B,
|
|
||||||
pub stop_words: &'a HashSet<String>,
|
pub stop_words: &'a HashSet<String>,
|
||||||
pub current_key_name: Option<String>,
|
pub current_key_name: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
|
impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
|
||||||
where B: TokenizerBuilder
|
|
||||||
{
|
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
@ -223,7 +214,6 @@ where B: TokenizerBuilder
|
|||||||
if props.is_indexed() {
|
if props.is_indexed() {
|
||||||
let serializer = IndexerSerializer {
|
let serializer = IndexerSerializer {
|
||||||
update: self.update,
|
update: self.update,
|
||||||
tokenizer_builder: self.tokenizer_builder,
|
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
attribute: attr,
|
attribute: attr,
|
||||||
stop_words: self.stop_words,
|
stop_words: self.stop_words,
|
||||||
@ -244,17 +234,14 @@ where B: TokenizerBuilder
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct StructSerializer<'a, 'b, B> {
|
pub struct StructSerializer<'a, 'b> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub update: &'a mut DocumentUpdate<'b>,
|
pub update: &'a mut DocumentUpdate<'b>,
|
||||||
pub tokenizer_builder: &'a B,
|
|
||||||
pub stop_words: &'a HashSet<String>,
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
|
impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
|
||||||
where B: TokenizerBuilder
|
|
||||||
{
|
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
@ -274,7 +261,6 @@ where B: TokenizerBuilder
|
|||||||
if props.is_indexed() {
|
if props.is_indexed() {
|
||||||
let serializer = IndexerSerializer {
|
let serializer = IndexerSerializer {
|
||||||
update: self.update,
|
update: self.update,
|
||||||
tokenizer_builder: self.tokenizer_builder,
|
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
attribute: attr,
|
attribute: attr,
|
||||||
stop_words: self.stop_words,
|
stop_words: self.stop_words,
|
||||||
|
@ -8,7 +8,6 @@ use serde::Serialize;
|
|||||||
use meilidb_core::write_to_bytes::WriteToBytes;
|
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||||
use meilidb_core::data::DocIds;
|
use meilidb_core::data::DocIds;
|
||||||
use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
|
use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
|
||||||
use meilidb_tokenizer::TokenizerBuilder;
|
|
||||||
|
|
||||||
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||||
use crate::database::serde::serializer::Serializer;
|
use crate::database::serde::serializer::Serializer;
|
||||||
@ -36,21 +35,18 @@ impl Update {
|
|||||||
Update { schema, raw_builder: RawUpdateBuilder::new() }
|
Update { schema, raw_builder: RawUpdateBuilder::new() }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn update_document<T, B>(
|
pub fn update_document<T>(
|
||||||
&mut self,
|
&mut self,
|
||||||
document: T,
|
document: T,
|
||||||
tokenizer_builder: &B,
|
|
||||||
stop_words: &HashSet<String>,
|
stop_words: &HashSet<String>,
|
||||||
) -> Result<DocumentId, SerializerError>
|
) -> Result<DocumentId, SerializerError>
|
||||||
where T: Serialize,
|
where T: Serialize,
|
||||||
B: TokenizerBuilder,
|
|
||||||
{
|
{
|
||||||
let document_id = self.schema.document_id(&document)?;
|
let document_id = self.schema.document_id(&document)?;
|
||||||
|
|
||||||
let serializer = Serializer {
|
let serializer = Serializer {
|
||||||
schema: &self.schema,
|
schema: &self.schema,
|
||||||
document_id: document_id,
|
document_id: document_id,
|
||||||
tokenizer_builder: tokenizer_builder,
|
|
||||||
update: &mut self.raw_builder.document_update(document_id)?,
|
update: &mut self.raw_builder.document_update(document_id)?,
|
||||||
stop_words: stop_words,
|
stop_words: stop_words,
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user