mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 19:57:30 +01:00
feat: Use the new Tokenizer
This commit is contained in:
parent
397522f277
commit
87f9528791
@ -430,7 +430,6 @@ mod tests {
|
||||
use std::error::Error;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use meilidb_tokenizer::DefaultBuilder;
|
||||
|
||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||
|
||||
@ -478,11 +477,10 @@ mod tests {
|
||||
timestamp: 7654321,
|
||||
};
|
||||
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let mut builder = database.start_update(meilidb_index_name)?;
|
||||
|
||||
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||
let docid0 = builder.update_document(&doc0, &stop_words)?;
|
||||
let docid1 = builder.update_document(&doc1, &stop_words)?;
|
||||
|
||||
let view = database.commit_update(builder)?;
|
||||
|
||||
@ -549,16 +547,14 @@ mod tests {
|
||||
timestamp: 7654321,
|
||||
};
|
||||
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
|
||||
let mut builder = database.start_update(meilidb_index_name)?;
|
||||
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||
let docid0 = builder.update_document(&doc0, &stop_words)?;
|
||||
let docid1 = builder.update_document(&doc1, &stop_words)?;
|
||||
database.commit_update(builder)?;
|
||||
|
||||
let mut builder = database.start_update(meilidb_index_name)?;
|
||||
let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
|
||||
let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
|
||||
let docid2 = builder.update_document(&doc2, &stop_words)?;
|
||||
let docid3 = builder.update_document(&doc3, &stop_words)?;
|
||||
let view = database.commit_update(builder)?;
|
||||
|
||||
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
||||
@ -640,7 +636,6 @@ mod bench {
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
@ -650,7 +645,7 @@ mod bench {
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
builder.update_document(&document, &stop_words)?;
|
||||
}
|
||||
|
||||
database.commit_update(builder)?;
|
||||
@ -688,7 +683,6 @@ mod bench {
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
@ -698,7 +692,7 @@ mod bench {
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
builder.update_document(&document, &stop_words)?;
|
||||
}
|
||||
|
||||
database.commit_update(builder)?;
|
||||
@ -737,7 +731,6 @@ mod bench {
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
@ -747,7 +740,7 @@ mod bench {
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
builder.update_document(&document, &stop_words)?;
|
||||
}
|
||||
|
||||
database.commit_update(builder)?;
|
||||
@ -785,7 +778,6 @@ mod bench {
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
@ -795,7 +787,7 @@ mod bench {
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
builder.update_document(&document, &stop_words)?;
|
||||
}
|
||||
|
||||
let view = database.commit_update(builder)?;
|
||||
@ -833,7 +825,6 @@ mod bench {
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
@ -843,7 +834,7 @@ mod bench {
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
builder.update_document(&document, &stop_words)?;
|
||||
}
|
||||
|
||||
let view = database.commit_update(builder)?;
|
||||
@ -882,7 +873,6 @@ mod bench {
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
@ -892,7 +882,7 @@ mod bench {
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
builder.update_document(&document, &stop_words)?;
|
||||
}
|
||||
|
||||
let view = database.commit_update(builder)?;
|
||||
|
@ -3,23 +3,20 @@ use std::collections::HashSet;
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
use meilidb_core::{DocumentId, DocIndex};
|
||||
use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk};
|
||||
use meilidb_tokenizer::{Tokenizer, Token, is_cjk};
|
||||
|
||||
use crate::database::update::DocumentUpdate;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::database::schema::SchemaAttr;
|
||||
|
||||
pub struct IndexerSerializer<'a, 'b, B> {
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub struct IndexerSerializer<'a, 'b> {
|
||||
pub update: &'a mut DocumentUpdate<'b>,
|
||||
pub document_id: DocumentId,
|
||||
pub attribute: SchemaAttr,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
impl<'a, 'b> ser::Serializer for IndexerSerializer<'a, 'b> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
@ -49,7 +46,7 @@ where B: TokenizerBuilder
|
||||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for token in self.tokenizer_builder.build(v) {
|
||||
for token in Tokenizer::new(v) {
|
||||
let Token { word, word_index, char_index } = token;
|
||||
let document_id = self.document_id;
|
||||
|
||||
|
@ -2,7 +2,6 @@ use std::collections::HashSet;
|
||||
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
use meilidb_tokenizer::TokenizerBuilder;
|
||||
|
||||
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||
@ -12,25 +11,22 @@ use crate::database::serde::SerializerError;
|
||||
use crate::database::schema::Schema;
|
||||
use meilidb_core::DocumentId;
|
||||
|
||||
pub struct Serializer<'a, 'b, B> {
|
||||
pub struct Serializer<'a, 'b> {
|
||||
pub schema: &'a Schema,
|
||||
pub update: &'a mut DocumentUpdate<'b>,
|
||||
pub document_id: DocumentId,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapSerializer<'a, 'b, B>;
|
||||
type SerializeStruct = StructSerializer<'a, 'b, B>;
|
||||
type SerializeMap = MapSerializer<'a, 'b>;
|
||||
type SerializeStruct = StructSerializer<'a, 'b>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
@ -142,7 +138,6 @@ where B: TokenizerBuilder
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
stop_words: self.stop_words,
|
||||
current_key_name: None,
|
||||
})
|
||||
@ -158,7 +153,6 @@ where B: TokenizerBuilder
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
stop_words: self.stop_words,
|
||||
})
|
||||
}
|
||||
@ -175,18 +169,15 @@ where B: TokenizerBuilder
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapSerializer<'a, 'b, B> {
|
||||
pub struct MapSerializer<'a, 'b> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_id: DocumentId,
|
||||
pub update: &'a mut DocumentUpdate<'b>,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
pub current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
@ -223,7 +214,6 @@ where B: TokenizerBuilder
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
stop_words: self.stop_words,
|
||||
@ -244,17 +234,14 @@ where B: TokenizerBuilder
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a, 'b, B> {
|
||||
pub struct StructSerializer<'a, 'b> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_id: DocumentId,
|
||||
pub update: &'a mut DocumentUpdate<'b>,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
@ -274,7 +261,6 @@ where B: TokenizerBuilder
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
stop_words: self.stop_words,
|
||||
|
@ -8,7 +8,6 @@ use serde::Serialize;
|
||||
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||
use meilidb_core::data::DocIds;
|
||||
use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
|
||||
use meilidb_tokenizer::TokenizerBuilder;
|
||||
|
||||
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::database::serde::serializer::Serializer;
|
||||
@ -36,21 +35,18 @@ impl Update {
|
||||
Update { schema, raw_builder: RawUpdateBuilder::new() }
|
||||
}
|
||||
|
||||
pub fn update_document<T, B>(
|
||||
pub fn update_document<T>(
|
||||
&mut self,
|
||||
document: T,
|
||||
tokenizer_builder: &B,
|
||||
stop_words: &HashSet<String>,
|
||||
) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
B: TokenizerBuilder,
|
||||
{
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
document_id: document_id,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
update: &mut self.raw_builder.document_update(document_id)?,
|
||||
stop_words: stop_words,
|
||||
};
|
||||
|
Loading…
x
Reference in New Issue
Block a user