From 1897da53483d7b7b36367923e642972b4b8f562f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Feb 2019 18:24:46 +0100 Subject: [PATCH] feat: Move tokenizer things into the meilidb-tokenizer workspace --- Cargo.toml | 1 + meilidb-core/Cargo.toml | 1 + meilidb-tokenizer/Cargo.toml | 8 ++++++++ .../mod.rs => meilidb-tokenizer/src/lib.rs | 13 ++++++++++++- meilidb/Cargo.toml | 3 ++- meilidb/src/database/mod.rs | 2 +- meilidb/src/database/serde/indexer_serializer.rs | 4 +--- meilidb/src/database/serde/serializer.rs | 2 +- meilidb/src/database/update/mod.rs | 2 +- meilidb/src/lib.rs | 14 -------------- 10 files changed, 28 insertions(+), 22 deletions(-) create mode 100644 meilidb-tokenizer/Cargo.toml rename meilidb/src/tokenizer/mod.rs => meilidb-tokenizer/src/lib.rs (95%) diff --git a/Cargo.toml b/Cargo.toml index df9c871ba..139e8b472 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,4 +2,5 @@ members = [ "meilidb", "meilidb-core", + "meilidb-tokenizer", ] diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 2c5ec0680..fbac7dbe2 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -11,6 +11,7 @@ hashbrown = "0.1.8" lazy_static = "1.2.0" levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } log = "0.4.6" +meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } rayon = "1.0.3" sdset = "0.3.1" serde = "1.0.88" diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml new file mode 100644 index 000000000..c2077533e --- /dev/null +++ b/meilidb-tokenizer/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "meilidb-tokenizer" +version = "0.1.0" +authors = ["Clément Renault "] +edition = "2018" + +[dependencies] + diff --git a/meilidb/src/tokenizer/mod.rs b/meilidb-tokenizer/src/lib.rs similarity index 95% rename from meilidb/src/tokenizer/mod.rs rename to meilidb-tokenizer/src/lib.rs index ed146c06f..7c4c8f915 100644 --- a/meilidb/src/tokenizer/mod.rs +++ b/meilidb-tokenizer/src/lib.rs @@ -1,7 +1,18 @@ use std::mem; -use crate::is_cjk; use self::Separator::*; +pub fn is_cjk(c: char) -> bool { + (c >= '\u{2e80}' && c <= '\u{2eff}') || + (c >= '\u{2f00}' && c <= '\u{2fdf}') || + (c >= '\u{3040}' && c <= '\u{309f}') || + (c >= '\u{30a0}' && c <= '\u{30ff}') || + (c >= '\u{3100}' && c <= '\u{312f}') || + (c >= '\u{3200}' && c <= '\u{32ff}') || + (c >= '\u{3400}' && c <= '\u{4dbf}') || + (c >= '\u{4e00}' && c <= '\u{9fff}') || + (c >= '\u{f900}' && c <= '\u{faff}') +} + pub trait TokenizerBuilder { fn build<'a>(&self, text: &'a str) -> Box> + 'a>; } diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 1600feb04..8a042168c 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -13,6 +13,8 @@ hashbrown = { version = "0.1.8", features = ["serde"] } linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } lockfree = "0.5.1" log = "0.4.6" +meilidb-core = { path = "../meilidb-core", version = "0.1.0" } +meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } sdset = "0.3.1" serde = "1.0.88" serde_derive = "1.0.88" @@ -20,7 +22,6 @@ serde_json = { version = "1.0.38", features = ["preserve_order"] } size_format = "1.0.2" slice-group-by = "0.2.4" unidecode = "0.3.0" -meilidb-core = { path = "../meilidb-core", version = "0.1.0" } [dependencies.toml] git = "https://github.com/Kerollmops/toml-rs.git" diff --git a/meilidb/src/database/mod.rs b/meilidb/src/database/mod.rs index 727a30bac..08ca6cd7f 100644 --- a/meilidb/src/database/mod.rs +++ b/meilidb/src/database/mod.rs @@ -430,9 +430,9 @@ mod tests { use std::error::Error; use serde_derive::{Serialize, Deserialize}; + use meilidb_tokenizer::DefaultBuilder; use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; - use crate::tokenizer::DefaultBuilder; use super::*; diff --git a/meilidb/src/database/serde/indexer_serializer.rs b/meilidb/src/database/serde/indexer_serializer.rs index 2734fb3be..f718111dc 100644 --- a/meilidb/src/database/serde/indexer_serializer.rs +++ b/meilidb/src/database/serde/indexer_serializer.rs @@ -3,13 +3,11 @@ use std::collections::HashSet; use serde::Serialize; use serde::ser; use meilidb_core::{DocumentId, DocIndex}; +use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk}; use crate::database::update::DocumentUpdate; use crate::database::serde::SerializerError; use crate::database::schema::SchemaAttr; -use crate::tokenizer::TokenizerBuilder; -use crate::tokenizer::Token; -use crate::is_cjk; pub struct IndexerSerializer<'a, 'b, B> { pub tokenizer_builder: &'a B, diff --git a/meilidb/src/database/serde/serializer.rs b/meilidb/src/database/serde/serializer.rs index 7e38f938e..8b3a05b46 100644 --- a/meilidb/src/database/serde/serializer.rs +++ b/meilidb/src/database/serde/serializer.rs @@ -2,13 +2,13 @@ use std::collections::HashSet; use serde::Serialize; use serde::ser; +use meilidb_tokenizer::TokenizerBuilder; use crate::database::serde::indexer_serializer::IndexerSerializer; use crate::database::serde::key_to_string::KeyToStringSerializer; use crate::database::serde::value_to_number::ValueToNumberSerializer; use crate::database::update::DocumentUpdate; use crate::database::serde::SerializerError; -use crate::tokenizer::TokenizerBuilder; use crate::database::schema::Schema; use meilidb_core::DocumentId; diff --git a/meilidb/src/database/update/mod.rs b/meilidb/src/database/update/mod.rs index eaae462b2..720b7aaf3 100644 --- a/meilidb/src/database/update/mod.rs +++ b/meilidb/src/database/update/mod.rs @@ -8,6 +8,7 @@ use serde::Serialize; use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::data::DocIds; use meilidb_core::{IndexBuilder, DocumentId, DocIndex}; +use meilidb_tokenizer::TokenizerBuilder; use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; use crate::database::serde::serializer::Serializer; @@ -16,7 +17,6 @@ use crate::database::schema::SchemaAttr; use crate::database::schema::Schema; use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; use crate::database::{RankedMap, Number}; -use crate::tokenizer::TokenizerBuilder; pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs index 73de9ff4d..325df65eb 100644 --- a/meilidb/src/lib.rs +++ b/meilidb/src/lib.rs @@ -1,24 +1,10 @@ #![cfg_attr(feature = "nightly", feature(test))] pub mod database; -pub mod tokenizer; mod common_words; mod sort_by_attr; pub use rocksdb; pub use self::sort_by_attr::SortByAttr; -pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; - -pub fn is_cjk(c: char) -> bool { - (c >= '\u{2e80}' && c <= '\u{2eff}') || - (c >= '\u{2f00}' && c <= '\u{2fdf}') || - (c >= '\u{3040}' && c <= '\u{309f}') || - (c >= '\u{30a0}' && c <= '\u{30ff}') || - (c >= '\u{3100}' && c <= '\u{312f}') || - (c >= '\u{3200}' && c <= '\u{32ff}') || - (c >= '\u{3400}' && c <= '\u{4dbf}') || - (c >= '\u{4e00}' && c <= '\u{9fff}') || - (c >= '\u{f900}' && c <= '\u{faff}') -}