Mirror of https://github.com/meilisearch/MeiliSearch
feat: Move tokenizer things into the meilidb-tokenizer workspace
This commit is contained in:
parent
d8cbb03c42
commit
1897da5348
10 changed files with 28 additions and 22 deletions
@@ -13,6 +13,8 @@ hashbrown = { version = "0.1.8", features = ["serde"] }
 linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
 lockfree = "0.5.1"
 log = "0.4.6"
+meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
 sdset = "0.3.1"
 serde = "1.0.88"
 serde_derive = "1.0.88"
@@ -20,7 +22,6 @@ serde_json = { version = "1.0.38", features = ["preserve_order"] }
 size_format = "1.0.2"
 slice-group-by = "0.2.4"
 unidecode = "0.3.0"
-meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
 
 [dependencies.toml]
 git = "https://github.com/Kerollmops/toml-rs.git"
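With the path dependency in place, each `use crate::tokenizer::…` and `use crate::is_cjk` in the hunks below becomes an import from the extracted crate; consolidated, the new imports amount to the following (the grouping is mine, but every item appears individually in the hunks):

use meilidb_tokenizer::{DefaultBuilder, TokenizerBuilder, Token, is_cjk};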

@@ -430,9 +430,9 @@ mod tests {
     use std::error::Error;
 
     use serde_derive::{Serialize, Deserialize};
+    use meilidb_tokenizer::DefaultBuilder;
 
     use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
-    use crate::tokenizer::DefaultBuilder;
 
     use super::*;
 
@@ -3,13 +3,11 @@ use std::collections::HashSet;
 use serde::Serialize;
 use serde::ser;
 use meilidb_core::{DocumentId, DocIndex};
+use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk};
 
 use crate::database::update::DocumentUpdate;
 use crate::database::serde::SerializerError;
 use crate::database::schema::SchemaAttr;
-use crate::tokenizer::TokenizerBuilder;
-use crate::tokenizer::Token;
-use crate::is_cjk;
 
 pub struct IndexerSerializer<'a, 'b, B> {
     pub tokenizer_builder: &'a B,
@@ -2,13 +2,13 @@ use std::collections::HashSet;
 
 use serde::Serialize;
 use serde::ser;
+use meilidb_tokenizer::TokenizerBuilder;
 
 use crate::database::serde::indexer_serializer::IndexerSerializer;
 use crate::database::serde::key_to_string::KeyToStringSerializer;
 use crate::database::serde::value_to_number::ValueToNumberSerializer;
 use crate::database::update::DocumentUpdate;
 use crate::database::serde::SerializerError;
-use crate::tokenizer::TokenizerBuilder;
 use crate::database::schema::Schema;
 use meilidb_core::DocumentId;
 
@@ -8,6 +8,7 @@ use serde::Serialize;
 use meilidb_core::write_to_bytes::WriteToBytes;
 use meilidb_core::data::DocIds;
 use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
+use meilidb_tokenizer::TokenizerBuilder;
 
 use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
 use crate::database::serde::serializer::Serializer;
@@ -16,7 +17,6 @@ use crate::database::schema::SchemaAttr;
 use crate::database::schema::Schema;
 use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
 use crate::database::{RankedMap, Number};
-use crate::tokenizer::TokenizerBuilder;
 
 pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
 pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};

@@ -1,24 +1,10 @@
 #![cfg_attr(feature = "nightly", feature(test))]
 
 pub mod database;
-pub mod tokenizer;
 mod common_words;
 mod sort_by_attr;
 
 pub use rocksdb;
 
 pub use self::sort_by_attr::SortByAttr;
-pub use self::tokenizer::Tokenizer;
 pub use self::common_words::CommonWords;
-
-pub fn is_cjk(c: char) -> bool {
-    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
-    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
-    (c >= '\u{3040}' && c <= '\u{309f}') ||
-    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
-    (c >= '\u{3100}' && c <= '\u{312f}') ||
-    (c >= '\u{3200}' && c <= '\u{32ff}') ||
-    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
-    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
-    (c >= '\u{f900}' && c <= '\u{faff}')
-}
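The is_cjk helper moves out with the tokenizer; per the import hunks above it is now provided by meilidb_tokenizer. A quick behavioral sketch against the ranges deleted here (the sample characters are mine):

use meilidb_tokenizer::is_cjk;

fn main() {
    assert!(is_cjk('好'));   // U+597D, CJK Unified Ideographs (4e00..9fff)
    assert!(is_cjk('あ'));   // U+3042, Hiragana (3040..309f)
    assert!(!is_cjk('a'));   // ASCII falls outside every listed block
    assert!(!is_cjk('😂'));  // emoji are not covered by these ranges either
}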

@@ -1,259 +0,0 @@
-use std::mem;
-use crate::is_cjk;
-use self::Separator::*;
-
-pub trait TokenizerBuilder {
-    fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
-}
-
-pub struct DefaultBuilder;
-
-impl DefaultBuilder {
-    pub fn new() -> DefaultBuilder {
-        DefaultBuilder
-    }
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub struct Token<'a> {
-    pub word: &'a str,
-    pub word_index: usize,
-    pub char_index: usize,
-}
-
-impl TokenizerBuilder for DefaultBuilder {
-    fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
-        Box::new(Tokenizer::new(text))
-    }
-}
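This whole module is deleted here as it moves into the new meilidb-tokenizer crate. TokenizerBuilder is the seam the serializers are generic over: anything that can turn a &str into a boxed iterator of Tokens can drive indexing, with DefaultBuilder simply delegating to the Tokenizer below. A minimal sketch of an alternative implementation (hypothetical, whitespace-only, no offset tracking):

struct WhitespaceBuilder;

impl TokenizerBuilder for WhitespaceBuilder {
    fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
        // one Token per whitespace-separated word; a real builder would
        // also maintain char_index the way Tokenizer::next does below
        Box::new(text.split_whitespace().enumerate().map(|(i, word)| Token {
            word: word,
            word_index: i,
            char_index: 0, // simplification: character offsets not tracked
        }))
    }
}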
-
-pub struct Tokenizer<'a> {
-    word_index: usize,
-    char_index: usize,
-    inner: &'a str,
-}
-
-impl<'a> Tokenizer<'a> {
-    pub fn new(string: &str) -> Tokenizer {
-        let mut char_advance = 0;
-        let mut index_advance = 0;
-        for (n, (i, c)) in string.char_indices().enumerate() {
-            char_advance = n;
-            index_advance = i;
-            if detect_separator(c).is_none() { break }
-        }
-
-        Tokenizer {
-            word_index: 0,
-            char_index: char_advance,
-            inner: &string[index_advance..],
-        }
-    }
-}
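Tokenizer::new scans past any leading separators so that iteration starts on the first word, pre-advancing char_index by the number of characters skipped. For example (the expected values match the hard test further down):

let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
// the four leading separator characters are skipped, so the first
// token comes out as Token { word: "yo", word_index: 0, char_index: 4 }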
-
-#[derive(Debug, Clone, Copy)]
-enum Separator {
-    Short,
-    Long,
-}
-
-impl Separator {
-    fn add(self, add: Separator) -> Separator {
-        match (self, add) {
-            (_, Long) => Long,
-            (Short, Short) => Short,
-            (Long, Short) => Long,
-        }
-    }
-
-    fn to_usize(self) -> usize {
-        match self {
-            Short => 1,
-            Long => 8,
-        }
-    }
-}
-
-fn detect_separator(c: char) -> Option<Separator> {
-    match c {
-        '.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long),
-        ' ' | '\'' | '"' => Some(Short),
-        _ => None,
-    }
-}
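The two separator strengths are what produce the word_index gaps in the tests below: a soft separator (space, quote) advances the next word's index by 1, a hard one (punctuation) by 8, and a run of mixed separators collapses to the strongest via add. As a sketch:

// "yo lolo"   -> "lolo" gets word_index 1 (Short)
// "yo ! lolo" -> "lolo" gets word_index 8 (the run collapses to Long)
assert_eq!(Short.add(Long).add(Short).to_usize(), 8);
assert_eq!(Short.add(Short).to_usize(), 1);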
-
-impl<'a> Iterator for Tokenizer<'a> {
-    type Item = Token<'a>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let mut start_word = None;
-        let mut distance = None;
-
-        for (i, c) in self.inner.char_indices() {
-            match detect_separator(c) {
-                Some(sep) => {
-                    if let Some(start_word) = start_word {
-                        let (prefix, tail) = self.inner.split_at(i);
-                        let (spaces, word) = prefix.split_at(start_word);
-
-                        self.inner = tail;
-                        self.char_index += spaces.chars().count();
-                        self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
-
-                        let token = Token {
-                            word: word,
-                            word_index: self.word_index,
-                            char_index: self.char_index,
-                        };
-
-                        self.char_index += word.chars().count();
-                        return Some(token)
-                    }
-
-                    distance = Some(distance.map_or(sep, |s| s.add(sep)));
-                },
-                None => {
-                    // if this is a Chinese, a Japanese or a Korean character
-                    // See <http://unicode-table.com>
-                    if is_cjk(c) {
-                        match start_word {
-                            Some(start_word) => {
-                                let (prefix, tail) = self.inner.split_at(i);
-                                let (spaces, word) = prefix.split_at(start_word);
-
-                                self.inner = tail;
-                                self.char_index += spaces.chars().count();
-                                self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
-
-                                let token = Token {
-                                    word: word,
-                                    word_index: self.word_index,
-                                    char_index: self.char_index,
-                                };
-
-                                self.word_index += 1;
-                                self.char_index += word.chars().count();
-
-                                return Some(token)
-                            },
-                            None => {
-                                let (prefix, tail) = self.inner.split_at(i + c.len_utf8());
-                                let (spaces, word) = prefix.split_at(i);
-
-                                self.inner = tail;
-                                self.char_index += spaces.chars().count();
-                                self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
-
-                                let token = Token {
-                                    word: word,
-                                    word_index: self.word_index,
-                                    char_index: self.char_index,
-                                };
-
-                                if tail.chars().next().and_then(detect_separator).is_none() {
-                                    self.word_index += 1;
-                                }
-                                self.char_index += 1;
-
-                                return Some(token)
-                            }
-                        }
-                    }
-
-                    if start_word.is_none() { start_word = Some(i) }
-                },
-            }
-        }
-
-        if let Some(start_word) = start_word {
-            let prefix = mem::replace(&mut self.inner, "");
-            let (spaces, word) = prefix.split_at(start_word);
-
-            let token = Token {
-                word: word,
-                word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
-                char_index: self.char_index + spaces.chars().count(),
-            };
-            return Some(token)
-        }
-
-        None
-    }
-}
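End to end, the iterator emits one Token per word, with separator-weighted gaps in word_index and character-accurate char_index. A minimal usage sketch (the expected values mirror the hard test below):

let mut tokens = Tokenizer::new("yo ! lolo");
assert_eq!(tokens.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokens.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokens.next(), None);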
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn easy() {
-        let mut tokenizer = Tokenizer::new("salut");
-
-        assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), None);
-
-        let mut tokenizer = Tokenizer::new("yo ");
-
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), None);
-    }
-
-    #[test]
-    fn hard() {
-        let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
-
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
-        assert_eq!(tokenizer.next(), None);
-
-        let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
-
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
-        assert_eq!(tokenizer.next(), None);
-    }
-
-    #[test]
-    fn hard_long_chars() {
-        let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
-
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
-        assert_eq!(tokenizer.next(), None);
-
-        let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
-
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
-        assert_eq!(tokenizer.next(), None);
-    }
-
-    #[test]
-    fn hard_kanjis() {
-        let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
-
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
-        assert_eq!(tokenizer.next(), None);
-
-        let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
-
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
-        assert_eq!(tokenizer.next(), None);
-    }
-}