mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Merge pull request #119 from Kerollmops/dont-be-hurry
Fix the tokenizer (next time don't be so hurry to merge)
This commit is contained in:
commit
64971de7ed
@ -21,9 +21,12 @@ serde = "1.0"
|
|||||||
serde_derive = "1.0"
|
serde_derive = "1.0"
|
||||||
serde_json = { version = "1.0", features = ["preserve_order"] }
|
serde_json = { version = "1.0", features = ["preserve_order"] }
|
||||||
size_format = "1.0"
|
size_format = "1.0"
|
||||||
slice-group-by = "0.2"
|
|
||||||
unidecode = "0.3"
|
unidecode = "0.3"
|
||||||
|
|
||||||
|
[dependencies.slice-group-by]
|
||||||
|
git = "https://github.com/Kerollmops/slice-group-by.git"
|
||||||
|
tag = "v0.2.3-alpha.1"
|
||||||
|
|
||||||
[dependencies.toml]
|
[dependencies.toml]
|
||||||
git = "https://github.com/Kerollmops/toml-rs.git"
|
git = "https://github.com/Kerollmops/toml-rs.git"
|
||||||
features = ["preserve_order"]
|
features = ["preserve_order"]
|
||||||
|
12
src/lib.rs
12
src/lib.rs
@ -16,6 +16,18 @@ pub use rocksdb;
|
|||||||
pub use self::tokenizer::Tokenizer;
|
pub use self::tokenizer::Tokenizer;
|
||||||
pub use self::common_words::CommonWords;
|
pub use self::common_words::CommonWords;
|
||||||
|
|
||||||
|
/// Returns `true` when `c` lies in one of the Unicode ranges this crate
/// treats as a Chinese, Japanese or Korean character.
///
/// The inclusive ranges checked are:
///
/// - `U+2E80..=U+2EFF` (CJK Radicals Supplement)
/// - `U+2F00..=U+2FDF` (Kangxi Radicals)
/// - `U+3040..=U+309F` (Hiragana)
/// - `U+30A0..=U+30FF` (Katakana)
/// - `U+3100..=U+312F` (Bopomofo)
/// - `U+3200..=U+32FF` (Enclosed CJK Letters and Months)
/// - `U+3400..=U+4DBF` (CJK Unified Ideographs Extension A)
/// - `U+4E00..=U+9FFF` (CJK Unified Ideographs)
/// - `U+F900..=U+FAFF` (CJK Compatibility Ideographs)
///
/// Any character outside these ranges yields `false`; note that none of the
/// ranges covers the Hangul syllables block (`U+AC00` and up).
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}' => true,
        '\u{2f00}'..='\u{2fdf}' => true,
        '\u{3040}'..='\u{309f}' => true,
        '\u{30a0}'..='\u{30ff}' => true,
        '\u{3100}'..='\u{312f}' => true,
        '\u{3200}'..='\u{32ff}' => true,
        '\u{3400}'..='\u{4dbf}' => true,
        '\u{4e00}'..='\u{9fff}' => true,
        '\u{f900}'..='\u{faff}' => true,
        _ => false,
    }
}
|
||||||
|
|
||||||
/// Represent an internally generated document unique identifier.
|
/// Represent an internally generated document unique identifier.
|
||||||
///
|
///
|
||||||
/// It is used to inform the database the document you want to deserialize.
|
/// It is used to inform the database the document you want to deserialize.
|
||||||
|
@ -6,7 +6,7 @@ use std::hash::Hash;
|
|||||||
use std::rc::Rc;
|
use std::rc::Rc;
|
||||||
|
|
||||||
use rayon::slice::ParallelSliceMut;
|
use rayon::slice::ParallelSliceMut;
|
||||||
use slice_group_by::GroupByMut;
|
use slice_group_by::{GroupByMut, LinearStrGroupBy};
|
||||||
use hashbrown::HashMap;
|
use hashbrown::HashMap;
|
||||||
use fst::Streamer;
|
use fst::Streamer;
|
||||||
use rocksdb::DB;
|
use rocksdb::DB;
|
||||||
@ -16,17 +16,43 @@ use crate::automaton::{self, DfaExt, AutomatonExt};
|
|||||||
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
|
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
|
||||||
use crate::rank::criterion::Criteria;
|
use crate::rank::criterion::Criteria;
|
||||||
use crate::database::DatabaseView;
|
use crate::database::DatabaseView;
|
||||||
use crate::{Match, DocumentId};
|
|
||||||
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
|
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
|
||||||
|
use crate::{is_cjk, Match, DocumentId};
|
||||||
|
|
||||||
|
/// Coarse classification of a single character, used to decide where a
/// query string is cut into groups.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharCategory {
    /// The character is whitespace.
    Space,
    /// The character is a Chinese, Japanese or Korean character.
    Cjk,
    /// Anything that is neither whitespace nor CJK.
    Other,
}
|
||||||
|
|
||||||
|
fn classify_char(c: char) -> CharCategory {
|
||||||
|
if c.is_whitespace() { CharCategory::Space }
|
||||||
|
else if is_cjk(c) { CharCategory::Cjk }
|
||||||
|
else { CharCategory::Other }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tells whether a group is a word, i.e. contains no whitespace character.
///
/// Takes `&&str` so it can be handed directly to `Iterator::filter` on an
/// iterator of `&str`. An empty string counts as a word.
fn is_word(s: &&str) -> bool {
    s.chars().all(|ch| !ch.is_whitespace())
}
|
||||||
|
|
||||||
|
fn same_group_category(a: char, b: char) -> bool {
|
||||||
|
let ca = classify_char(a);
|
||||||
|
let cb = classify_char(b);
|
||||||
|
if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
|
||||||
|
}
|
||||||
|
|
||||||
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
|
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
let mut automatons = Vec::new();
|
let mut groups = LinearStrGroupBy::new(query, same_group_category)
|
||||||
let mut words = query.split_whitespace().map(str::to_lowercase).peekable();
|
.filter(is_word)
|
||||||
|
.map(str::to_lowercase)
|
||||||
|
.peekable();
|
||||||
|
|
||||||
while let Some(word) = words.next() {
|
let mut automatons = Vec::new();
|
||||||
let has_following_word = words.peek().is_some();
|
while let Some(word) = groups.next() {
|
||||||
let lev = if has_following_word || has_end_whitespace {
|
let has_following_word = groups.peek().is_some();
|
||||||
|
let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
|
||||||
automaton::build_dfa(&word)
|
automaton::build_dfa(&word)
|
||||||
} else {
|
} else {
|
||||||
automaton::build_prefix_dfa(&word)
|
automaton::build_prefix_dfa(&word)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use std::mem;
|
use std::mem;
|
||||||
|
use crate::is_cjk;
|
||||||
use self::Separator::*;
|
use self::Separator::*;
|
||||||
|
|
||||||
pub trait TokenizerBuilder {
|
pub trait TokenizerBuilder {
|
||||||
@ -105,8 +106,6 @@ impl<'a> Iterator for Tokenizer<'a> {
|
|||||||
char_index: self.char_index,
|
char_index: self.char_index,
|
||||||
};
|
};
|
||||||
|
|
||||||
println!("no-cjk with start_word returns: {:?}", token);
|
|
||||||
|
|
||||||
self.char_index += word.chars().count();
|
self.char_index += word.chars().count();
|
||||||
return Some(token)
|
return Some(token)
|
||||||
}
|
}
|
||||||
@ -116,18 +115,7 @@ impl<'a> Iterator for Tokenizer<'a> {
|
|||||||
None => {
|
None => {
|
||||||
// if this is a Chinese, a Japanese or a Korean character
|
// if this is a Chinese, a Japanese or a Korean character
|
||||||
// See <http://unicode-table.com>
|
// See <http://unicode-table.com>
|
||||||
if (c >= '\u{2e80}' && c <= '\u{2eff}') ||
|
if is_cjk(c) {
|
||||||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
|
|
||||||
(c >= '\u{3040}' && c <= '\u{309f}') ||
|
|
||||||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
|
|
||||||
(c >= '\u{3100}' && c <= '\u{312f}') ||
|
|
||||||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
|
|
||||||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
|
|
||||||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
|
|
||||||
(c >= '\u{f900}' && c <= '\u{faff}')
|
|
||||||
{
|
|
||||||
let char_len = c.len_utf8();
|
|
||||||
|
|
||||||
match start_word {
|
match start_word {
|
||||||
Some(start_word) => {
|
Some(start_word) => {
|
||||||
let (prefix, tail) = self.inner.split_at(i);
|
let (prefix, tail) = self.inner.split_at(i);
|
||||||
@ -143,15 +131,13 @@ impl<'a> Iterator for Tokenizer<'a> {
|
|||||||
char_index: self.char_index,
|
char_index: self.char_index,
|
||||||
};
|
};
|
||||||
|
|
||||||
println!("cjk with start_word returns: {:?}", token);
|
|
||||||
|
|
||||||
self.word_index += 1;
|
self.word_index += 1;
|
||||||
self.char_index += word.chars().count();
|
self.char_index += word.chars().count();
|
||||||
|
|
||||||
return Some(token)
|
return Some(token)
|
||||||
},
|
},
|
||||||
None => {
|
None => {
|
||||||
let (prefix, tail) = self.inner.split_at(i + char_len);
|
let (prefix, tail) = self.inner.split_at(i + c.len_utf8());
|
||||||
let (spaces, word) = prefix.split_at(i);
|
let (spaces, word) = prefix.split_at(i);
|
||||||
|
|
||||||
self.inner = tail;
|
self.inner = tail;
|
||||||
@ -164,12 +150,10 @@ impl<'a> Iterator for Tokenizer<'a> {
|
|||||||
char_index: self.char_index,
|
char_index: self.char_index,
|
||||||
};
|
};
|
||||||
|
|
||||||
println!("cjk without start_word returns: {:?}", token);
|
|
||||||
|
|
||||||
if tail.chars().next().and_then(detect_separator).is_none() {
|
if tail.chars().next().and_then(detect_separator).is_none() {
|
||||||
self.word_index += 1;
|
self.word_index += 1;
|
||||||
}
|
}
|
||||||
self.char_index += char_len;
|
self.char_index += 1;
|
||||||
|
|
||||||
return Some(token)
|
return Some(token)
|
||||||
}
|
}
|
||||||
@ -258,18 +242,18 @@ mod tests {
|
|||||||
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
|
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
|
||||||
|
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 3 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 10 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
|
||||||
assert_eq!(tokenizer.next(), None);
|
assert_eq!(tokenizer.next(), None);
|
||||||
|
|
||||||
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
|
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
|
||||||
|
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 3 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 6 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 10 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 20 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 29 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
|
||||||
assert_eq!(tokenizer.next(), None);
|
assert_eq!(tokenizer.next(), None);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user