From a960c325f30f38be6a63634b3bd621daf82912a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sat, 23 Feb 2019 14:57:13 +0100
Subject: [PATCH] feat: Make query strings support cjk kanjis

---
 Cargo.toml                |  5 +++-
 src/lib.rs                | 19 ++++++++++++++
 src/rank/query_builder.rs | 54 ++++++++++++++++++++++++++++++++++-----
 src/tokenizer/mod.rs      | 12 ++-------
 4 files changed, 72 insertions(+), 18 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index cffc51348..37e7ea680 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,9 +21,12 @@ serde = "1.0"
 serde_derive = "1.0"
 serde_json = { version = "1.0", features = ["preserve_order"] }
 size_format = "1.0"
-slice-group-by = "0.2"
 unidecode = "0.3"
 
+[dependencies.slice-group-by]
+git = "https://github.com/Kerollmops/slice-group-by.git"
+tag = "v0.2.3-alpha.1"
+
 [dependencies.toml]
 git = "https://github.com/Kerollmops/toml-rs.git"
 features = ["preserve_order"]
diff --git a/src/lib.rs b/src/lib.rs
index a111b5049..e77e03ecb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -16,6 +16,25 @@ pub use rocksdb;
 pub use self::tokenizer::Tokenizer;
 pub use self::common_words::CommonWords;
 
+/// Returns `true` when `c` falls in one of the CJK Unicode blocks listed below.
+///
+/// The tokenizer and the query builder both call this predicate, so indexed
+/// text and query strings agree on which characters count as CJK.
+pub fn is_cjk(c: char) -> bool {
+    match c {
+        '\u{2e80}'..='\u{2eff}'     // CJK Radicals Supplement
+        | '\u{2f00}'..='\u{2fdf}'   // Kangxi Radicals
+        | '\u{3040}'..='\u{309f}'   // Hiragana
+        | '\u{30a0}'..='\u{30ff}'   // Katakana
+        | '\u{3100}'..='\u{312f}'   // Bopomofo
+        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
+        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
+        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
+        | '\u{f900}'..='\u{faff}' => true, // CJK Compatibility Ideographs
+        _ => false,
+    }
+}
+
 /// Represent an internally generated document unique identifier.
 ///
 /// It is used to inform the database the document you want to deserialize.
diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs
index f9415b638..0f3643554 100644
--- a/src/rank/query_builder.rs
+++ b/src/rank/query_builder.rs
@@ -6,7 +6,7 @@ use std::hash::Hash;
 use std::rc::Rc;
 
 use rayon::slice::ParallelSliceMut;
-use slice_group_by::GroupByMut;
+use slice_group_by::{GroupByMut, LinearStrGroupBy};
 use hashbrown::HashMap;
 use fst::Streamer;
 use rocksdb::DB;
@@ -16,17 +16,57 @@ use crate::automaton::{self, DfaExt, AutomatonExt};
 use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::rank::criterion::Criteria;
 use crate::database::DatabaseView;
-use crate::{Match, DocumentId};
 use crate::rank::{raw_documents_from_matches, RawDocument, Document};
+use crate::{is_cjk, Match, DocumentId};
+
+/// Coarse character class used to cut a query string into groups.
+#[derive(Debug, PartialEq, Eq)]
+enum CharCategory {
+    Space,
+    Cjk,
+    Other,
+}
+
+/// Maps a character to its `CharCategory`; the whitespace test runs before the CJK test.
+fn classify_char(c: char) -> CharCategory {
+    if c.is_whitespace() {
+        CharCategory::Space
+    } else if is_cjk(c) {
+        CharCategory::Cjk
+    } else {
+        CharCategory::Other
+    }
+}
+
+/// Returns `true` when the group contains no whitespace character.
+///
+/// Takes `&&str` so it can be handed directly to `Iterator::filter`.
+fn is_word(s: &&str) -> bool {
+    !s.chars().any(char::is_whitespace)
+}
+
+/// Tells whether two adjacent characters belong to the same group.
+///
+/// A CJK character never groups with a neighbour, so each one forms a group
+/// of its own; any other pair groups when both categories are equal.
+fn same_group_category(a: char, b: char) -> bool {
+    match (classify_char(a), classify_char(b)) {
+        (CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
+        (ca, cb) => ca == cb,
+    }
+}
 
 fn split_whitespace_automatons(query: &str) -> Vec {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let mut automatons = Vec::new();
-    let mut words = query.split_whitespace().map(str::to_lowercase).peekable();
+    let mut groups = LinearStrGroupBy::new(query, same_group_category)
+        .filter(is_word)
+        .map(str::to_lowercase)
+        .peekable();
 
-    while let Some(word) = words.next() {
-        let has_following_word = words.peek().is_some();
-        let lev = if has_following_word || has_end_whitespace {
+    let mut automatons = Vec::new();
+    while let Some(word) = groups.next() {
+        let has_following_word = groups.peek().is_some();
+        let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
             automaton::build_dfa(&word)
         } else {
             automaton::build_prefix_dfa(&word)
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index f4c42b7d4..ed146c06f 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1,4 +1,5 @@
 use std::mem;
+use crate::is_cjk;
 use self::Separator::*;
 
 pub trait TokenizerBuilder {
@@ -114,16 +115,7 @@ impl<'a> Iterator for Tokenizer<'a> {
                 None => {
                     // if this is a Chinese, a Japanese or a Korean character
                     // See
-                    if (c >= '\u{2e80}' && c <= '\u{2eff}') ||
-                       (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
-                       (c >= '\u{3040}' && c <= '\u{309f}') ||
-                       (c >= '\u{30a0}' && c <= '\u{30ff}') ||
-                       (c >= '\u{3100}' && c <= '\u{312f}') ||
-                       (c >= '\u{3200}' && c <= '\u{32ff}') ||
-                       (c >= '\u{3400}' && c <= '\u{4dbf}') ||
-                       (c >= '\u{4e00}' && c <= '\u{9fff}') ||
-                       (c >= '\u{f900}' && c <= '\u{faff}')
-                    {
+                    if is_cjk(c) {
                         match start_word {
                             Some(start_word) => {
                                 let (prefix, tail) = self.inner.split_at(i);