mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 04:17:10 +02:00
integration with new tokenizer wip
This commit is contained in:
parent
8a4d05b7bb
commit
5e00842087
5 changed files with 94 additions and 60 deletions
|
@ -11,7 +11,6 @@ use meilisearch_core::criterion::*;
|
|||
use meilisearch_core::settings::RankingRule;
|
||||
use meilisearch_core::{Highlight, Index, RankedMap};
|
||||
use meilisearch_schema::{FieldId, Schema};
|
||||
use meilisearch_tokenizer::is_cjk;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use siphasher::sip::SipHasher;
|
||||
|
@ -344,7 +343,7 @@ pub struct SearchResult {
|
|||
|
||||
/// Returns the start index and the length of the crop.
|
||||
fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
|
||||
let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c);
|
||||
let is_word_component = |c: &char| c.is_alphanumeric() && !super::is_cjk(*c);
|
||||
|
||||
let word_end_index = |mut index| {
|
||||
if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) {
|
||||
|
|
|
@ -5,3 +5,22 @@ pub mod compression;
|
|||
|
||||
pub use authentication::Authentication;
|
||||
pub use normalize_path::NormalizePath;
|
||||
|
||||
/// Returns `true` when `c` lies in one of the CJK-related Unicode ranges listed below.
///
/// This is a pure code-point range test: no normalization or script lookup is
/// performed, and code points outside the Basic Multilingual Plane are never
/// matched because every listed range ends at or below `U+FFEF`.
pub fn is_cjk(c: char) -> bool {
    matches!(
        c,
        '\u{1100}'..='\u{11ff}'     // Hangul Jamo
        | '\u{2e80}'..='\u{2eff}'   // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}'   // Kangxi radical
        | '\u{3000}'..='\u{303f}'   // Japanese-style punctuation
        | '\u{3040}'..='\u{309f}'   // Japanese Hiragana
        | '\u{30a0}'..='\u{30ff}'   // Japanese Katakana
        | '\u{3100}'..='\u{312f}'   // Bopomofo
        | '\u{3130}'..='\u{318f}'   // Hangul Compatibility Jamo
        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
        | '\u{a960}'..='\u{a97f}'   // Hangul Jamo Extended-A
        | '\u{ac00}'..='\u{d7a3}'   // Hangul Syllables
        | '\u{d7b0}'..='\u{d7ff}'   // Hangul Jamo Extended-B
        | '\u{f900}'..='\u{faff}'   // CJK Compatibility Ideographs
        | '\u{ff00}'..='\u{ffef}'   // Full-width roman characters and half-width katakana
    )
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue