adds support for aligned cropping with cjk

2025-07-03 11:57:07 +02:00 · 2020-03-26 11:34:50 +01:00 · 2020-03-26 11:34:50 +01:00 · 2d1d59acb7
commit 2d1d59acb7
parent 0088de9802
3 changed files with 14 additions and 7 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1081,6 +1081,7 @@ dependencies = [
 "main_error 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "meilisearch-core 0.9.0",
 "meilisearch-schema 0.9.0",
 "meilisearch-tokenizer 0.9.0",
 "mime 0.3.16 (registry+https://github.com/rust-lang/crates.io-index)",
 "pretty-bytes 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
--- a/meilisearch-http/Cargo.toml
+++ b/meilisearch-http/Cargo.toml
@ -27,6 +27,7 @@ log = "0.4.8"
 main_error = "0.1.0"
 meilisearch-core = { path = "../meilisearch-core", version = "0.9.0" }
 meilisearch-schema = { path = "../meilisearch-schema", version = "0.9.0" }
 meilisearch-tokenizer = {path = "../meilisearch-tokenizer", version = "0.9.0"}
 mime = "0.3.16"
 pretty-bytes = "0.2.2"
 rand = "0.7.2"
--- a/meilisearch-http/src/helpers/meilisearch.rs
+++ b/meilisearch-http/src/helpers/meilisearch.rs
@ -11,6 +11,7 @@ use log::error;
 use meilisearch_core::criterion::*;
 use meilisearch_core::settings::RankingRule;
 use meilisearch_core::{Highlight, Index, MainT, RankedMap};
 use meilisearch_tokenizer::is_cjk;
 use meilisearch_schema::{FieldId, Schema};
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
@ -372,19 +373,21 @@ pub struct SearchResult {
    pub query: String,
 }
 /// Returns the start index and the length of the crop.
 fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
-
+    let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c);
    if context == 0 {
        return (match_index, text.chars().skip(match_index).take_while(|c| c.is_alphanumeric()).count());
    }
    let word_end_index = |mut index| {
-        if let Some(true) = text.chars().nth(index - 1).map(|c| c.is_alphanumeric()) {
+        if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) {
-            index += text.chars().skip(index).take_while(|c| c.is_alphanumeric()).count();
+            index += text.chars().skip(index).take_while(is_word_component).count();
        }
        index
    };
    if context == 0 {
        // count needs to be at least 1 for CJK queries to return something
        return (match_index, 1 + text.chars().skip(match_index).take_while(is_word_component).count());
    }
    let start = match match_index.saturating_sub(context) {
        n if n == 0 => n,
        n => word_end_index(n)
@ -404,8 +407,10 @@ fn crop_text(
    let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
    let (start, count) = aligned_crop(text, char_index, context);
-    let text = text.chars().skip(start).take(count).collect::<String>().trim().into();
+    //TODO do something about the double allocation
    let text = text.chars().skip(start).take(count).collect::<String>().trim().to_string();
    // update matches index to match the new cropped text
    let matches = matches
        .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2))
        .map(|match_| Highlight {