From 0088de9802177f2c851d1daf5f79d1a036c2168e Mon Sep 17 00:00:00 2001 From: mposmta Date: Wed, 25 Mar 2020 19:51:22 +0100 Subject: [PATCH 1/3] adds support for aligned crop in search result --- meilisearch-http/src/helpers/meilisearch.rs | 27 +++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index 31510ec49..86e538f7e 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ b/meilisearch-http/src/helpers/meilisearch.rs @@ -372,6 +372,28 @@ pub struct SearchResult { pub query: String, } +fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) { + + if context == 0 { + return (match_index, text.chars().skip(match_index).take_while(|c| c.is_alphanumeric()).count()); + } + + let word_end_index = |mut index| { + if let Some(true) = text.chars().nth(index - 1).map(|c| c.is_alphanumeric()) { + index += text.chars().skip(index).take_while(|c| c.is_alphanumeric()).count(); + } + index + }; + + let start = match match_index.saturating_sub(context) { + n if n == 0 => n, + n => word_end_index(n) + }; + let end = word_end_index(start + 2 * context); + + (start, end - start) +} + fn crop_text( text: &str, matches: impl IntoIterator<Item = Highlight>, @@ -380,8 +402,9 @@ fn crop_text( let mut matches = matches.into_iter().peekable(); let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0); - let start = char_index.saturating_sub(context); - let text = text.chars().skip(start).take(context * 2).collect(); + let (start, count) = aligned_crop(text, char_index, context); + + let text = text.chars().skip(start).take(count).collect::<String>().trim().into(); let matches = matches .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)) From 2d1d59acb7fa97abfbd9b91bf674dce3b2a25745 Mon Sep 17 00:00:00 2001 From: mposmta Date: Thu, 26 Mar 2020 11:34:50 +0100 Subject: [PATCH 2/3] adds support for aligned 
cropping with cjk --- Cargo.lock | 1 + meilisearch-http/Cargo.toml | 1 + meilisearch-http/src/helpers/meilisearch.rs | 19 ++++++++++++------- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1d19b9b5c..17569b566 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1081,6 +1081,7 @@ dependencies = [ "main_error 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "meilisearch-core 0.9.0", "meilisearch-schema 0.9.0", + "meilisearch-tokenizer 0.9.0", "mime 0.3.16 (registry+https://github.com/rust-lang/crates.io-index)", "pretty-bytes 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/meilisearch-http/Cargo.toml b/meilisearch-http/Cargo.toml index d98499643..219d6c51d 100644 --- a/meilisearch-http/Cargo.toml +++ b/meilisearch-http/Cargo.toml @@ -27,6 +27,7 @@ log = "0.4.8" main_error = "0.1.0" meilisearch-core = { path = "../meilisearch-core", version = "0.9.0" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.9.0" } +meilisearch-tokenizer = {path = "../meilisearch-tokenizer", version = "0.9.0"} mime = "0.3.16" pretty-bytes = "0.2.2" rand = "0.7.2" diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index 86e538f7e..8c823f33c 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ b/meilisearch-http/src/helpers/meilisearch.rs @@ -11,6 +11,7 @@ use log::error; use meilisearch_core::criterion::*; use meilisearch_core::settings::RankingRule; use meilisearch_core::{Highlight, Index, MainT, RankedMap}; +use meilisearch_tokenizer::is_cjk; use meilisearch_schema::{FieldId, Schema}; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -372,19 +373,21 @@ pub struct SearchResult { pub query: String, } +/// returns the start index and the length of the crop. 
fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) { - - if context == 0 { - return (match_index, text.chars().skip(match_index).take_while(|c| c.is_alphanumeric()).count()); - } + let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c); let word_end_index = |mut index| { - if let Some(true) = text.chars().nth(index - 1).map(|c| c.is_alphanumeric()) { - index += text.chars().skip(index).take_while(|c| c.is_alphanumeric()).count(); + if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) { + index += text.chars().skip(index).take_while(is_word_component).count(); } index }; + if context == 0 { + // count needs to be at least 1 for cjk queries to return something + return (match_index, 1 + text.chars().skip(match_index).take_while(is_word_component).count()); + } let start = match match_index.saturating_sub(context) { n if n == 0 => n, n => word_end_index(n) @@ -404,8 +407,10 @@ fn crop_text( let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0); let (start, count) = aligned_crop(text, char_index, context); - let text = text.chars().skip(start).take(count).collect::<String>().trim().into(); + //TODO do something about the double allocation + let text = text.chars().skip(start).take(count).collect::<String>().trim().to_string(); + // update matches index to match the new cropped text let matches = matches .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)) .map(|match_| Highlight { From 158c2b538294a584d9fe536a3d2592066336069a Mon Sep 17 00:00:00 2001 From: mposmta Date: Thu, 26 Mar 2020 14:44:03 +0100 Subject: [PATCH 3/3] tests aligned crop --- meilisearch-http/src/helpers/meilisearch.rs | 33 +++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index 8c823f33c..8408d4904 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ 
b/meilisearch-http/src/helpers/meilisearch.rs @@ -532,6 +532,39 @@ fn calculate_highlights( mod tests { use super::*; + #[test] + fn aligned_crops() { + let text = r#"En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la Fondation."#; + + // simple test + let (start, length) = aligned_crop(&text, 6, 2); + let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string(); + assert_eq!("début", cropped); + + // first word test + let (start, length) = aligned_crop(&text, 0, 1); + let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string(); + assert_eq!("En", cropped); + // last word test + let (start, length) = aligned_crop(&text, 510, 2); + let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string(); + assert_eq!("Fondation", cropped); + + // CJK tests + let text = "this isのス foo myタイリ test"; + + // mixed charset + let (start, length) = aligned_crop(&text, 5, 3); + let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string(); + assert_eq!("isのス", cropped); + + // split regular word / CJK word, no space + let (start, length) = aligned_crop(&text, 7, 1); + let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string(); + assert_eq!("のス", cropped); + + } + #[test] fn calculate_highlights() { let data = r#"{