From 6527d3e492f0dc91966fa5a1f16853608f38a75c Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 26 Nov 2020 13:16:12 +0100 Subject: [PATCH] better separator handling --- meilisearch-core/src/query_tree.rs | 22 +++++++++++---- meilisearch-core/src/raw_indexer.rs | 42 ++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index f16f431fa..9be02d337 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once}; use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; use log::debug; -use meilisearch_tokenizer::{Token, token::SeparatorKind}; +use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind}; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use sdset::{Set, SetBuf, SetOperation}; @@ -181,10 +181,22 @@ fn split_query_string(s: &str, stop_words: HashSet) -> Vec<(usize, Strin analyzer .analyze(s) .tokens() - .scan(0, |offset, mut token| { - token.char_index += *offset; - if let Some(SeparatorKind::Hard) = token.is_separator() { - *offset += 8; + .scan((0, None), |(offset, sepcat), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + if let Some(SeparatorKind::Hard) = sepcat { + *offset += 8; + } + *sepcat = None; + token.char_index += *offset; + } + TokenKind::Separator(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Hard); + } + TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Soft); + } + _ => (), } Some(token) }) diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 510717f4d..dd47ed5f2 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -4,7 +4,7 @@ use std::convert::TryFrom; use meilisearch_schema::IndexedPos; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; -use meilisearch_tokenizer::{Token, token::SeparatorKind}; +use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind}; use sdset::SetBuf; use crate::{DocIndex, DocumentId}; @@ -45,10 +45,22 @@ impl RawIndexer { let analyzed_text = self.analyzer.analyze(text); for (word_pos, token) in analyzed_text.tokens() - .scan(0, |offset, mut token| { - token.char_index += *offset; - if let Some(SeparatorKind::Hard) = token.is_separator() { - *offset += 8; + .scan((0, None), |(offset, sepcat), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + if let Some(SeparatorKind::Hard) = sepcat { + *offset += 8; + } + *sepcat = None; + token.char_index += *offset; + } + TokenKind::Separator(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Hard); + } + TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Soft); + } + _ => (), } Some(token) }) @@ -88,10 +100,22 @@ impl RawIndexer { let analyzed_text = self.analyzer.analyze(s); let tokens = analyzed_text .tokens() - .scan(0, |offset, mut token| { - token.char_index += *offset; - if let Some(SeparatorKind::Hard) = token.is_separator() { - *offset += 8; + .scan((0, None), |(offset, sepcat), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + if let Some(SeparatorKind::Hard) = sepcat { + *offset += 8; + } + *sepcat = None; + token.char_index += *offset; + } + TokenKind::Separator(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Hard); + } + TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Soft); + } + _ => (), } Some(token) })