Better separator handling

This commit is contained in:
mpostma 2020-11-26 13:16:12 +01:00 committed by many
parent e616b1e356
commit 6527d3e492
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
2 changed files with 50 additions and 14 deletions

View File

@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use itertools::{EitherOrBoth, merge_join_by}; use itertools::{EitherOrBoth, merge_join_by};
use log::debug; use log::debug;
use meilisearch_tokenizer::{Token, token::SeparatorKind}; use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use sdset::{Set, SetBuf, SetOperation}; use sdset::{Set, SetBuf, SetOperation};
@ -181,10 +181,22 @@ fn split_query_string(s: &str, stop_words: HashSet<String>) -> Vec<(usize, Strin
analyzer analyzer
.analyze(s) .analyze(s)
.tokens() .tokens()
.scan(0, |offset, mut token| { .scan((0, None), |(offset, sepcat), mut token| {
token.char_index += *offset; match token.kind {
if let Some(SeparatorKind::Hard) = token.is_separator() { TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
*offset += 8; if let Some(SeparatorKind::Hard) = sepcat {
*offset += 8;
}
*sepcat = None;
token.char_index += *offset;
}
TokenKind::Separator(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Hard);
}
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Soft);
}
_ => (),
} }
Some(token) Some(token)
}) })

View File

@ -4,7 +4,7 @@ use std::convert::TryFrom;
use meilisearch_schema::IndexedPos; use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use meilisearch_tokenizer::{Token, token::SeparatorKind}; use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
use sdset::SetBuf; use sdset::SetBuf;
use crate::{DocIndex, DocumentId}; use crate::{DocIndex, DocumentId};
@ -45,10 +45,22 @@ impl RawIndexer {
let analyzed_text = self.analyzer.analyze(text); let analyzed_text = self.analyzer.analyze(text);
for (word_pos, token) in analyzed_text.tokens() for (word_pos, token) in analyzed_text.tokens()
.scan(0, |offset, mut token| { .scan((0, None), |(offset, sepcat), mut token| {
token.char_index += *offset; match token.kind {
if let Some(SeparatorKind::Hard) = token.is_separator() { TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
*offset += 8; if let Some(SeparatorKind::Hard) = sepcat {
*offset += 8;
}
*sepcat = None;
token.char_index += *offset;
}
TokenKind::Separator(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Hard);
}
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Soft);
}
_ => (),
} }
Some(token) Some(token)
}) })
@ -88,10 +100,22 @@ impl RawIndexer {
let analyzed_text = self.analyzer.analyze(s); let analyzed_text = self.analyzer.analyze(s);
let tokens = analyzed_text let tokens = analyzed_text
.tokens() .tokens()
.scan(0, |offset, mut token| { .scan((0, None), |(offset, sepcat), mut token| {
token.char_index += *offset; match token.kind {
if let Some(SeparatorKind::Hard) = token.is_separator() { TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
*offset += 8; if let Some(SeparatorKind::Hard) = sepcat {
*offset += 8;
}
*sepcat = None;
token.char_index += *offset;
}
TokenKind::Separator(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Hard);
}
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Soft);
}
_ => (),
} }
Some(token) Some(token)
}) })