better separator handling

This commit is contained in:
mpostma 2020-11-26 13:16:12 +01:00 committed by many
parent e616b1e356
commit 6527d3e492
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
2 changed files with 50 additions and 14 deletions

View File

@@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once};
use fst::{IntoStreamer, Streamer};
use itertools::{EitherOrBoth, merge_join_by};
use log::debug;
use meilisearch_tokenizer::{Token, token::SeparatorKind};
use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use sdset::{Set, SetBuf, SetOperation};
@@ -181,11 +181,23 @@ fn split_query_string(s: &str, stop_words: HashSet<String>) -> Vec<(usize, Strin
analyzer
.analyze(s)
.tokens()
.scan(0, |offset, mut token| {
token.char_index += *offset;
if let Some(SeparatorKind::Hard) = token.is_separator() {
.scan((0, None), |(offset, sepcat), mut token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
if let Some(SeparatorKind::Hard) = sepcat {
*offset += 8;
}
*sepcat = None;
token.char_index += *offset;
}
TokenKind::Separator(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Hard);
}
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Soft);
}
_ => (),
}
Some(token)
})
.filter(|t| t.is_word())

View File

@@ -4,7 +4,7 @@ use std::convert::TryFrom;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use meilisearch_tokenizer::{Token, token::SeparatorKind};
use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
use sdset::SetBuf;
use crate::{DocIndex, DocumentId};
@@ -45,11 +45,23 @@ impl RawIndexer {
let analyzed_text = self.analyzer.analyze(text);
for (word_pos, token) in analyzed_text.tokens()
.scan(0, |offset, mut token| {
token.char_index += *offset;
if let Some(SeparatorKind::Hard) = token.is_separator() {
.scan((0, None), |(offset, sepcat), mut token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
if let Some(SeparatorKind::Hard) = sepcat {
*offset += 8;
}
*sepcat = None;
token.char_index += *offset;
}
TokenKind::Separator(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Hard);
}
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Soft);
}
_ => (),
}
Some(token)
})
.filter(|t| t.is_word())
@@ -88,11 +100,23 @@ impl RawIndexer {
let analyzed_text = self.analyzer.analyze(s);
let tokens = analyzed_text
.tokens()
.scan(0, |offset, mut token| {
token.char_index += *offset;
if let Some(SeparatorKind::Hard) = token.is_separator() {
.scan((0, None), |(offset, sepcat), mut token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
if let Some(SeparatorKind::Hard) = sepcat {
*offset += 8;
}
*sepcat = None;
token.char_index += *offset;
}
TokenKind::Separator(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Hard);
}
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
*sepcat = Some(SeparatorKind::Soft);
}
_ => (),
}
Some(token)
})
.filter(|t| t.is_word())