Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-26 23:04:26 +01:00)

Commit 6527d3e492: better separator handling
Parent: e616b1e356
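In short, judging from the hunks below: in the query-string splitter and in the two `RawIndexer` tokenization paths, the per-token `scan` no longer bumps the position offset by 8 at every hard separator. The scan state becomes a pair `(offset, sepcat)`: separator tokens only record the strongest separator kind seen since the last word, and the single 8-position bump is applied when the next word-like token (`Word`, `StopWord`, or `Any`) arrives.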
@@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once};

 use fst::{IntoStreamer, Streamer};
 use itertools::{EitherOrBoth, merge_join_by};
 use log::debug;
-use meilisearch_tokenizer::{Token, token::SeparatorKind};
+use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
 use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
 use sdset::{Set, SetBuf, SetOperation};
@@ -181,11 +181,23 @@ fn split_query_string(s: &str, stop_words: HashSet<String>) -> Vec<(usize, String)> {
     analyzer
         .analyze(s)
         .tokens()
-        .scan(0, |offset, mut token| {
-            token.char_index += *offset;
-            if let Some(SeparatorKind::Hard) = token.is_separator() {
-                *offset += 8;
-            }
+        .scan((0, None), |(offset, sepcat), mut token| {
+            match token.kind {
+                TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
+                    if let Some(SeparatorKind::Hard) = sepcat {
+                        *offset += 8;
+                    }
+                    *sepcat = None;
+                    token.char_index += *offset;
+                }
+                TokenKind::Separator(SeparatorKind::Hard) => {
+                    *sepcat = Some(SeparatorKind::Hard);
+                }
+                TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
+                    *sepcat = Some(SeparatorKind::Soft);
+                }
+                _ => (),
+            }
             Some(token)
         })
         .filter(|t| t.is_word())
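To make the new `scan` state concrete, here is a self-contained sketch that runs the same state machine over a hand-built token stream. `Token`, `TokenKind`, and `SeparatorKind` are simplified stand-ins for the `meilisearch_tokenizer` types (the real crate has more token kinds and fields, and word tokens come as `Word`/`StopWord`/`Any`), so treat this as an illustration of the logic rather than the crate's API:

```rust
// Minimal sketch of the new scan state machine, using simplified
// stand-ins for meilisearch_tokenizer's Token / TokenKind / SeparatorKind.

#[derive(Clone, Copy, Debug, PartialEq)]
enum SeparatorKind {
    Hard,
    Soft,
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum TokenKind {
    Word,
    Separator(SeparatorKind),
}

#[derive(Clone, Copy, Debug)]
struct Token {
    kind: TokenKind,
    char_index: usize,
}

fn main() {
    // Roughly "hello. , .world": one separator run containing two hard separators.
    let tokens = vec![
        Token { kind: TokenKind::Word, char_index: 0 },
        Token { kind: TokenKind::Separator(SeparatorKind::Hard), char_index: 5 },
        Token { kind: TokenKind::Separator(SeparatorKind::Soft), char_index: 6 },
        Token { kind: TokenKind::Separator(SeparatorKind::Hard), char_index: 7 },
        Token { kind: TokenKind::Word, char_index: 9 },
    ];

    let words: Vec<Token> = tokens
        .into_iter()
        // State: (accumulated offset, strongest separator kind since the last word).
        .scan((0, None), |(offset, sepcat), mut token| {
            match token.kind {
                TokenKind::Word => {
                    // Pay for the separator run once, when the next word arrives.
                    if let Some(SeparatorKind::Hard) = sepcat {
                        *offset += 8;
                    }
                    *sepcat = None;
                    token.char_index += *offset;
                }
                TokenKind::Separator(SeparatorKind::Hard) => {
                    *sepcat = Some(SeparatorKind::Hard);
                }
                // A soft separator must not downgrade a pending hard one.
                TokenKind::Separator(SeparatorKind::Soft)
                    if *sepcat != Some(SeparatorKind::Hard) =>
                {
                    *sepcat = Some(SeparatorKind::Soft);
                }
                _ => (),
            }
            Some(token)
        })
        .filter(|t| matches!(t.kind, TokenKind::Word))
        .collect();

    // The second word is shifted by a single 8, even though the run
    // between the two words contained two hard separators.
    assert_eq!(words[0].char_index, 0);
    assert_eq!(words[1].char_index, 9 + 8);
    println!("{:?}", words);
}
```

With the old `.scan(0, ...)` body, each of the two hard separators in that run would have added 8, shifting the second word by 16; the new state collapses the whole run into a single bump applied when the next word is reached.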
@@ -4,7 +4,7 @@ use std::convert::TryFrom;

 use meilisearch_schema::IndexedPos;
 use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
-use meilisearch_tokenizer::{Token, token::SeparatorKind};
+use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
 use sdset::SetBuf;

 use crate::{DocIndex, DocumentId};
@@ -45,11 +45,23 @@ impl RawIndexer {

         let analyzed_text = self.analyzer.analyze(text);
         for (word_pos, token) in analyzed_text.tokens()
-            .scan(0, |offset, mut token| {
-                token.char_index += *offset;
-                if let Some(SeparatorKind::Hard) = token.is_separator() {
-                    *offset += 8;
-                }
+            .scan((0, None), |(offset, sepcat), mut token| {
+                match token.kind {
+                    TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
+                        if let Some(SeparatorKind::Hard) = sepcat {
+                            *offset += 8;
+                        }
+                        *sepcat = None;
+                        token.char_index += *offset;
+                    }
+                    TokenKind::Separator(SeparatorKind::Hard) => {
+                        *sepcat = Some(SeparatorKind::Hard);
+                    }
+                    TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
+                        *sepcat = Some(SeparatorKind::Soft);
+                    }
+                    _ => (),
+                }
                 Some(token)
             })
             .filter(|t| t.is_word())
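The identical closure now appears at three call sites: `split_query_string` above and the two `RawIndexer` methods. Purely as a hypothetical follow-up, not part of this commit, the duplicated logic could be hoisted into a shared adapter over any token iterator. The sketch below reuses the simplified types from the previous example; `shift_word_positions` and `HARD_SEPARATOR_DISTANCE` are invented names:

```rust
// Hypothetical helper (not in the commit): the duplicated `scan` closure,
// lifted out so all three call sites could share it. Assumes the simplified
// Token / TokenKind / SeparatorKind definitions from the sketch above.
const HARD_SEPARATOR_DISTANCE: usize = 8; // assumed name for the magic 8 in the diff

fn shift_word_positions(
    tokens: impl Iterator<Item = Token>,
) -> impl Iterator<Item = Token> {
    tokens
        .scan((0, None), |(offset, sepcat), mut token| {
            match token.kind {
                TokenKind::Word => {
                    if let Some(SeparatorKind::Hard) = sepcat {
                        *offset += HARD_SEPARATOR_DISTANCE;
                    }
                    *sepcat = None;
                    token.char_index += *offset;
                }
                TokenKind::Separator(SeparatorKind::Hard) => {
                    *sepcat = Some(SeparatorKind::Hard);
                }
                TokenKind::Separator(SeparatorKind::Soft)
                    if *sepcat != Some(SeparatorKind::Hard) =>
                {
                    *sepcat = Some(SeparatorKind::Soft);
                }
                _ => (),
            }
            Some(token)
        })
        .filter(|t| matches!(t.kind, TokenKind::Word))
}
```

Each call site would then read something like `shift_word_positions(analyzed_text.tokens())` instead of repeating the closure; as it stands, the three copies must be kept in sync by hand, which is visible in this very commit, where the same 11-line block is rewritten identically three times.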
@@ -88,11 +100,23 @@ impl RawIndexer {
         let analyzed_text = self.analyzer.analyze(s);
         let tokens = analyzed_text
             .tokens()
-            .scan(0, |offset, mut token| {
-                token.char_index += *offset;
-                if let Some(SeparatorKind::Hard) = token.is_separator() {
-                    *offset += 8;
-                }
+            .scan((0, None), |(offset, sepcat), mut token| {
+                match token.kind {
+                    TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
+                        if let Some(SeparatorKind::Hard) = sepcat {
+                            *offset += 8;
+                        }
+                        *sepcat = None;
+                        token.char_index += *offset;
+                    }
+                    TokenKind::Separator(SeparatorKind::Hard) => {
+                        *sepcat = Some(SeparatorKind::Hard);
+                    }
+                    TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
+                        *sepcat = Some(SeparatorKind::Soft);
+                    }
+                    _ => (),
+                }
                 Some(token)
             })
             .filter(|t| t.is_word())
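One subtlety worth spelling out: the guard on the soft arm, `*sepcat != Some(SeparatorKind::Hard)`, keeps a hard separator sticky, so a run like `". , ."` still counts as a single hard run and a trailing soft separator cannot downgrade the pending 8-position bump. Note also that in this commit `*sepcat = Some(SeparatorKind::Soft)` is recorded but never changes the offset; only runs containing a hard separator do.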