mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
better separator handling
This commit is contained in:
parent
e616b1e356
commit
6527d3e492
@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once};
|
|||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use itertools::{EitherOrBoth, merge_join_by};
|
use itertools::{EitherOrBoth, merge_join_by};
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use meilisearch_tokenizer::{Token, token::SeparatorKind};
|
use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
|
||||||
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
||||||
use sdset::{Set, SetBuf, SetOperation};
|
use sdset::{Set, SetBuf, SetOperation};
|
||||||
|
|
||||||
@ -181,11 +181,23 @@ fn split_query_string(s: &str, stop_words: HashSet<String>) -> Vec<(usize, Strin
|
|||||||
analyzer
|
analyzer
|
||||||
.analyze(s)
|
.analyze(s)
|
||||||
.tokens()
|
.tokens()
|
||||||
.scan(0, |offset, mut token| {
|
.scan((0, None), |(offset, sepcat), mut token| {
|
||||||
token.char_index += *offset;
|
match token.kind {
|
||||||
if let Some(SeparatorKind::Hard) = token.is_separator() {
|
TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
|
||||||
|
if let Some(SeparatorKind::Hard) = sepcat {
|
||||||
*offset += 8;
|
*offset += 8;
|
||||||
}
|
}
|
||||||
|
*sepcat = None;
|
||||||
|
token.char_index += *offset;
|
||||||
|
}
|
||||||
|
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||||
|
*sepcat = Some(SeparatorKind::Hard);
|
||||||
|
}
|
||||||
|
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
|
||||||
|
*sepcat = Some(SeparatorKind::Soft);
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
Some(token)
|
Some(token)
|
||||||
})
|
})
|
||||||
.filter(|t| t.is_word())
|
.filter(|t| t.is_word())
|
||||||
|
@ -4,7 +4,7 @@ use std::convert::TryFrom;
|
|||||||
|
|
||||||
use meilisearch_schema::IndexedPos;
|
use meilisearch_schema::IndexedPos;
|
||||||
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
||||||
use meilisearch_tokenizer::{Token, token::SeparatorKind};
|
use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
|
|
||||||
use crate::{DocIndex, DocumentId};
|
use crate::{DocIndex, DocumentId};
|
||||||
@ -45,11 +45,23 @@ impl RawIndexer {
|
|||||||
|
|
||||||
let analyzed_text = self.analyzer.analyze(text);
|
let analyzed_text = self.analyzer.analyze(text);
|
||||||
for (word_pos, token) in analyzed_text.tokens()
|
for (word_pos, token) in analyzed_text.tokens()
|
||||||
.scan(0, |offset, mut token| {
|
.scan((0, None), |(offset, sepcat), mut token| {
|
||||||
token.char_index += *offset;
|
match token.kind {
|
||||||
if let Some(SeparatorKind::Hard) = token.is_separator() {
|
TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
|
||||||
|
if let Some(SeparatorKind::Hard) = sepcat {
|
||||||
*offset += 8;
|
*offset += 8;
|
||||||
}
|
}
|
||||||
|
*sepcat = None;
|
||||||
|
token.char_index += *offset;
|
||||||
|
}
|
||||||
|
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||||
|
*sepcat = Some(SeparatorKind::Hard);
|
||||||
|
}
|
||||||
|
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
|
||||||
|
*sepcat = Some(SeparatorKind::Soft);
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
Some(token)
|
Some(token)
|
||||||
})
|
})
|
||||||
.filter(|t| t.is_word())
|
.filter(|t| t.is_word())
|
||||||
@ -88,11 +100,23 @@ impl RawIndexer {
|
|||||||
let analyzed_text = self.analyzer.analyze(s);
|
let analyzed_text = self.analyzer.analyze(s);
|
||||||
let tokens = analyzed_text
|
let tokens = analyzed_text
|
||||||
.tokens()
|
.tokens()
|
||||||
.scan(0, |offset, mut token| {
|
.scan((0, None), |(offset, sepcat), mut token| {
|
||||||
token.char_index += *offset;
|
match token.kind {
|
||||||
if let Some(SeparatorKind::Hard) = token.is_separator() {
|
TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
|
||||||
|
if let Some(SeparatorKind::Hard) = sepcat {
|
||||||
*offset += 8;
|
*offset += 8;
|
||||||
}
|
}
|
||||||
|
*sepcat = None;
|
||||||
|
token.char_index += *offset;
|
||||||
|
}
|
||||||
|
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||||
|
*sepcat = Some(SeparatorKind::Hard);
|
||||||
|
}
|
||||||
|
TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
|
||||||
|
*sepcat = Some(SeparatorKind::Soft);
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
Some(token)
|
Some(token)
|
||||||
})
|
})
|
||||||
.filter(|t| t.is_word())
|
.filter(|t| t.is_word())
|
||||||
|
Loading…
Reference in New Issue
Block a user