mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 08:44:27 +01:00
integration with new tokenizer wip
This commit is contained in:
parent
8a4d05b7bb
commit
5e00842087
@ -2,13 +2,3 @@ mod dfa;
|
|||||||
|
|
||||||
|
|
||||||
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||||
|
|
||||||
pub fn normalize_str(string: &str) -> String {
|
|
||||||
let mut string = string.to_lowercase();
|
|
||||||
|
|
||||||
if !string.contains(is_cjk) {
|
|
||||||
string = deunicode::deunicode_with_tofu(&string, "");
|
|
||||||
}
|
|
||||||
|
|
||||||
string
|
|
||||||
}
|
|
||||||
|
@ -7,13 +7,14 @@ use std::{cmp, fmt, iter::once};
|
|||||||
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use itertools::{EitherOrBoth, merge_join_by};
|
use itertools::{EitherOrBoth, merge_join_by};
|
||||||
use meilisearch_tokenizer::split_query_string;
|
|
||||||
use sdset::{Set, SetBuf, SetOperation};
|
|
||||||
use log::debug;
|
use log::debug;
|
||||||
|
use meilisearch_tokenizer::Token;
|
||||||
|
use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
|
||||||
|
use sdset::{Set, SetBuf, SetOperation};
|
||||||
|
|
||||||
use crate::database::MainT;
|
use crate::database::MainT;
|
||||||
use crate::{store, DocumentId, DocIndex, MResult, FstSetCow};
|
use crate::{store, DocumentId, DocIndex, MResult, FstSetCow};
|
||||||
use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
|
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||||
use crate::QueryWordsMapper;
|
use crate::QueryWordsMapper;
|
||||||
|
|
||||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
@ -146,7 +147,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
|
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
|
||||||
let words = normalize_str(&words.join(" "));
|
let words = &words.join(" ");
|
||||||
let set = ctx.synonyms.synonyms_fst(reader, words.as_bytes())?;
|
let set = ctx.synonyms.synonyms_fst(reader, words.as_bytes())?;
|
||||||
|
|
||||||
let mut strings = Vec::new();
|
let mut strings = Vec::new();
|
||||||
@ -174,15 +175,25 @@ where I: IntoIterator<Item=Operation>,
|
|||||||
|
|
||||||
const MAX_NGRAM: usize = 3;
|
const MAX_NGRAM: usize = 3;
|
||||||
|
|
||||||
|
fn split_query_string(s: &str) -> Vec<(usize, String)> {
|
||||||
|
// TODO: Use global instance instead
|
||||||
|
let analyzer = Analyzer::new(AnalyzerConfig::default());
|
||||||
|
analyzer
|
||||||
|
.analyze(s)
|
||||||
|
.tokens()
|
||||||
|
.filter(|t| !t.is_stopword())
|
||||||
|
.enumerate()
|
||||||
|
.map(|(i, Token { word, .. })| (i, word.to_string()))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn create_query_tree(
|
pub fn create_query_tree(
|
||||||
reader: &heed::RoTxn<MainT>,
|
reader: &heed::RoTxn<MainT>,
|
||||||
ctx: &Context,
|
ctx: &Context,
|
||||||
query: &str,
|
query: &str,
|
||||||
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
||||||
{
|
{
|
||||||
let words = split_query_string(query).map(str::to_lowercase);
|
let words = split_query_string(query);
|
||||||
let words = words.filter(|w| !ctx.stop_words.contains(w));
|
|
||||||
let words: Vec<_> = words.enumerate().collect();
|
|
||||||
|
|
||||||
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
||||||
|
|
||||||
|
@ -2,8 +2,9 @@ use std::borrow::Cow;
|
|||||||
use std::collections::{BTreeMap, HashMap};
|
use std::collections::{BTreeMap, HashMap};
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
|
|
||||||
use deunicode::deunicode_with_tofu;
|
|
||||||
use meilisearch_schema::IndexedPos;
|
use meilisearch_schema::IndexedPos;
|
||||||
|
use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
|
||||||
|
use meilisearch_tokenizer::Token;
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
|
|
||||||
use crate::{DocIndex, DocumentId};
|
use crate::{DocIndex, DocumentId};
|
||||||
@ -18,6 +19,7 @@ pub struct RawIndexer<A> {
|
|||||||
stop_words: fst::Set<A>,
|
stop_words: fst::Set<A>,
|
||||||
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
|
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
|
||||||
docs_words: HashMap<DocumentId, Vec<Word>>,
|
docs_words: HashMap<DocumentId, Vec<Word>>,
|
||||||
|
analyzer: Analyzer,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Indexed<'a> {
|
pub struct Indexed<'a> {
|
||||||
@ -36,6 +38,7 @@ impl<A> RawIndexer<A> {
|
|||||||
stop_words,
|
stop_words,
|
||||||
words_doc_indexes: BTreeMap::new(),
|
words_doc_indexes: BTreeMap::new(),
|
||||||
docs_words: HashMap::new(),
|
docs_words: HashMap::new(),
|
||||||
|
analyzer: Analyzer::new(AnalyzerConfig::default()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -44,9 +47,12 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
|
|||||||
pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
|
pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
|
||||||
let mut number_of_words = 0;
|
let mut number_of_words = 0;
|
||||||
|
|
||||||
for token in Tokenizer::new(text) {
|
let analyzed_text = self.analyzer.analyze(text);
|
||||||
|
for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() {
|
||||||
let must_continue = index_token(
|
let must_continue = index_token(
|
||||||
token,
|
token,
|
||||||
|
token_index,
|
||||||
|
word_pos,
|
||||||
id,
|
id,
|
||||||
indexed_pos,
|
indexed_pos,
|
||||||
self.word_limit,
|
self.word_limit,
|
||||||
@ -69,10 +75,36 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
|
|||||||
where
|
where
|
||||||
I: IntoIterator<Item = &'s str>,
|
I: IntoIterator<Item = &'s str>,
|
||||||
{
|
{
|
||||||
let iter = iter.into_iter();
|
let mut token_index_offset = 0;
|
||||||
for token in SeqTokenizer::new(iter) {
|
let mut byte_offset = 0;
|
||||||
|
let mut word_offset = 0;
|
||||||
|
|
||||||
|
for s in iter.into_iter() {
|
||||||
|
let current_token_index_offset = token_index_offset;
|
||||||
|
let current_byte_offset = byte_offset;
|
||||||
|
let current_word_offset = word_offset;
|
||||||
|
|
||||||
|
let analyzed_text = self.analyzer.analyze(s);
|
||||||
|
let tokens = analyzed_text
|
||||||
|
.tokens()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(i, mut t)| {
|
||||||
|
t.byte_start = t.byte_start + current_byte_offset;
|
||||||
|
t.byte_end = t.byte_end + current_byte_offset;
|
||||||
|
(i + current_token_index_offset, t)
|
||||||
|
})
|
||||||
|
.enumerate()
|
||||||
|
.map(|(i, t)| (i + current_word_offset, t));
|
||||||
|
|
||||||
|
for (word_pos, (token_index, token)) in tokens {
|
||||||
|
token_index_offset = token_index + 1;
|
||||||
|
word_offset = word_pos + 1;
|
||||||
|
byte_offset = token.byte_end + 1;
|
||||||
|
|
||||||
let must_continue = index_token(
|
let must_continue = index_token(
|
||||||
token,
|
token,
|
||||||
|
token_index,
|
||||||
|
word_pos,
|
||||||
id,
|
id,
|
||||||
indexed_pos,
|
indexed_pos,
|
||||||
self.word_limit,
|
self.word_limit,
|
||||||
@ -86,6 +118,7 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn build(self) -> Indexed<'static> {
|
pub fn build(self) -> Indexed<'static> {
|
||||||
let words_doc_indexes = self
|
let words_doc_indexes = self
|
||||||
@ -114,6 +147,8 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
|
|||||||
|
|
||||||
fn index_token<A>(
|
fn index_token<A>(
|
||||||
token: Token,
|
token: Token,
|
||||||
|
position: usize,
|
||||||
|
word_pos: usize,
|
||||||
id: DocumentId,
|
id: DocumentId,
|
||||||
indexed_pos: IndexedPos,
|
indexed_pos: IndexedPos,
|
||||||
word_limit: usize,
|
word_limit: usize,
|
||||||
@ -123,20 +158,14 @@ fn index_token<A>(
|
|||||||
) -> bool
|
) -> bool
|
||||||
where A: AsRef<[u8]>,
|
where A: AsRef<[u8]>,
|
||||||
{
|
{
|
||||||
if token.index >= word_limit {
|
if position >= word_limit {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
let lower = token.word.to_lowercase();
|
if !stop_words.contains(&token.word.as_ref()) {
|
||||||
let token = Token {
|
match token_to_docindex(id, indexed_pos, &token, word_pos) {
|
||||||
word: &lower,
|
|
||||||
..token
|
|
||||||
};
|
|
||||||
|
|
||||||
if !stop_words.contains(&token.word) {
|
|
||||||
match token_to_docindex(id, indexed_pos, token) {
|
|
||||||
Some(docindex) => {
|
Some(docindex) => {
|
||||||
let word = Vec::from(token.word);
|
let word = Vec::from(token.word.as_ref());
|
||||||
|
|
||||||
if word.len() <= WORD_LENGTH_LIMIT {
|
if word.len() <= WORD_LENGTH_LIMIT {
|
||||||
words_doc_indexes
|
words_doc_indexes
|
||||||
@ -144,20 +173,6 @@ where A: AsRef<[u8]>,
|
|||||||
.or_insert_with(Vec::new)
|
.or_insert_with(Vec::new)
|
||||||
.push(docindex);
|
.push(docindex);
|
||||||
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
||||||
|
|
||||||
if !lower.contains(is_cjk) {
|
|
||||||
let unidecoded = deunicode_with_tofu(&lower, "");
|
|
||||||
if unidecoded != lower && !unidecoded.is_empty() {
|
|
||||||
let word = Vec::from(unidecoded);
|
|
||||||
if word.len() <= WORD_LENGTH_LIMIT {
|
|
||||||
words_doc_indexes
|
|
||||||
.entry(word.clone())
|
|
||||||
.or_insert_with(Vec::new)
|
|
||||||
.push(docindex);
|
|
||||||
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => return false,
|
None => return false,
|
||||||
@ -167,8 +182,8 @@ where A: AsRef<[u8]>,
|
|||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option<DocIndex> {
|
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, word_index: usize) -> Option<DocIndex> {
|
||||||
let word_index = u16::try_from(token.word_index).ok()?;
|
let word_index = u16::try_from(word_index).ok()?;
|
||||||
let char_index = u16::try_from(token.char_index).ok()?;
|
let char_index = u16::try_from(token.char_index).ok()?;
|
||||||
let char_length = u16::try_from(token.word.chars().count()).ok()?;
|
let char_length = u16::try_from(token.word.chars().count()).ok()?;
|
||||||
|
|
||||||
|
@ -11,7 +11,6 @@ use meilisearch_core::criterion::*;
|
|||||||
use meilisearch_core::settings::RankingRule;
|
use meilisearch_core::settings::RankingRule;
|
||||||
use meilisearch_core::{Highlight, Index, RankedMap};
|
use meilisearch_core::{Highlight, Index, RankedMap};
|
||||||
use meilisearch_schema::{FieldId, Schema};
|
use meilisearch_schema::{FieldId, Schema};
|
||||||
use meilisearch_tokenizer::is_cjk;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use siphasher::sip::SipHasher;
|
use siphasher::sip::SipHasher;
|
||||||
@ -344,7 +343,7 @@ pub struct SearchResult {
|
|||||||
|
|
||||||
/// returns the start index and the length on the crop.
|
/// returns the start index and the length on the crop.
|
||||||
fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
|
fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
|
||||||
let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c);
|
let is_word_component = |c: &char| c.is_alphanumeric() && !super::is_cjk(*c);
|
||||||
|
|
||||||
let word_end_index = |mut index| {
|
let word_end_index = |mut index| {
|
||||||
if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) {
|
if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) {
|
||||||
|
@ -5,3 +5,22 @@ pub mod compression;
|
|||||||
|
|
||||||
pub use authentication::Authentication;
|
pub use authentication::Authentication;
|
||||||
pub use normalize_path::NormalizePath;
|
pub use normalize_path::NormalizePath;
|
||||||
|
|
||||||
|
/// Returns `true` when `c` falls inside one of the CJK-related Unicode ranges
/// listed below (Chinese, Japanese and Korean scripts plus their punctuation
/// and full-width/half-width forms), and `false` for every other character.
pub fn is_cjk(c: char) -> bool {
    matches!(
        c,
        '\u{1100}'..='\u{11ff}'   // Hangul Jamo
        | '\u{2e80}'..='\u{2eff}' // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}' // Kangxi radical
        | '\u{3000}'..='\u{303f}' // Japanese-style punctuation
        | '\u{3040}'..='\u{309f}' // Japanese Hiragana
        | '\u{30a0}'..='\u{30ff}' // Japanese Katakana
        | '\u{3100}'..='\u{312f}' // Bopomofo
        | '\u{3130}'..='\u{318f}' // Hangul Compatibility Jamo
        | '\u{3200}'..='\u{32ff}' // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}' // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}' // CJK Unified Ideographs
        | '\u{a960}'..='\u{a97f}' // Hangul Jamo Extended-A
        | '\u{ac00}'..='\u{d7a3}' // Hangul Syllables
        | '\u{d7b0}'..='\u{d7ff}' // Hangul Jamo Extended-B
        | '\u{f900}'..='\u{faff}' // CJK Compatibility Ideographs
        | '\u{ff00}'..='\u{ffef}' // Full-width roman characters and half-width katakana
    )
}
|
||||||
|
Loading…
Reference in New Issue
Block a user