mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
integrate with meilisearch tokenizer
This commit is contained in:
parent
7e1c94ab9c
commit
1ae761311e
10 changed files with 460 additions and 269 deletions
|
@ -12,7 +12,6 @@ pub mod facet;
|
|||
pub mod heed_codec;
|
||||
pub mod proximity;
|
||||
pub mod subcommand;
|
||||
pub mod tokenizer;
|
||||
pub mod update;
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
use std::str;
|
||||
use crate::tokenizer::{simple_tokenizer, TokenType};
|
||||
use meilisearch_tokenizer::{Token, TokenKind};
|
||||
|
||||
#[derive(Debug)]
|
||||
enum State {
|
||||
|
@ -18,138 +17,201 @@ impl State {
|
|||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum QueryToken<'a> {
|
||||
Free(&'a str),
|
||||
Quoted(&'a str),
|
||||
Free(Token<'a>),
|
||||
Quoted(Token<'a>),
|
||||
}
|
||||
|
||||
pub struct QueryTokens<'a> {
|
||||
state: State,
|
||||
iter: Box<dyn Iterator<Item=(TokenType, &'a str)> + 'a>,
|
||||
}
|
||||
|
||||
impl QueryTokens<'_> {
|
||||
pub fn new(query: &str) -> QueryTokens {
|
||||
QueryTokens {
|
||||
state: State::Free,
|
||||
iter: Box::new(simple_tokenizer(query)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for QueryTokens<'a> {
|
||||
type Item = QueryToken<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
pub fn query_tokens<'a>(mut tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = QueryToken<'a>> {
|
||||
let mut state = State::Free;
|
||||
let f = move || {
|
||||
loop {
|
||||
match self.iter.next()? {
|
||||
(TokenType::Other, "\"") => self.state.swap(),
|
||||
(TokenType::Word, token) => {
|
||||
let token = match self.state {
|
||||
let token = tokens.next()?;
|
||||
match token.kind() {
|
||||
_ if token.text().trim() == "\"" => state.swap(),
|
||||
TokenKind::Word => {
|
||||
let token = match state {
|
||||
State::Quoted => QueryToken::Quoted(token),
|
||||
State::Free => QueryToken::Free(token),
|
||||
};
|
||||
return Some(token);
|
||||
},
|
||||
(_, _) => (),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
std::iter::from_fn(f)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use QueryToken::{Quoted, Free};
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use fst::Set;
|
||||
|
||||
macro_rules! assert_eq_query_token {
|
||||
($test:expr, Quoted($val:literal)) => {
|
||||
match $test {
|
||||
Quoted(val) => assert_eq!(val.text(), $val),
|
||||
Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
|
||||
}
|
||||
};
|
||||
|
||||
($test:expr, Free($val:literal)) => {
|
||||
match $test {
|
||||
Quoted(val) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, val.text()),
|
||||
Free(val) => assert_eq!(val.text(), $val),
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty() {
|
||||
let mut iter = QueryTokens::new("");
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert!(iter.next().is_none());
|
||||
|
||||
let mut iter = QueryTokens::new(" ");
|
||||
assert_eq!(iter.next(), None);
|
||||
let query = " ";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_quoted_string() {
|
||||
let mut iter = QueryTokens::new("\"hello\"");
|
||||
assert_eq!(iter.next(), Some(Quoted("hello")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "\"hello\"";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_pending_quoted_string() {
|
||||
let mut iter = QueryTokens::new("\"hello");
|
||||
assert_eq!(iter.next(), Some(Quoted("hello")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "\"hello";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_non_quoted_string() {
|
||||
let mut iter = QueryTokens::new("hello");
|
||||
assert_eq!(iter.next(), Some(Free("hello")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "hello";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn quoted_directly_followed_by_free_strings() {
|
||||
let mut iter = QueryTokens::new("\"hello\"world");
|
||||
assert_eq!(iter.next(), Some(Quoted("hello")));
|
||||
assert_eq!(iter.next(), Some(Free("world")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "\"hello\"world";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("world"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn free_directly_followed_by_quoted_strings() {
|
||||
let mut iter = QueryTokens::new("hello\"world\"");
|
||||
assert_eq!(iter.next(), Some(Free("hello")));
|
||||
assert_eq!(iter.next(), Some(Quoted("world")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "hello\"world\"";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn free_followed_by_quoted_strings() {
|
||||
let mut iter = QueryTokens::new("hello \"world\"");
|
||||
assert_eq!(iter.next(), Some(Free("hello")));
|
||||
assert_eq!(iter.next(), Some(Quoted("world")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "hello \"world\"";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_spaces_separated_strings() {
|
||||
let mut iter = QueryTokens::new("hello world ");
|
||||
assert_eq!(iter.next(), Some(Free("hello")));
|
||||
assert_eq!(iter.next(), Some(Free("world")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "hello world ";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("world"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multi_interleaved_quoted_free_strings() {
|
||||
let mut iter = QueryTokens::new("hello \"world\" coucou \"monde\"");
|
||||
assert_eq!(iter.next(), Some(Free("hello")));
|
||||
assert_eq!(iter.next(), Some(Quoted("world")));
|
||||
assert_eq!(iter.next(), Some(Free("coucou")));
|
||||
assert_eq!(iter.next(), Some(Quoted("monde")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "hello \"world\" coucou \"monde\"";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("coucou"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("monde"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multi_quoted_strings() {
|
||||
let mut iter = QueryTokens::new("\"hello world\" coucou \"monde est beau\"");
|
||||
assert_eq!(iter.next(), Some(Quoted("hello")));
|
||||
assert_eq!(iter.next(), Some(Quoted("world")));
|
||||
assert_eq!(iter.next(), Some(Free("coucou")));
|
||||
assert_eq!(iter.next(), Some(Quoted("monde")));
|
||||
assert_eq!(iter.next(), Some(Quoted("est")));
|
||||
assert_eq!(iter.next(), Some(Quoted("beau")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "\"hello world\" coucou \"monde est beau\"";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("coucou"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("monde"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("est"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Quoted("beau"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chinese() {
|
||||
let mut iter = QueryTokens::new("汽车男生");
|
||||
assert_eq!(iter.next(), Some(Free("汽车")));
|
||||
assert_eq!(iter.next(), Some(Free("男生")));
|
||||
assert_eq!(iter.next(), None);
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let query = "汽车男生";
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let mut iter = query_tokens(tokens);
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("汽车"));
|
||||
assert_eq_query_token!(iter.next().unwrap(), Free("男生"));
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,10 +4,11 @@ use std::fmt;
|
|||
use std::time::Instant;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use fst::{IntoStreamer, Streamer, Set};
|
||||
use levenshtein_automata::DFA;
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
|
||||
use log::debug;
|
||||
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
|
||||
use once_cell::sync::Lazy;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::bitmap::RoaringBitmap;
|
||||
|
@ -16,7 +17,7 @@ use crate::facet::FacetType;
|
|||
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
|
||||
use crate::mdfs::Mdfs;
|
||||
use crate::query_tokens::{QueryTokens, QueryToken};
|
||||
use crate::query_tokens::{query_tokens, QueryToken};
|
||||
use crate::{Index, FieldId, DocumentId, Criterion};
|
||||
|
||||
pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator};
|
||||
|
@ -68,14 +69,19 @@ impl<'a> Search<'a> {
|
|||
fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> {
|
||||
let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2);
|
||||
|
||||
let words: Vec<_> = QueryTokens::new(query).collect();
|
||||
let stop_words = Set::default();
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
||||
let analyzed = analyzer.analyze(query);
|
||||
let tokens = analyzed.tokens();
|
||||
let words: Vec<_> = query_tokens(tokens).collect();
|
||||
|
||||
let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let number_of_words = words.len();
|
||||
|
||||
words.into_iter().enumerate().map(|(i, word)| {
|
||||
let (word, quoted) = match word {
|
||||
QueryToken::Free(word) => (word.to_lowercase(), word.len() <= 3),
|
||||
QueryToken::Quoted(word) => (word.to_lowercase(), true),
|
||||
QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3),
|
||||
QueryToken::Quoted(token) => (token.text().to_string(), true),
|
||||
};
|
||||
let is_last = i + 1 == number_of_words;
|
||||
let is_prefix = is_last && !ends_with_whitespace && !quoted;
|
||||
|
|
174
src/tokenizer.rs
174
src/tokenizer.rs
|
@ -1,174 +0,0 @@
|
|||
use std::{str, iter, mem};
|
||||
|
||||
use fst::raw::{Fst, Output};
|
||||
use once_cell::sync::Lazy;
|
||||
use slice_group_by::StrGroupBy;
|
||||
|
||||
use CharCategory::*;
|
||||
|
||||
const CHINESE_FST_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/chinese-words.fst"));
|
||||
static CHINESE_WORDS_FST: Lazy<Fst<&[u8]>> = Lazy::new(|| Fst::new(CHINESE_FST_BYTES).unwrap());
|
||||
|
||||
/// Coarse classification of a token produced by [`simple_tokenizer`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    /// A run of alphanumeric characters, or a single segmented Chinese word.
    Word,
    /// A run of whitespace characters.
    Space,
    /// A run of characters that are neither alphanumeric nor whitespace.
    Other,
}
|
||||
|
||||
pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
|
||||
text
|
||||
.linear_group_by_key(CharCategory::new)
|
||||
.flat_map(|mut string| {
|
||||
let first = string.chars().next().unwrap();
|
||||
let category = CharCategory::new(first);
|
||||
iter::from_fn(move || {
|
||||
if string.is_empty() { return None }
|
||||
match category {
|
||||
Chinese => {
|
||||
let fst = &CHINESE_WORDS_FST;
|
||||
match find_longest_prefix(fst, string.as_bytes()) {
|
||||
Some((_, l)) => {
|
||||
let s = &string[..l];
|
||||
string = &string[l..];
|
||||
Some((TokenType::Word, s))
|
||||
},
|
||||
None => {
|
||||
let first = string.chars().next().unwrap();
|
||||
let len = first.len_utf8();
|
||||
let (head, tail) = string.split_at(len);
|
||||
string = tail;
|
||||
Some((TokenType::Word, head))
|
||||
},
|
||||
}
|
||||
},
|
||||
Alphanumeric => Some((TokenType::Word, mem::take(&mut string))),
|
||||
Space => Some((TokenType::Space, mem::take(&mut string))),
|
||||
Other => Some((TokenType::Other, mem::take(&mut string))),
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> {
|
||||
if t == TokenType::Word { Some(w) } else { None }
|
||||
}
|
||||
|
||||
/// Character class used to cut the input text into homogeneous runs.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum CharCategory {
    /// Alphanumeric character accepted by `is_chinese`.
    Chinese,
    /// Any other alphanumeric character.
    Alphanumeric,
    /// Whitespace character.
    Space,
    /// Character that is neither alphanumeric nor whitespace.
    Other,
}
|
||||
|
||||
impl CharCategory {
|
||||
fn new(c: char) -> Self {
|
||||
if c.is_alphanumeric() {
|
||||
if is_chinese(c) { Chinese } else { Alphanumeric }
|
||||
} else if c.is_whitespace() { Space } else { Other }
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when the code point of `c` falls in one of the hard-coded
/// ranges below.
///
/// NOTE(review): the ranges appear to correspond to the Unicode CJK Unified
/// Ideographs block, its Extensions A through F, and U+3007 — confirm against
/// the Unicode block list before extending them.
fn is_chinese(c: char) -> bool {
    matches!(
        u32::from(c),
        0x4E00..=0x9FEF
        | 0x3400..=0x4DBF
        | 0x20000..=0x2A6DF
        | 0x2A700..=0x2B73F
        | 0x2B740..=0x2B81F
        | 0x2B820..=0x2CEAF
        | 0x2CEB0..=0x2EBEF
        | 0x3007
    )
}
|
||||
|
||||
/// Find the longest key that is prefix of the given value.
|
||||
///
|
||||
/// If the key exists, then `Some((value, key_len))` is returned, where
|
||||
/// `value` is the value associated with the key, and `key_len` is the
|
||||
/// length of the found key. Otherwise `None` is returned.
|
||||
///
|
||||
/// This can be used to e.g. build tokenizing functions.
|
||||
// Copyright @llogiq
|
||||
// https://github.com/BurntSushi/fst/pull/104
|
||||
#[inline]
|
||||
fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> {
|
||||
let mut node = fst.root();
|
||||
let mut out = Output::zero();
|
||||
let mut last_match = None;
|
||||
for (i, &b) in value.iter().enumerate() {
|
||||
if let Some(trans_index) = node.find_input(b) {
|
||||
let t = node.transition(trans_index);
|
||||
node = fst.node(t.addr);
|
||||
out = out.cat(t.out);
|
||||
if node.is_final() {
|
||||
last_match = Some((out.cat(node.final_output()).value(), i + 1));
|
||||
}
|
||||
} else {
|
||||
return last_match;
|
||||
}
|
||||
}
|
||||
last_match
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Latin text is split into word, space and punctuation runs.
    #[test]
    fn without_chinese() {
        let mut iter = simple_tokenizer("hello world!");
        assert_eq!(iter.next(), Some((TokenType::Word, "hello")));
        assert_eq!(iter.next(), Some((TokenType::Space, " ")));
        assert_eq!(iter.next(), Some((TokenType::Word, "world")));
        assert_eq!(iter.next(), Some((TokenType::Other, "!")));
        assert_eq!(iter.next(), None);
    }

    /// A Chinese run is segmented into dictionary words.
    #[test]
    fn only_chinese() {
        let mut iter = simple_tokenizer("今天的天气真好");
        assert_eq!(iter.next(), Some((TokenType::Word, "今天")));
        assert_eq!(iter.next(), Some((TokenType::Word, "的")));
        assert_eq!(iter.next(), Some((TokenType::Word, "天气")));
        assert_eq!(iter.next(), Some((TokenType::Word, "真好")));
        assert_eq!(iter.next(), None);
    }

    /// Chinese and Latin runs adjacent to each other are split at the boundary.
    #[test]
    fn mixup_chinese_with_alphabet() {
        let mut iter = simple_tokenizer("今天的天气真好Apple is good今天的天气真好");
        assert_eq!(iter.next(), Some((TokenType::Word, "今天")));
        assert_eq!(iter.next(), Some((TokenType::Word, "的")));
        assert_eq!(iter.next(), Some((TokenType::Word, "天气")));
        assert_eq!(iter.next(), Some((TokenType::Word, "真好")));
        assert_eq!(iter.next(), Some((TokenType::Word, "Apple")));
        assert_eq!(iter.next(), Some((TokenType::Space, " ")));
        assert_eq!(iter.next(), Some((TokenType::Word, "is")));
        assert_eq!(iter.next(), Some((TokenType::Space, " ")));
        assert_eq!(iter.next(), Some((TokenType::Word, "good")));
        assert_eq!(iter.next(), Some((TokenType::Word, "今天")));
        assert_eq!(iter.next(), Some((TokenType::Word, "的")));
        assert_eq!(iter.next(), Some((TokenType::Word, "天气")));
        assert_eq!(iter.next(), Some((TokenType::Word, "真好")));
        assert_eq!(iter.next(), None);
    }

    /// Characters with no dictionary match are emitted one at a time.
    #[test]
    fn unknown_chinese() {
        let mut iter = simple_tokenizer("被虾头大讚好识𠱁女仔");
        assert_eq!(iter.next(), Some((TokenType::Word, "被")));
        assert_eq!(iter.next(), Some((TokenType::Word, "虾")));
        assert_eq!(iter.next(), Some((TokenType::Word, "头")));
        assert_eq!(iter.next(), Some((TokenType::Word, "大")));
        assert_eq!(iter.next(), Some((TokenType::Word, "讚")));
        assert_eq!(iter.next(), Some((TokenType::Word, "好")));
        assert_eq!(iter.next(), Some((TokenType::Word, "识")));
        assert_eq!(iter.next(), Some((TokenType::Word, "𠱁")));
        assert_eq!(iter.next(), Some((TokenType::Word, "女")));
        assert_eq!(iter.next(), Some((TokenType::Word, "仔")));
        assert_eq!(iter.next(), None);
    }
}
|
|
@ -370,6 +370,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||
let readers = rayon::iter::repeatn(documents, num_threads)
|
||||
.enumerate()
|
||||
.map(|(i, documents)| {
|
||||
let stop_words = fst::Set::default();
|
||||
let store = Store::new(
|
||||
searchable_fields.clone(),
|
||||
faceted_fields.clone(),
|
||||
|
@ -379,6 +380,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
&stop_words,
|
||||
)?;
|
||||
store.index(
|
||||
documents,
|
||||
|
|
|
@ -16,12 +16,13 @@ use ordered_float::OrderedFloat;
|
|||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
use tempfile::tempfile;
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, TokenKind};
|
||||
use fst::Set;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
|
||||
use crate::tokenizer::{simple_tokenizer, only_token};
|
||||
use crate::update::UpdateIndexingStep;
|
||||
use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
|
||||
|
||||
|
@ -47,7 +48,7 @@ pub struct Readers {
|
|||
pub documents: Reader<FileFuse>,
|
||||
}
|
||||
|
||||
pub struct Store {
|
||||
pub struct Store<'s, A> {
|
||||
// Indexing parameters
|
||||
searchable_fields: HashSet<FieldId>,
|
||||
faceted_fields: HashMap<FieldId, FacetType>,
|
||||
|
@ -71,9 +72,11 @@ pub struct Store {
|
|||
// MTBL writers
|
||||
docid_word_positions_writer: Writer<File>,
|
||||
documents_writer: Writer<File>,
|
||||
// tokenizer
|
||||
analyzer: Analyzer<'s, A>,
|
||||
}
|
||||
|
||||
impl Store {
|
||||
impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
pub fn new(
|
||||
searchable_fields: HashSet<FieldId>,
|
||||
faceted_fields: HashMap<FieldId, FacetType>,
|
||||
|
@ -83,7 +86,8 @@ impl Store {
|
|||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
chunk_fusing_shrink_size: Option<u64>,
|
||||
) -> anyhow::Result<Store>
|
||||
stop_words: &'s Set<A>,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
// We divide the max memory by the number of sorter the Store have.
|
||||
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4));
|
||||
|
@ -137,6 +141,8 @@ impl Store {
|
|||
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||
})?;
|
||||
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
|
||||
|
||||
Ok(Store {
|
||||
// Indexing parameters.
|
||||
searchable_fields,
|
||||
|
@ -161,6 +167,8 @@ impl Store {
|
|||
// MTBL writers
|
||||
docid_word_positions_writer,
|
||||
documents_writer,
|
||||
//tokenizer
|
||||
analyzer,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -462,9 +470,13 @@ impl Store {
|
|||
None => continue,
|
||||
};
|
||||
|
||||
let tokens = simple_tokenizer(&content).filter_map(only_token);
|
||||
for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
|
||||
let word = token.to_lowercase();
|
||||
let analyzed = self.analyzer.analyze(&content);
|
||||
let tokens = analyzed
|
||||
.tokens()
|
||||
.filter(|t| t.is_word())
|
||||
.map(|t| t.text().to_string());
|
||||
|
||||
for (pos, word) in tokens.enumerate().take(MAX_POSITION) {
|
||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue