1091: New tokenizer r=LegendreM a=MarinPostma

Integration of the new tokenizer into MeiliSearch.

- Tokenizes and normalizes the query string for better search results
- Language-sensitive tokenization and normalization during indexing
- Better support for Chinese thanks to jieba (used when Chinese characters are detected); see the usage sketch below
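As a rough illustration, here is a minimal sketch of how a query string goes through the new analyzer, mirroring the `split_query_string` helper added to `query_tree.rs` in this diff. It assumes the `meilisearch-tokenizer` v0.1.2 API visible below (`Analyzer`, `AnalyzerConfig::default_with_stopwords`, `Token::is_word`, `Token::word`); the `tokenize_query` name itself is hypothetical and not part of this PR:

```rust
use fst::Set;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};

// Sketch of the query-side tokenization added in this PR: analyze the query,
// keep only word tokens, and collect them with their position in the query.
fn tokenize_query(query: &str, stop_words: &Set<Vec<u8>>) -> Vec<(usize, String)> {
    let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
    analyzer
        .analyze(query)
        .tokens()
        .filter(|token| token.is_word())     // drop separators and other non-word tokens
        .map(|token| token.word.to_string()) // keep the normalized form of the word
        .enumerate()                         // attach the word position
        .collect()
}

fn main() {
    let stop_words = Set::default();
    // When Chinese characters are detected, segmentation is delegated to jieba.
    for (pos, word) in tokenize_query("帶標記平衡語料庫", &stop_words) {
        println!("{}: {}", pos, word);
    }
}
```

The same analyzer is what `RawIndexer` now wraps on the indexing side, which is why the query-side `normalize_str` and the `deunicode` fallback in `index_token` are removed in this diff.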

To do in a later PR:
- Use a shared tokenizer instance
- Use tokenization for synonyms

close #624

Co-authored-by: mpostma <postma.marin@protonmail.com>
Co-authored-by: many <maxime@meilisearch.com>
bors[bot] 2021-01-06 08:47:53 +00:00 committed by GitHub
commit 08a8dc0d0d
16 changed files with 379 additions and 133 deletions

Cargo.lock (generated)

@ -299,6 +299,12 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e"
[[package]]
name = "ahash"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217"
[[package]]
name = "ahash"
version = "0.4.7"
@ -570,6 +576,15 @@ version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48"
[[package]]
name = "cedarwood"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d"
dependencies = [
"smallvec",
]
[[package]]
name = "cfg-if"
version = "0.1.10"
@ -582,6 +597,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "character_converter"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c"
dependencies = [
"bincode",
]
[[package]]
name = "chrono"
version = "0.4.19"
@ -1183,13 +1207,23 @@ version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d36fab90f82edc3c747f9d438e06cf0a491055896f2a279638bb5beed6c40177"
[[package]]
name = "hashbrown"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf"
dependencies = [
"ahash 0.3.8",
"autocfg",
]
[[package]]
name = "hashbrown"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04"
dependencies = [
"ahash",
"ahash 0.4.7",
"serde",
]
@ -1380,7 +1414,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b"
dependencies = [
"autocfg",
"hashbrown",
"hashbrown 0.9.1",
"serde",
]
@ -1474,6 +1508,21 @@ dependencies = [
"libc",
]
[[package]]
name = "jieba-rs"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a"
dependencies = [
"cedarwood",
"fxhash",
"hashbrown 0.9.1",
"lazy_static",
"phf",
"phf_codegen",
"regex",
]
[[package]]
name = "js-sys"
version = "0.3.46"
@ -1606,7 +1655,7 @@ dependencies = [
"either",
"env_logger 0.8.2",
"fst",
"hashbrown",
"hashbrown 0.9.1",
"heed",
"indexmap",
"intervaltree",
@ -1666,7 +1715,6 @@ dependencies = [
"meilisearch-core",
"meilisearch-error",
"meilisearch-schema",
"meilisearch-tokenizer",
"mime",
"once_cell",
"rand 0.8.1",
@ -1704,10 +1752,18 @@ dependencies = [
[[package]]
name = "meilisearch-tokenizer"
version = "0.17.0"
version = "0.1.1"
source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.2#8d91cd52f30aa4b651a085c15056938f7b599646"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]]
@ -1985,6 +2041,44 @@ dependencies = [
"sha-1 0.8.2",
]
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
"rand 0.7.3",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project"
version = "0.4.27"
@ -2151,6 +2245,7 @@ dependencies = [
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc 0.2.0",
"rand_pcg",
]
[[package]]
@ -2236,6 +2331,15 @@ dependencies = [
"rand_core 0.6.1",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "rand_xoshiro"
version = "0.4.0"
@ -3421,6 +3525,15 @@ dependencies = [
"webpki",
]
[[package]]
name = "whatlang"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075"
dependencies = [
"hashbrown 0.7.2",
]
[[package]]
name = "whoami"
version = "1.0.3"


@ -3,7 +3,6 @@ members = [
"meilisearch-core",
"meilisearch-http",
"meilisearch-schema",
"meilisearch-tokenizer",
"meilisearch-types",
]


@ -26,7 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
log = "0.4.11"
meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" }
meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" }
meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.17.0" }
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.2" }
meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" }
once_cell = "1.5.2"
ordered-float = { version = "2.0.1", features = ["serde"] }


@ -1,15 +1,4 @@
mod dfa;
use meilisearch_tokenizer::is_cjk;
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
pub fn normalize_str(string: &str) -> String {
let mut string = string.to_lowercase();
if !string.contains(is_cjk) {
string = deunicode::deunicode_with_tofu(&string, "");
}
string
}


@ -193,8 +193,8 @@ fn version_guard(path: &Path, create: bool) -> MResult<(u32, u32, u32)> {
Err(Error::VersionMismatch(format!("{}.{}.XX", version_major, version_minor)))
} else {
Ok((
version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
version_patch.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?
))
}
@ -212,8 +212,8 @@ fn version_guard(path: &Path, create: bool) -> MResult<(u32, u32, u32)> {
current_version_patch).as_bytes())?;
Ok((
current_version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
current_version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
current_version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
current_version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
current_version_patch.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?
))
} else {


@ -296,7 +296,6 @@ mod tests {
use sdset::SetBuf;
use tempfile::TempDir;
use crate::automaton::normalize_str;
use crate::bucket_sort::SimpleMatch;
use crate::database::{Database, DatabaseOptions};
use crate::store::Index;
@ -304,6 +303,35 @@ mod tests {
use crate::Document;
use meilisearch_schema::Schema;
fn is_cjk(c: char) -> bool {
('\u{1100}'..'\u{11ff}').contains(&c) // Hangul Jamo
|| ('\u{2e80}'..'\u{2eff}').contains(&c) // CJK Radicals Supplement
|| ('\u{2f00}'..'\u{2fdf}').contains(&c) // Kangxi radical
|| ('\u{3000}'..'\u{303f}').contains(&c) // Japanese-style punctuation
|| ('\u{3040}'..'\u{309f}').contains(&c) // Japanese Hiragana
|| ('\u{30a0}'..'\u{30ff}').contains(&c) // Japanese Katakana
|| ('\u{3100}'..'\u{312f}').contains(&c)
|| ('\u{3130}'..'\u{318F}').contains(&c) // Hangul Compatibility Jamo
|| ('\u{3200}'..'\u{32ff}').contains(&c) // Enclosed CJK Letters and Months
|| ('\u{3400}'..'\u{4dbf}').contains(&c) // CJK Unified Ideographs Extension A
|| ('\u{4e00}'..'\u{9fff}').contains(&c) // CJK Unified Ideographs
|| ('\u{a960}'..'\u{a97f}').contains(&c) // Hangul Jamo Extended-A
|| ('\u{ac00}'..'\u{d7a3}').contains(&c) // Hangul Syllables
|| ('\u{d7b0}'..'\u{d7ff}').contains(&c) // Hangul Jamo Extended-B
|| ('\u{f900}'..'\u{faff}').contains(&c) // CJK Compatibility Ideographs
|| ('\u{ff00}'..'\u{ffef}').contains(&c) // Full-width roman characters and half-width katakana
}
fn normalize_str(string: &str) -> String {
let mut string = string.to_lowercase();
if !string.contains(is_cjk) {
string = deunicode::deunicode_with_tofu(&string, "");
}
string
}
fn set_from_stream<'f, I, S>(stream: I) -> fst::Set<Vec<u8>>
where
I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>,
@ -415,6 +443,7 @@ mod tests {
let mut final_indexes = Vec::new();
for index in indexes {
let name = index.attribute.to_string();
schema.insert(&name).unwrap();
let indexed_pos = schema.insert_with_position(&name).unwrap().1;
let index = DocIndex {
attribute: indexed_pos.0,
@ -446,7 +475,7 @@ mod tests {
.postings_lists
.put_postings_list(&mut writer, &word, &postings_list)
.unwrap();
}
}
for ((docid, attr, _), count) in fields_counts {
let prev = index
@ -460,7 +489,7 @@ mod tests {
.documents_fields_counts
.put_document_field_count(&mut writer, docid, IndexedPos(attr), prev + count)
.unwrap();
}
}
writer.commit().unwrap();
@ -1268,15 +1297,15 @@ mod tests {
let builder = store.query_builder();
let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap();
let mut iter = documents.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. }));
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. }));
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone | telephone
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. }));
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);


@ -7,13 +7,13 @@ use std::{cmp, fmt, iter::once};
use fst::{IntoStreamer, Streamer};
use itertools::{EitherOrBoth, merge_join_by};
use meilisearch_tokenizer::split_query_string;
use sdset::{Set, SetBuf, SetOperation};
use log::debug;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use sdset::{Set, SetBuf, SetOperation};
use crate::database::MainT;
use crate::{store, DocumentId, DocIndex, MResult, FstSetCow};
use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::QueryWordsMapper;
#[derive(Clone, PartialEq, Eq, Hash)]
@ -146,7 +146,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'
}
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
let words = normalize_str(&words.join(" "));
let words = &words.join(" ");
let set = ctx.synonyms.synonyms_fst(reader, words.as_bytes())?;
let mut strings = Vec::new();
@ -174,15 +174,25 @@ where I: IntoIterator<Item=Operation>,
const MAX_NGRAM: usize = 3;
fn split_query_string<'a, A: AsRef<[u8]>>(s: &str, stop_words: &'a fst::Set<A>) -> Vec<(usize, String)> {
// TODO: Use global instance instead
Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words))
.analyze(s)
.tokens()
.filter(|t| t.is_word())
.map(|t| t.word.to_string())
.enumerate()
.collect()
}
pub fn create_query_tree(
reader: &heed::RoTxn<MainT>,
ctx: &Context,
query: &str,
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
{
let words = split_query_string(query).map(str::to_lowercase);
let words = words.filter(|w| !ctx.stop_words.contains(w));
let words: Vec<_> = words.enumerate().collect();
// TODO: use a shared analyzer instance
let words = split_query_string(query, &ctx.stop_words);
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));


@ -2,9 +2,9 @@ use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use deunicode::deunicode_with_tofu;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
use sdset::SetBuf;
use crate::{DocIndex, DocumentId};
@ -14,11 +14,11 @@ const WORD_LENGTH_LIMIT: usize = 80;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer<A> {
pub struct RawIndexer<'a, A> {
word_limit: usize, // the maximum number of indexed words
stop_words: fst::Set<A>,
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
analyzer: Analyzer<'a, A>,
}
pub struct Indexed<'a> {
@ -26,32 +26,35 @@ pub struct Indexed<'a> {
pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
}
impl<A> RawIndexer<A> {
pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> {
impl<'a, A> RawIndexer<'a, A>
where
A: AsRef<[u8]>
{
pub fn new(stop_words: &'a fst::Set<A>) -> RawIndexer<'a, A> {
RawIndexer::with_word_limit(stop_words, 1000)
}
pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> {
pub fn with_word_limit(stop_words: &'a fst::Set<A>, limit: usize) -> RawIndexer<A> {
RawIndexer {
word_limit: limit,
stop_words,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)),
}
}
}
impl<A: AsRef<[u8]>> RawIndexer<A> {
pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
let mut number_of_words = 0;
for token in Tokenizer::new(text) {
let analyzed_text = self.analyzer.analyze(text);
for (token_pos, (word_pos, token)) in process_tokens(analyzed_text.tokens()).enumerate() {
let must_continue = index_token(
token,
word_pos,
token_pos,
id,
indexed_pos,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
@ -66,24 +69,37 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
number_of_words
}
pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, text_iter: I)
where
I: IntoIterator<Item = &'s str>,
{
let iter = iter.into_iter();
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
indexed_pos,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
let mut word_offset = 0;
if !must_continue {
break;
for text in text_iter.into_iter() {
let current_word_offset = word_offset;
let analyzed_text = self.analyzer.analyze(text);
let tokens = process_tokens(analyzed_text.tokens())
.map(|(i, t)| (i + current_word_offset, t))
.enumerate();
for (token_pos, (word_pos, token)) in tokens {
word_offset = word_pos + 1;
let must_continue = index_token(
token,
word_pos,
token_pos,
id,
indexed_pos,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue {
break;
}
}
}
}
@ -113,31 +129,53 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
}
}
fn index_token<A>(
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens
.skip_while(|token| !token.is_word())
.scan((0, None), |(offset, prev_kind), token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
*prev_kind = Some(token.kind)
}
TokenKind::Separator(SeparatorKind::Hard) => {
*prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
*prev_kind = Some(token.kind);
}
_ => (),
}
Some((*offset, token))
})
.filter(|(_, t)| t.is_word())
}
#[allow(clippy::too_many_arguments)]
fn index_token(
token: Token,
word_pos: usize,
token_pos: usize,
id: DocumentId,
indexed_pos: IndexedPos,
word_limit: usize,
stop_words: &fst::Set<A>,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
where A: AsRef<[u8]>,
{
if token.index >= word_limit {
if token_pos >= word_limit {
return false;
}
let lower = token.word.to_lowercase();
let token = Token {
word: &lower,
..token
};
if !stop_words.contains(&token.word) {
match token_to_docindex(id, indexed_pos, token) {
if !token.is_stopword() {
match token_to_docindex(id, indexed_pos, &token, word_pos) {
Some(docindex) => {
let word = Vec::from(token.word);
let word = Vec::from(token.word.as_ref());
if word.len() <= WORD_LENGTH_LIMIT {
words_doc_indexes
@ -145,20 +183,6 @@ where A: AsRef<[u8]>,
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
if !lower.contains(is_cjk) {
let unidecoded = deunicode_with_tofu(&lower, "");
if unidecoded != lower && !unidecoded.is_empty() {
let word = Vec::from(unidecoded);
if word.len() <= WORD_LENGTH_LIMIT {
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
}
}
}
}
}
None => return false,
@ -168,10 +192,10 @@ where A: AsRef<[u8]>,
true
}
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option<DocIndex> {
let word_index = u16::try_from(token.word_index).ok()?;
let char_index = u16::try_from(token.char_index).ok()?;
let char_length = u16::try_from(token.word.chars().count()).ok()?;
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, word_index: usize) -> Option<DocIndex> {
let word_index = u16::try_from(word_index).ok()?;
let char_index = u16::try_from(token.byte_start).ok()?;
let char_length = u16::try_from(token.word.len()).ok()?;
let docindex = DocIndex {
document_id: id,
@ -188,10 +212,23 @@ fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> O
mod tests {
use super::*;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use fst::Set;
#[test]
fn test_process_token() {
let text = " 為一包含一千多萬目詞的帶標記平衡語料庫";
let stopwords = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords));
let analyzer = analyzer.analyze(text);
let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect();
assert_eq!(tokens, ["", "", "包含", "一千多万", "目词", "", "", "标记", "平衡", "语料库"]);
}
#[test]
fn strange_apostrophe() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
@ -206,14 +243,12 @@ mod tests {
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
@ -228,9 +263,6 @@ mod tests {
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
@ -238,7 +270,7 @@ mod tests {
let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
let stop_words = fst::Set::from_iter(stop_words).unwrap();
let mut indexer = RawIndexer::new(stop_words);
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
@ -255,14 +287,12 @@ mod tests {
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn no_empty_unidecode() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
@ -281,7 +311,8 @@ mod tests {
#[test]
// test sample from 807
fn very_long_text() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let indexed_pos = IndexedPos(0);
let docid = DocumentId(0);
let text = " The locations block is the most powerful, and potentially most involved, section of the .platform.app.yaml file. It allows you to control how the application container responds to incoming requests at a very fine-grained level. Common patterns also vary between language containers due to the way PHP-FPM handles incoming requests.\nEach entry of the locations block is an absolute URI path (with leading /) and its value includes the configuration directives for how the web server should handle matching requests. That is, if your domain is example.com then '/' means &ldquo;requests for example.com/&rdquo;, while '/admin' means &ldquo;requests for example.com/admin&rdquo;. If multiple blocks could match an incoming request then the most-specific will apply.\nweb:locations:&#39;/&#39;:# Rules for all requests that don&#39;t otherwise match....&#39;/sites/default/files&#39;:# Rules for any requests that begin with /sites/default/files....The simplest possible locations configuration is one that simply passes all requests on to your application unconditionally:\nweb:locations:&#39;/&#39;:passthru:trueThat is, all requests to /* should be forwarded to the process started by web.commands.start above. Note that for PHP containers the passthru key must specify what PHP file the request should be forwarded to, and must also specify a docroot under which the file lives. For example:\nweb:locations:&#39;/&#39;:root:&#39;web&#39;passthru:&#39;/app.php&#39;This block will serve requests to / from the web directory in the application, and if a file doesn&rsquo;t exist on disk then the request will be forwarded to the /app.php script.\nA full list of the possible subkeys for locations is below.\n root: The folder from which to serve static assets for this location relative to the application root. The application root is the directory in which the .platform.app.yaml file is located. Typical values for this property include public or web. Setting it to '' is not recommended, and its behavior may vary depending on the type of application. Absolute paths are not supported.\n passthru: Whether to forward disallowed and missing resources from this location to the application and can be true, false or an absolute URI path (with leading /). The default value is false. For non-PHP applications it will generally be just true or false. In a PHP application this will typically be the front controller such as /index.php or /app.php. This entry works similar to mod_rewrite under Apache. Note: If the value of passthru does not begin with the same value as the location key it is under, the passthru may evaluate to another entry. That may be useful when you want different cache settings for different paths, for instance, but want missing files in all of them to map back to the same front controller. See the example block below.\n index: The files to consider when serving a request for a directory: an array of file names or null. (typically ['index.html']). Note that in order for this to work, access to the static files named must be allowed by the allow or rules keys for this location.\n expires: How long to allow static assets from this location to be cached (this enables the Cache-Control and Expires headers) and can be a time or -1 for no caching (default). 
Times can be suffixed with &ldquo;ms&rdquo; (milliseconds), &ldquo;s&rdquo; (seconds), &ldquo;m&rdquo; (minutes), &ldquo;h&rdquo; (hours), &ldquo;d&rdquo; (days), &ldquo;w&rdquo; (weeks), &ldquo;M&rdquo; (months, 30d) or &ldquo;y&rdquo; (years, 365d).\n scripts: Whether to allow loading scripts in that location (true or false). This directive is only meaningful on PHP.\n allow: Whether to allow serving files which don&rsquo;t match a rule (true or false, default: true).\n headers: Any additional headers to apply to static assets. This section is a mapping of header names to header values. Responses from the application aren&rsquo;t affected, to avoid overlap with the application&rsquo;s own ability to include custom headers in the response.\n rules: Specific overrides for a specific location. The key is a PCRE (regular expression) that is matched against the full request path.\n request_buffering: Most application servers do not support chunked requests (e.g. fpm, uwsgi), so Platform.sh enables request_buffering by default to handle them. That default configuration would look like this if it was present in .platform.app.yaml:\nweb:locations:&#39;/&#39;:passthru:truerequest_buffering:enabled:truemax_request_size:250mIf the application server can already efficiently handle chunked requests, the request_buffering subkey can be modified to disable it entirely (enabled: false). Additionally, applications that frequently deal with uploads greater than 250MB in size can update the max_request_size key to the application&rsquo;s needs. Note that modifications to request_buffering will need to be specified at each location where it is desired.\n ";
@ -289,12 +320,13 @@ mod tests {
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&"buffering".to_owned().into_bytes()).is_some());
assert!(words_doc_indexes.get(&"request".to_owned().into_bytes()).is_some());
}
#[test]
fn words_over_index_1000_not_indexed() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let indexed_pos = IndexedPos(0);
let docid = DocumentId(0);
let mut text = String::with_capacity(5000);


@ -110,7 +110,7 @@ pub fn push_documents_addition<D: serde::Serialize>(
}
#[allow(clippy::too_many_arguments)]
fn index_document<A>(
fn index_document<A: AsRef<[u8]>>(
writer: &mut heed::RwTxn<MainT>,
documents_fields: DocumentsFields,
documents_fields_counts: DocumentsFieldsCounts,
@ -121,7 +121,6 @@ fn index_document<A>(
document_id: DocumentId,
value: &Value,
) -> MResult<()>
where A: AsRef<[u8]>,
{
let serialized = serde_json::to_vec(value)?;
documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
@ -222,7 +221,7 @@ pub fn apply_addition(
let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
let mut indexer = RawIndexer::new(stop_words);
let mut indexer = RawIndexer::new(&stop_words);
// For each document in this update
for (document_id, document) in &documents_additions {
@ -317,7 +316,7 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
.unwrap();
let number_of_inserted_documents = documents_ids_to_reindex.len();
let mut indexer = RawIndexer::new(stop_words);
let mut indexer = RawIndexer::new(&stop_words);
let mut ram_store = HashMap::new();
if let Some(ref attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
@ -373,14 +372,13 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
Ok(())
}
pub fn write_documents_addition_index<A>(
pub fn write_documents_addition_index<A: AsRef<[u8]>>(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
ranked_map: &RankedMap,
number_of_inserted_documents: usize,
indexer: RawIndexer<A>,
) -> MResult<()>
where A: AsRef<[u8]>,
{
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();


@ -12,13 +12,12 @@ use crate::serde::SerializerError;
use crate::store::DiscoverIds;
/// Returns the number of words indexed or `None` if the type is unindexable.
pub fn index_value<A>(
pub fn index_value<A: AsRef<[u8]>>(
indexer: &mut RawIndexer<A>,
document_id: DocumentId,
indexed_pos: IndexedPos,
value: &Value,
) -> Option<usize>
where A: AsRef<[u8]>,
{
match value {
Value::Null => None,


@ -35,7 +35,6 @@ main_error = "0.1.1"
meilisearch-core = { path = "../meilisearch-core", version = "0.17.0" }
meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" }
meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" }
meilisearch-tokenizer = {path = "../meilisearch-tokenizer", version = "0.17.0"}
mime = "0.3.16"
once_cell = "1.5.2"
rand = "0.8.1"


@ -11,7 +11,6 @@ use meilisearch_core::criterion::*;
use meilisearch_core::settings::RankingRule;
use meilisearch_core::{Highlight, Index, RankedMap};
use meilisearch_schema::{FieldId, Schema};
use meilisearch_tokenizer::is_cjk;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use siphasher::sip::SipHasher;
@ -344,7 +343,7 @@ pub struct SearchResult {
/// returns the start index and the length on the crop.
fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c);
let is_word_component = |c: &char| c.is_alphanumeric() && !super::is_cjk(*c);
let word_end_index = |mut index| {
if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) {
@ -480,7 +479,7 @@ fn calculate_highlights(
for (attribute, matches) in matches.iter() {
if attributes_to_highlight.contains(attribute) {
if let Some(Value::String(value)) = document.get(attribute) {
let value: Vec<_> = value.chars().collect();
let value = value;
let mut highlighted_value = String::new();
let mut index = 0;
@ -493,16 +492,16 @@ fn calculate_highlights(
let before = value.get(index..m.start);
let highlighted = value.get(m.start..(m.start + m.length));
if let (Some(before), Some(highlighted)) = (before, highlighted) {
highlighted_value.extend(before);
highlighted_value.push_str(before);
highlighted_value.push_str("<em>");
highlighted_value.extend(highlighted);
highlighted_value.push_str(highlighted);
highlighted_value.push_str("</em>");
index = m.start + m.length;
} else {
error!("value: {:?}; index: {:?}, match: {:?}", value, index, m);
}
}
highlighted_value.extend(value[index..].iter());
highlighted_value.push_str(&value[index..]);
highlight_result.insert(attribute.to_string(), Value::String(highlighted_value));
};
}
@ -598,7 +597,7 @@ mod tests {
let mut m = Vec::new();
m.push(MatchPosition {
start: 510,
start: 529,
length: 9,
});
matches.insert("description".to_string(), m);


@ -5,3 +5,22 @@ pub mod compression;
pub use authentication::Authentication;
pub use normalize_path::NormalizePath;
pub fn is_cjk(c: char) -> bool {
('\u{1100}'..'\u{11ff}').contains(&c) // Hangul Jamo
|| ('\u{2e80}'..'\u{2eff}').contains(&c) // CJK Radicals Supplement
|| ('\u{2f00}'..'\u{2fdf}').contains(&c) // Kangxi radical
|| ('\u{3000}'..'\u{303f}').contains(&c) // Japanese-style punctuation
|| ('\u{3040}'..'\u{309f}').contains(&c) // Japanese Hiragana
|| ('\u{30a0}'..'\u{30ff}').contains(&c) // Japanese Katakana
|| ('\u{3100}'..'\u{312f}').contains(&c)
|| ('\u{3130}'..'\u{318F}').contains(&c) // Hangul Compatibility Jamo
|| ('\u{3200}'..'\u{32ff}').contains(&c) // Enclosed CJK Letters and Months
|| ('\u{3400}'..'\u{4dbf}').contains(&c) // CJK Unified Ideographs Extension A
|| ('\u{4e00}'..'\u{9fff}').contains(&c) // CJK Unified Ideographs
|| ('\u{a960}'..'\u{a97f}').contains(&c) // Hangul Jamo Extended-A
|| ('\u{ac00}'..'\u{d7a3}').contains(&c) // Hangul Syllables
|| ('\u{d7b0}'..'\u{d7ff}').contains(&c) // Hangul Jamo Extended-B
|| ('\u{f900}'..'\u{faff}').contains(&c) // CJK Compatibility Ideographs
|| ('\u{ff00}'..'\u{ffef}').contains(&c) // Full-width roman characters and half-width katakana
}


@ -74,4 +74,4 @@
{"id":73,"isActive":false,"balance":"$1,239.74","picture":"http://placehold.it/32x32","age":38,"color":"blue","name":"Eleanor Shepherd","gender":"female","email":"eleanorshepherd@chorizon.com","phone":"+1 (894) 567-2617","address":"670 Lafayette Walk, Darlington, Palau, 8803","about":"Adipisicing ad incididunt id veniam magna cupidatat et labore eu deserunt mollit. Lorem voluptate exercitation elit eu aliquip cupidatat occaecat anim excepteur reprehenderit est est. Ipsum excepteur ea mollit qui nisi laboris ex qui. Cillum velit culpa culpa commodo laboris nisi Lorem non elit deserunt incididunt. Officia quis velit nulla sint incididunt duis mollit tempor adipisicing qui officia eu nisi Lorem. Do proident pariatur ex enim nostrud eu aute esse deserunt eu velit quis culpa exercitation. Occaecat ad cupidatat ullamco consequat duis anim deserunt occaecat aliqua sunt consectetur ipsum magna.\r\n","registered":"2020-02-29T12:15:28 -01:00","latitude":35.749621,"longitude":-94.40842,"tags":["good first issue","new issue","new issue","bug"]}
{"id":74,"isActive":true,"balance":"$1,180.90","picture":"http://placehold.it/32x32","age":36,"color":"Green","name":"Stark Wong","gender":"male","email":"starkwong@chorizon.com","phone":"+1 (805) 575-3055","address":"522 Bond Street, Bawcomville, Wisconsin, 324","about":"Aute qui sit incididunt eu adipisicing exercitation sunt nostrud. Id laborum incididunt proident ipsum est cillum esse. Officia ullamco eu ut Lorem do minim ea dolor consequat sit eu est voluptate. Id commodo cillum enim culpa aliquip ullamco nisi Lorem cillum ipsum cupidatat anim officia eu. Dolore sint elit labore pariatur. Officia duis nulla voluptate et nulla ut voluptate laboris eu commodo veniam qui veniam.\r\n","registered":"2020-01-25T10:47:48 -01:00","latitude":-80.452139,"longitude":160.72546,"tags":["wontfix"]}
{"id":75,"isActive":false,"balance":"$1,913.42","picture":"http://placehold.it/32x32","age":24,"color":"Green","name":"Emma Jacobs","gender":"female","email":"emmajacobs@chorizon.com","phone":"+1 (899) 554-3847","address":"173 Tapscott Street, Esmont, Maine, 7450","about":"Laboris consequat consectetur tempor labore ullamco ullamco voluptate quis quis duis ut ad. In est irure quis amet sunt nulla ad ut sit labore ut eu quis duis. Nostrud cupidatat aliqua sunt occaecat minim id consequat officia deserunt laborum. Ea dolor reprehenderit laborum veniam exercitation est nostrud excepteur laborum minim id qui et.\r\n","registered":"2019-03-29T06:24:13 -01:00","latitude":-35.53722,"longitude":155.703874,"tags":[]}
{"id":76,"isActive":false,"balance":"$1,274.29","picture":"http://placehold.it/32x32","age":25,"color":"Green","name":"Clarice Gardner","gender":"female","email":"claricegardner@chorizon.com","phone":"+1 (810) 407-3258","address":"894 Brooklyn Road, Utting, New Hampshire, 6404","about":"Elit occaecat aute ea adipisicing mollit cupidatat aliquip excepteur veniam minim. Sunt quis dolore in commodo aute esse quis. Lorem in cillum commodo eu anim commodo mollit. Adipisicing enim sunt adipisicing cupidatat adipisicing eiusmod eu do sit nisi.\r\n","registered":"2014-10-20T10:13:32 -02:00","latitude":17.11935,"longitude":65.38197,"tags":["new issue","wontfix"]}
{"id":77,"isActive":false,"balance":"$1,274.29","picture":"http://placehold.it/32x32","age":25,"color":"Red","name":"孫武","gender":"male","email":"SunTzu@chorizon.com","phone":"+1 (810) 407-3258","address":"吴國","about":"孫武前544年前470年或前496年字長卿春秋時期齊國人著名軍事家、政治家兵家代表人物。兵書《孫子兵法》的作者後人尊稱為孫子、兵聖、東方兵聖山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为樂安孫氏始祖次子孙明为富春孫氏始祖。\r\n","registered":"2014-10-20T10:13:32 -02:00","latitude":17.11935,"longitude":65.38197,"tags":["new issue","wontfix"]}


@ -1590,18 +1590,18 @@
"tags": []
},
{
"id": 76,
"id": 77,
"isActive": false,
"balance": "$1,274.29",
"picture": "http://placehold.it/32x32",
"age": 25,
"color": "Green",
"name": "Clarice Gardner",
"gender": "female",
"email": "claricegardner@chorizon.com",
"color": "Red",
"name": "孫武",
"gender": "male",
"email": "SunTzu@chorizon.com",
"phone": "+1 (810) 407-3258",
"address": "894 Brooklyn Road, Utting, New Hampshire, 6404",
"about": "Elit occaecat aute ea adipisicing mollit cupidatat aliquip excepteur veniam minim. Sunt quis dolore in commodo aute esse quis. Lorem in cillum commodo eu anim commodo mollit. Adipisicing enim sunt adipisicing cupidatat adipisicing eiusmod eu do sit nisi.\r\n",
"address": "吴國",
"about": "孫武前544年前470年或前496年字長卿春秋時期齊國人著名軍事家、政治家兵家代表人物。兵書《孫子兵法》的作者後人尊稱為孫子、兵聖、東方兵聖山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为樂安孫氏始祖次子孙明为富春孫氏始祖。\r\n",
"registered": "2014-10-20T10:13:32 -02:00",
"latitude": 17.11935,
"longitude": 65.38197,


@ -358,6 +358,66 @@ async fn search_with_attribute_to_highlight_wildcard() {
});
}
#[actix_rt::test]
async fn search_with_attribute_to_highlight_wildcard_chinese() {
let mut server = common::Server::test_server().await;
let query = json!({
"q": "子孙",
"limit": 1,
"attributesToHighlight": ["*"]
});
let expected = json!([
{
"id": 77,
"isActive": false,
"balance": "$1,274.29",
"picture": "http://placehold.it/32x32",
"age": 25,
"color": "Red",
"name": "孫武",
"gender": "male",
"email": "SunTzu@chorizon.com",
"phone": "+1 (810) 407-3258",
"address": "吴國",
"about": "孫武前544年前470年或前496年字長卿春秋時期齊國人著名軍事家、政治家兵家代表人物。兵書《孫子兵法》的作者後人尊稱為孫子、兵聖、東方兵聖山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为樂安孫氏始祖次子孙明为富春孫氏始祖。\r\n",
"registered": "2014-10-20T10:13:32 -02:00",
"latitude": 17.11935,
"longitude": 65.38197,
"tags": [
"new issue",
"wontfix"
],
"_formatted": {
"id": 77,
"isActive": false,
"balance": "$1,274.29",
"picture": "http://placehold.it/32x32",
"age": 25,
"color": "Red",
"name": "<em>孫武</em>",
"gender": "male",
"email": "SunTzu@chorizon.com",
"phone": "+1 (810) 407-3258",
"address": "吴國",
"about": "<em>孫武</em>前544年前470年或前496年字長卿春秋時期齊國人著名軍事家、政治家兵家代表人物。兵書《<em>孫子</em>兵法》的作者,後人尊稱為<em>孫子</em>、兵聖、東方兵聖,山東、蘇州等地尚有祀奉<em>孫武</em>的廟宇兵聖廟。其族人为樂安<em>孫氏</em>始祖,次<em>子孙</em>明为富春孫氏始祖。\r\n",
"registered": "2014-10-20T10:13:32 -02:00",
"latitude": 17.11935,
"longitude": 65.38197,
"tags": [
"new issue",
"wontfix"
]
}
}
]);
test_post_get_search!(server, query, |response, _status_code| {
assert_json_eq!(expected.clone(), response["hits"].clone(), ordered: false);
});
}
#[actix_rt::test]
async fn search_with_attribute_to_highlight_1() {
let mut server = common::Server::test_server().await;