From 8a4d05b7bbb033e0a8f6bd93d89f92450d357c55 Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 19 Nov 2020 16:00:14 +0100 Subject: [PATCH 01/22] remove meilisearch tokenizer --- Cargo.lock | 80 +++++++++++++++------------ Cargo.toml | 1 - meilisearch-core/Cargo.toml | 1 - meilisearch-core/src/automaton/mod.rs | 1 - meilisearch-core/src/raw_indexer.rs | 1 - meilisearch-http/Cargo.toml | 1 - 6 files changed, 44 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 77b660188..f48920953 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -210,7 +210,7 @@ dependencies = [ "rustls 0.18.1", "tokio-rustls", "webpki", - "webpki-roots 0.20.0", + "webpki-roots", ] [[package]] @@ -332,7 +332,7 @@ checksum = "d4d7d63395147b81a9e570bcc6243aaf71c017bd666d4909cfef0085bdda8d73" [[package]] name = "assert-json-diff" version = "1.0.1" -source = "git+https://github.com/qdequele/assert-json-diff#9012a0c8866d0f2db0ef9a6242e4a19d1e8c67e4" +source = "git+https://github.com/qdequele/assert-json-diff?branch=master#9012a0c8866d0f2db0ef9a6242e4a19d1e8c67e4" dependencies = [ "serde", "serde_json", @@ -383,7 +383,7 @@ dependencies = [ "actix-rt", "actix-service", "base64 0.13.0", - "bytes 0.5.6", + "bytes", "cfg-if 1.0.0", "derive_more", "futures-core", @@ -790,6 +790,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "data-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993a608597367c6377b258c25d7120740f00ed23a2252b729b1932dd7866f908" + [[package]] name = "debugid" version = "0.7.2" @@ -962,16 +968,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi 0.3.9", -] - [[package]] name = "fs_extra" version = "1.2.0" @@ -1229,9 +1225,9 @@ checksum = "b328f6260a7e51bdb0ca6b68e6ea27ee3d11fba5dee930896ee7ff6ad5fc072c" [[package]] name = "heed-types" -version = "0.7.2" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e628efb08beaee58355f80dc4adba79d644940ea9eef60175ea17dc218aab405" +checksum = "72fc61caee13e85ea330eabf0c6c7098c511ff173bcb57a760b1eda3bba9f6eb" dependencies = [ "bincode", "heed-traits", @@ -1516,9 +1512,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.81" +version = "0.2.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1482821306169ec4d07f6aca392a4681f66c75c9918aa49641a2595db64053cb" +checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" [[package]] name = "linked-hash-map" @@ -1616,7 +1612,6 @@ dependencies = [ "log", "meilisearch-error", "meilisearch-schema", - "meilisearch-tokenizer", "meilisearch-types", "once_cell", "ordered-float", @@ -1666,7 +1661,6 @@ dependencies = [ "meilisearch-core", "meilisearch-error", "meilisearch-schema", - "meilisearch-tokenizer", "mime", "once_cell", "rand 0.8.1", @@ -1702,14 +1696,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "meilisearch-tokenizer" -version = "0.17.0" -dependencies = [ - "deunicode", - "slice-group-by", -] - [[package]] name = "meilisearch-types" version = "0.17.0" @@ -2353,6 +2339,7 @@ dependencies = [ "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-bindgen-test", "web-sys", "webpki-roots 0.20.0", "winreg 0.7.0", @@ -2459,6 +2446,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scoped-tls" +version = "1.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" + [[package]] name = "scopeguard" version = "1.1.0" @@ -3388,6 +3381,30 @@ name = "web-sys" version = "0.3.46" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "222b1ef9334f92a21d3fb53dc3fd80f30836959a90f9274a626d7e06315ba3c3" +dependencies = [ + "console_error_panic_hook", + "js-sys", + "scoped-tls", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-bindgen-test-macro", +] + +[[package]] +name = "webpki" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "webpki-roots" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f20dea7535251981a9670857150d571846545088359b28e4951d350bdaf179f" dependencies = [ "js-sys", "wasm-bindgen", @@ -3403,15 +3420,6 @@ dependencies = [ "untrusted", ] -[[package]] -name = "webpki-roots" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f20dea7535251981a9670857150d571846545088359b28e4951d350bdaf179f" -dependencies = [ - "webpki", -] - [[package]] name = "webpki-roots" version = "0.21.0" diff --git a/Cargo.toml b/Cargo.toml index 9356916b6..913ab34c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,6 @@ members = [ "meilisearch-core", "meilisearch-http", "meilisearch-schema", - "meilisearch-tokenizer", "meilisearch-types", ] diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 8687c7814..dbd369000 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -26,7 +26,6 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } log = "0.4.11" meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" } -meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.17.0" } meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" } once_cell = "1.5.2" ordered-float = { version = "2.0.1", features = ["serde"] } diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index e7cb9733b..c47645041 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -1,6 +1,5 @@ mod dfa; -use meilisearch_tokenizer::is_cjk; pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 89c62a3d4..471d0cfff 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -4,7 +4,6 @@ use std::convert::TryFrom; use deunicode::deunicode_with_tofu; use meilisearch_schema::IndexedPos; -use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer}; use sdset::SetBuf; use crate::{DocIndex, DocumentId}; diff --git a/meilisearch-http/Cargo.toml b/meilisearch-http/Cargo.toml index 30dc4f7d9..6c066e393 100644 --- a/meilisearch-http/Cargo.toml +++ b/meilisearch-http/Cargo.toml @@ -35,7 +35,6 @@ main_error = "0.1.1" meilisearch-core = { path = "../meilisearch-core", version = "0.17.0" } meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" } -meilisearch-tokenizer = {path = 
"../meilisearch-tokenizer", version = "0.17.0"} mime = "0.3.16" once_cell = "1.5.2" rand = "0.8.1" From 5e008420872a83cb27b042bb7dc64279408c03a2 Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 19 Nov 2020 18:23:08 +0100 Subject: [PATCH 02/22] integration with new tokenizer wip --- meilisearch-core/src/automaton/mod.rs | 10 --- meilisearch-core/src/query_tree.rs | 25 ++++-- meilisearch-core/src/raw_indexer.rs | 97 ++++++++++++--------- meilisearch-http/src/helpers/meilisearch.rs | 3 +- meilisearch-http/src/helpers/mod.rs | 19 ++++ 5 files changed, 94 insertions(+), 60 deletions(-) diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index c47645041..27b63f25c 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -2,13 +2,3 @@ mod dfa; pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; - -pub fn normalize_str(string: &str) -> String { - let mut string = string.to_lowercase(); - - if !string.contains(is_cjk) { - string = deunicode::deunicode_with_tofu(&string, ""); - } - - string -} diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index 4a3a622b2..4b4772036 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -7,13 +7,14 @@ use std::{cmp, fmt, iter::once}; use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; -use meilisearch_tokenizer::split_query_string; -use sdset::{Set, SetBuf, SetOperation}; use log::debug; +use meilisearch_tokenizer::Token; +use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig}; +use sdset::{Set, SetBuf, SetOperation}; use crate::database::MainT; use crate::{store, DocumentId, DocIndex, MResult, FstSetCow}; -use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa}; +use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; use crate::QueryWordsMapper; #[derive(Clone, PartialEq, Eq, Hash)] @@ -146,7 +147,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn, ctx: &Context, word: &' } fn fetch_synonyms(reader: &heed::RoTxn, ctx: &Context, words: &[&str]) -> MResult>> { - let words = normalize_str(&words.join(" ")); + let words = &words.join(" "); let set = ctx.synonyms.synonyms_fst(reader, words.as_bytes())?; let mut strings = Vec::new(); @@ -174,15 +175,25 @@ where I: IntoIterator, const MAX_NGRAM: usize = 3; +fn split_query_string(s: &str) -> Vec<(usize, String)> { + // TODO: Use global instance instead + let analyzer = Analyzer::new(AnalyzerConfig::default()); + analyzer + .analyze(s) + .tokens() + .filter(|t| !t.is_stopword()) + .enumerate() + .map(|(i, Token { word, .. 
})| (i, word.to_string())) + .collect() +} + pub fn create_query_tree( reader: &heed::RoTxn, ctx: &Context, query: &str, ) -> MResult<(Operation, HashMap>)> { - let words = split_query_string(query).map(str::to_lowercase); - let words = words.filter(|w| !ctx.stop_words.contains(w)); - let words: Vec<_> = words.enumerate().collect(); + let words = split_query_string(query); let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w)); diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 471d0cfff..e234ca736 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -2,8 +2,9 @@ use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::convert::TryFrom; -use deunicode::deunicode_with_tofu; use meilisearch_schema::IndexedPos; +use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig}; +use meilisearch_tokenizer::Token; use sdset::SetBuf; use crate::{DocIndex, DocumentId}; @@ -18,6 +19,7 @@ pub struct RawIndexer { stop_words: fst::Set, words_doc_indexes: BTreeMap>, docs_words: HashMap>, + analyzer: Analyzer, } pub struct Indexed<'a> { @@ -36,6 +38,7 @@ impl RawIndexer { stop_words, words_doc_indexes: BTreeMap::new(), docs_words: HashMap::new(), + analyzer: Analyzer::new(AnalyzerConfig::default()), } } } @@ -44,9 +47,12 @@ impl> RawIndexer { pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize { let mut number_of_words = 0; - for token in Tokenizer::new(text) { + let analyzed_text = self.analyzer.analyze(text); + for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() { let must_continue = index_token( token, + token_index, + word_pos, id, indexed_pos, self.word_limit, @@ -69,20 +75,47 @@ impl> RawIndexer { where I: IntoIterator, { - let iter = iter.into_iter(); - for token in SeqTokenizer::new(iter) { - let must_continue = index_token( - token, - id, - indexed_pos, - self.word_limit, - &self.stop_words, - &mut self.words_doc_indexes, - &mut self.docs_words, - ); + let mut token_index_offset = 0; + let mut byte_offset = 0; + let mut word_offset = 0; - if !must_continue { - break; + for s in iter.into_iter() { + let current_token_index_offset = token_index_offset; + let current_byte_offset = byte_offset; + let current_word_offset = word_offset; + + let analyzed_text = self.analyzer.analyze(s); + let tokens = analyzed_text + .tokens() + .enumerate() + .map(|(i, mut t)| { + t.byte_start = t.byte_start + current_byte_offset; + t.byte_end = t.byte_end + current_byte_offset; + (i + current_token_index_offset, t) + }) + .enumerate() + .map(|(i, t)| (i + current_word_offset, t)); + + for (word_pos, (token_index, token)) in tokens { + token_index_offset = token_index + 1; + word_offset = word_pos + 1; + byte_offset = token.byte_end + 1; + + let must_continue = index_token( + token, + token_index, + word_pos, + id, + indexed_pos, + self.word_limit, + &self.stop_words, + &mut self.words_doc_indexes, + &mut self.docs_words, + ); + + if !must_continue { + break; + } } } } @@ -114,6 +147,8 @@ impl> RawIndexer { fn index_token( token: Token, + position: usize, + word_pos: usize, id: DocumentId, indexed_pos: IndexedPos, word_limit: usize, @@ -123,20 +158,14 @@ fn index_token( ) -> bool where A: AsRef<[u8]>, { - if token.index >= word_limit { + if position >= word_limit { return false; } - let lower = token.word.to_lowercase(); - let token = Token { - word: &lower, - ..token - }; - - if 
!stop_words.contains(&token.word) { - match token_to_docindex(id, indexed_pos, token) { + if !stop_words.contains(&token.word.as_ref()) { + match token_to_docindex(id, indexed_pos, &token, word_pos) { Some(docindex) => { - let word = Vec::from(token.word); + let word = Vec::from(token.word.as_ref()); if word.len() <= WORD_LENGTH_LIMIT { words_doc_indexes @@ -144,20 +173,6 @@ where A: AsRef<[u8]>, .or_insert_with(Vec::new) .push(docindex); docs_words.entry(id).or_insert_with(Vec::new).push(word); - - if !lower.contains(is_cjk) { - let unidecoded = deunicode_with_tofu(&lower, ""); - if unidecoded != lower && !unidecoded.is_empty() { - let word = Vec::from(unidecoded); - if word.len() <= WORD_LENGTH_LIMIT { - words_doc_indexes - .entry(word.clone()) - .or_insert_with(Vec::new) - .push(docindex); - docs_words.entry(id).or_insert_with(Vec::new).push(word); - } - } - } } } None => return false, @@ -167,8 +182,8 @@ where A: AsRef<[u8]>, true } -fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option { - let word_index = u16::try_from(token.word_index).ok()?; +fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, word_index: usize) -> Option { + let word_index = u16::try_from(word_index).ok()?; let char_index = u16::try_from(token.char_index).ok()?; let char_length = u16::try_from(token.word.chars().count()).ok()?; diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index 78893c47a..dd5e2c79f 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ b/meilisearch-http/src/helpers/meilisearch.rs @@ -11,7 +11,6 @@ use meilisearch_core::criterion::*; use meilisearch_core::settings::RankingRule; use meilisearch_core::{Highlight, Index, RankedMap}; use meilisearch_schema::{FieldId, Schema}; -use meilisearch_tokenizer::is_cjk; use serde::{Deserialize, Serialize}; use serde_json::Value; use siphasher::sip::SipHasher; @@ -344,7 +343,7 @@ pub struct SearchResult { /// returns the start index and the length on the crop. 
fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) { - let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c); + let is_word_component = |c: &char| c.is_alphanumeric() && !super::is_cjk(*c); let word_end_index = |mut index| { if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) { diff --git a/meilisearch-http/src/helpers/mod.rs b/meilisearch-http/src/helpers/mod.rs index 471336db9..9ba62a3a7 100644 --- a/meilisearch-http/src/helpers/mod.rs +++ b/meilisearch-http/src/helpers/mod.rs @@ -5,3 +5,22 @@ pub mod compression; pub use authentication::Authentication; pub use normalize_path::NormalizePath; + +pub fn is_cjk(c: char) -> bool { + (c >= '\u{1100}' && c <= '\u{11ff}') // Hangul Jamo + || (c >= '\u{2e80}' && c <= '\u{2eff}') // CJK Radicals Supplement + || (c >= '\u{2f00}' && c <= '\u{2fdf}') // Kangxi radical + || (c >= '\u{3000}' && c <= '\u{303f}') // Japanese-style punctuation + || (c >= '\u{3040}' && c <= '\u{309f}') // Japanese Hiragana + || (c >= '\u{30a0}' && c <= '\u{30ff}') // Japanese Katakana + || (c >= '\u{3100}' && c <= '\u{312f}') + || (c >= '\u{3130}' && c <= '\u{318F}') // Hangul Compatibility Jamo + || (c >= '\u{3200}' && c <= '\u{32ff}') // Enclosed CJK Letters and Months + || (c >= '\u{3400}' && c <= '\u{4dbf}') // CJK Unified Ideographs Extension A + || (c >= '\u{4e00}' && c <= '\u{9fff}') // CJK Unified Ideographs + || (c >= '\u{a960}' && c <= '\u{a97f}') // Hangul Jamo Extended-A + || (c >= '\u{ac00}' && c <= '\u{d7a3}') // Hangul Syllables + || (c >= '\u{d7b0}' && c <= '\u{d7ff}') // Hangul Jamo Extended-B + || (c >= '\u{f900}' && c <= '\u{faff}') // CJK Compatibility Ideographs + || (c >= '\u{ff00}' && c <= '\u{ffef}') // Full-width roman characters and half-width katakana +} From 8843062604f1e94e7e4bddad6a2b94b6984c9f3a Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 24 Nov 2020 21:43:21 +0100 Subject: [PATCH 03/22] fix indexer tests --- meilisearch-core/Cargo.toml | 1 + meilisearch-core/src/automaton/mod.rs | 2 +- meilisearch-core/src/query_builder.rs | 2261 ++++++++--------- meilisearch-core/src/query_tree.rs | 2 +- meilisearch-core/src/raw_indexer.rs | 45 +- .../src/update/documents_addition.rs | 10 +- meilisearch-core/src/update/helpers.rs | 5 +- 7 files changed, 1155 insertions(+), 1171 deletions(-) diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index dbd369000..ecfecfdc1 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -26,6 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } log = "0.4.11" meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" } +meilisearch-tokenizer = { path = "../../Tokenizer" } meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" } once_cell = "1.5.2" ordered-float = { version = "2.0.1", features = ["serde"] } diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index 27b63f25c..f31d0f0a5 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -1,4 +1,4 @@ mod dfa; - pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; + diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 21a15cc9c..f8c55189e 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -185,9 +185,7 @@ impl<'c, 'f, 'd, 'i> 
QueryBuilder<'c, 'f, 'd, 'i> { None => { match self.index.main.sorted_document_ids_cache(reader)? { // build result from cached document ids - Some(docids) => { - let mut sort_result = self.sort_result_from_docids(&docids, range); - + Some(docids) => { let mut sort_result = self.sort_result_from_docids(&docids, range); if let Some(f) = self.facet_count_docids(reader)? { sort_result.exhaustive_facets_count = Some(true); // document ids are not sorted in natural order, we need to construct a new set @@ -284,1131 +282,1132 @@ impl<'c, 'f, 'd, 'i> QueryBuilder<'c, 'f, 'd, 'i> { } } -#[cfg(test)] -mod tests { - use super::*; - - use std::collections::{BTreeSet, HashMap}; - use std::iter::FromIterator; - - use fst::IntoStreamer; - use meilisearch_schema::IndexedPos; - use sdset::SetBuf; - use tempfile::TempDir; - - use crate::automaton::normalize_str; - use crate::bucket_sort::SimpleMatch; - use crate::database::{Database, DatabaseOptions}; - use crate::store::Index; - use crate::DocIndex; - use crate::Document; - use meilisearch_schema::Schema; - - fn set_from_stream<'f, I, S>(stream: I) -> fst::Set> - where - I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>, - S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>, - { - let mut builder = fst::SetBuilder::memory(); - builder.extend_stream(stream).unwrap(); - builder.into_set() - } - - fn insert_key>(set: &fst::Set, key: &[u8]) -> fst::Set> { - let unique_key = { - let mut builder = fst::SetBuilder::memory(); - builder.insert(key).unwrap(); - builder.into_set() - }; - - let union_ = set.op().add(unique_key.into_stream()).r#union(); - - set_from_stream(union_) - } - - fn sdset_into_fstset(set: &sdset::Set<&str>) -> fst::Set> { - let mut builder = fst::SetBuilder::memory(); - let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect()); - builder.extend_iter(set.into_iter()).unwrap(); - builder.into_set() - } - - const fn doc_index(document_id: u32, word_index: u16) -> DocIndex { - DocIndex { - document_id: DocumentId(document_id), - attribute: 0, - word_index, - char_index: 0, - char_length: 0, - } - } - - const fn doc_char_index(document_id: u32, word_index: u16, char_index: u16) -> DocIndex { - DocIndex { - document_id: DocumentId(document_id), - attribute: 0, - word_index, - char_index, - char_length: 0, - } - } - - pub struct TempDatabase { - database: Database, - index: Index, - _tempdir: TempDir, - } - - impl TempDatabase { - pub fn query_builder(&self) -> QueryBuilder { - self.index.query_builder() - } - - pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) { - let db = &self.database; - let mut writer = db.main_write_txn().unwrap(); - - let word = normalize_str(word); - - let alternatives = self - .index - .synonyms - .synonyms_fst(&writer, word.as_bytes()) - .unwrap(); - - let new = sdset_into_fstset(&new); - let new_alternatives = - set_from_stream(alternatives.op().add(new.into_stream()).r#union()); - self.index - .synonyms - .put_synonyms(&mut writer, word.as_bytes(), &new_alternatives) - .unwrap(); - - let synonyms = self.index.main.synonyms_fst(&writer).unwrap(); - - let synonyms_fst = insert_key(&synonyms, word.as_bytes()); - self.index - .main - .put_synonyms_fst(&mut writer, &synonyms_fst) - .unwrap(); - - writer.commit().unwrap(); - } - } - - impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase { - fn from_iter>(iter: I) -> Self { - let tempdir = TempDir::new().unwrap(); - let database = Database::open_or_create(&tempdir, DatabaseOptions::default()).unwrap(); - let index = 
database.create_index("default").unwrap(); - - let db = &database; - let mut writer = db.main_write_txn().unwrap(); - - let mut words_fst = BTreeSet::new(); - let mut postings_lists = HashMap::new(); - let mut fields_counts = HashMap::<_, u16>::new(); - - let mut schema = Schema::with_primary_key("id"); - - for (word, indexes) in iter { - let mut final_indexes = Vec::new(); - for index in indexes { - let name = index.attribute.to_string(); - let indexed_pos = schema.insert_with_position(&name).unwrap().1; - let index = DocIndex { - attribute: indexed_pos.0, - ..*index - }; - final_indexes.push(index); - } - - let word = word.to_lowercase().into_bytes(); - words_fst.insert(word.clone()); - postings_lists - .entry(word) - .or_insert_with(Vec::new) - .extend_from_slice(&final_indexes); - for idx in final_indexes { - fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1); - } - } - - index.main.put_schema(&mut writer, &schema).unwrap(); - - let words_fst = fst::Set::from_iter(words_fst).unwrap(); - - index.main.put_words_fst(&mut writer, &words_fst).unwrap(); - - for (word, postings_list) in postings_lists { - let postings_list = SetBuf::from_dirty(postings_list); - index - .postings_lists - .put_postings_list(&mut writer, &word, &postings_list) - .unwrap(); - } - - for ((docid, attr, _), count) in fields_counts { - let prev = index - .documents_fields_counts - .document_field_count(&writer, docid, IndexedPos(attr)) - .unwrap(); - - let prev = prev.unwrap_or(0); - - index - .documents_fields_counts - .put_document_field_count(&mut writer, docid, IndexedPos(attr), prev + count) - .unwrap(); - } - - writer.commit().unwrap(); - - TempDatabase { database, index, _tempdir: tempdir } - } - } - - #[test] - fn simple() { - let store = TempDatabase::from_iter(vec![ - ("iphone", &[doc_char_index(0, 0, 0)][..]), - ("from", &[doc_char_index(0, 1, 1)][..]), - ("apple", &[doc_char_index(0, 2, 2)][..]), - ]); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("iphone from apple"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn simple_synonyms() { - let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); - - store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("hello"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. 
} = builder.query(&reader, Some("bonjour"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - } - - // #[test] - // fn prefix_synonyms() { - // let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); - - // store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); - // store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"])); - - // let db = &store.database; - // let reader = db.main_read_txn().unwrap(); - - // let builder = store.query_builder(); - // let results = builder.query(&reader, "sal", 0..20).unwrap(); - // let mut iter = documents.into_iter(); - - // assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - // let mut matches = matches.into_iter(); - // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - // assert_matches!(matches.next(), None); - // }); - // assert_matches!(iter.next(), None); - - // let builder = store.query_builder(); - // let results = builder.query(&reader, "bonj", 0..20).unwrap(); - // let mut iter = documents.into_iter(); - - // assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - // let mut matches = matches.into_iter(); - // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - // assert_matches!(matches.next(), None); - // }); - // assert_matches!(iter.next(), None); - - // let builder = store.query_builder(); - // let results = builder.query(&reader, "sal blabla", 0..20).unwrap(); - // let mut iter = documents.into_iter(); - - // assert_matches!(iter.next(), None); - - // let builder = store.query_builder(); - // let results = builder.query(&reader, "bonj blabla", 0..20).unwrap(); - // let mut iter = documents.into_iter(); - - // assert_matches!(iter.next(), None); - // } - - // #[test] - // fn levenshtein_synonyms() { - // let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); - - // store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"])); - - // let db = &store.database; - // let reader = db.main_read_txn().unwrap(); - - // let builder = store.query_builder(); - // let results = builder.query(&reader, "salutution", 0..20).unwrap(); - // let mut iter = documents.into_iter(); - - // assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - // let mut matches = matches.into_iter(); - // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - // assert_matches!(matches.next(), None); - // }); - // assert_matches!(iter.next(), None); - - // let builder = store.query_builder(); - // let results = builder.query(&reader, "saluttion", 0..20).unwrap(); - // let mut iter = documents.into_iter(); - - // assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - // let mut matches = matches.into_iter(); - // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. 
})); - // assert_matches!(matches.next(), None); - // }); - // assert_matches!(iter.next(), None); - // } - - #[test] - fn harder_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("hello", &[doc_index(0, 0)][..]), - ("bonjour", &[doc_index(1, 3)]), - ("salut", &[doc_index(2, 5)]), - ]); - - store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"])); - store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello", "salut"])); - store.add_synonym("salut", SetBuf::from_dirty(vec!["hello", "bonjour"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("hello"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("bonjour"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("salut"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. 
})); - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - /// Unique word has multi-word synonyms - fn unique_to_multiword_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), - ("subway", &[doc_char_index(0, 3, 3)][..]), - ("NY", &[doc_char_index(1, 0, 0)][..]), - ("subway", &[doc_char_index(1, 1, 1)][..]), - ]); - - store.add_synonym( - "NY", - SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), - ); - store.add_synonym( - "NYC", - SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), - ); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NY ± new - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NY ± york - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NY ± city - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NYC ± new - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. 
})); // NYC ± york - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NYC ± city - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn unique_to_multiword_synonyms_words_proximity() { - let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), - ("subway", &[doc_char_index(0, 3, 3)][..]), - ("york", &[doc_char_index(1, 0, 0)][..]), - ("new", &[doc_char_index(1, 1, 1)][..]), - ("subway", &[doc_char_index(1, 2, 2)][..]), - ("NY", &[doc_char_index(2, 0, 0)][..]), - ("subway", &[doc_char_index(2, 1, 1)][..]), - ]); - - store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("NY"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // NY ± york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // NY ± new - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // york = NY - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // new = NY - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york = NY - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new = NY - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("new york"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // york - assert_matches!(matches.next(), None); // position rewritten ^ - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. 
})); // new - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn unique_to_multiword_synonyms_cumulative_word_index() { - let mut store = TempDatabase::from_iter(vec![ - ("NY", &[doc_char_index(0, 0, 0)][..]), - ("subway", &[doc_char_index(0, 1, 1)][..]), - ("new", &[doc_char_index(1, 0, 0)][..]), - ("york", &[doc_char_index(1, 1, 1)][..]), - ("subway", &[doc_char_index(1, 2, 2)][..]), - ]); - - store.add_synonym("new york", SetBuf::from_dirty(vec!["NY"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway - assert_matches!(matches.next(), None); - }); - // assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - // let mut matches = matches.into_iter(); - // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway - // assert_matches!(matches.next(), None); - // }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = - builder.query(&reader, Some("new york subway"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // subway - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - /// Unique word has multi-word synonyms - fn harder_unique_to_multiword_synonyms_one() { - let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), - ("yellow", &[doc_char_index(0, 3, 3)][..]), - ("subway", &[doc_char_index(0, 4, 4)][..]), - ("broken", &[doc_char_index(0, 5, 5)][..]), - ("NY", &[doc_char_index(1, 0, 0)][..]), - ("blue", &[doc_char_index(1, 1, 1)][..]), - ("subway", &[doc_char_index(1, 2, 2)][..]), - ]); - - store.add_synonym( - "NY", - SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), - ); - store.add_synonym( - "NYC", - SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), - ); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC - // because one-word to one-word ^^^^ - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. 
})); // new = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ - }); - assert_matches!(iter.next(), None); - } - - #[test] - /// Unique word has multi-word synonyms - fn even_harder_unique_to_multiword_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), - ("yellow", &[doc_char_index(0, 3, 3)][..]), - ("underground", &[doc_char_index(0, 4, 4)][..]), - ("train", &[doc_char_index(0, 5, 5)][..]), - ("broken", &[doc_char_index(0, 6, 6)][..]), - ("NY", &[doc_char_index(1, 0, 0)][..]), - ("blue", &[doc_char_index(1, 1, 1)][..]), - ("subway", &[doc_char_index(1, 2, 2)][..]), - ]); - - store.add_synonym( - "NY", - SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), - ); - store.add_synonym( - "NYC", - SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), - ); - store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult {documents, .. } = builder.query(&reader, Some("NY subway broken"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken - assert_matches!(iter.next(), None); // position rewritten ^ - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. 
})); // train = subway - assert_matches!(iter.next(), None); // position rewritten ^ - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - // because one-word to one-word ^^^^ - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // subway = underground - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // subway = train - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - /// Multi-word has multi-word synonyms - fn multiword_to_multiword_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("NY", &[doc_char_index(0, 0, 0)][..]), - ("subway", &[doc_char_index(0, 1, 1)][..]), - ("NYC", &[doc_char_index(1, 0, 0)][..]), - ("blue", &[doc_char_index(1, 1, 1)][..]), - ("subway", &[doc_char_index(1, 2, 2)][..]), - ("broken", &[doc_char_index(1, 3, 3)][..]), - ("new", &[doc_char_index(2, 0, 0)][..]), - ("york", &[doc_char_index(2, 1, 1)][..]), - ("underground", &[doc_char_index(2, 2, 2)][..]), - ("train", &[doc_char_index(2, 3, 3)][..]), - ("broken", &[doc_char_index(2, 4, 4)][..]), - ]); - - store.add_synonym( - "new york", - SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]), - ); - store.add_synonym( - "new york city", - SetBuf::from_dirty(vec!["NYC", "NY", "new york"]), - ); - store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder - .query(&reader, Some("new york underground train broken"), 0..20) - .unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. 
})); // NYC = new - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder - .query(&reader, Some("new york city underground train broken"), 0..20) - .unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 2, is_exact: true, .. })); // underground - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // train - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 4, is_exact: true, .. })); // broken - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn intercrossed_multiword_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_index(0, 0)][..]), - ("york", &[doc_index(0, 1)][..]), - ("big", &[doc_index(0, 2)][..]), - ("city", &[doc_index(0, 3)][..]), - ]); - - store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"])); - store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("new york big "), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - - let mut store = TempDatabase::from_iter(vec![ - ("NY", &[doc_index(0, 0)][..]), - ("city", &[doc_index(0, 1)][..]), - ("subway", &[doc_index(0, 2)][..]), - ("NY", &[doc_index(1, 0)][..]), - ("subway", &[doc_index(1, 1)][..]), - ("NY", &[doc_index(2, 0)][..]), - ("york", &[doc_index(2, 1)][..]), - ("city", &[doc_index(2, 2)][..]), - ("subway", &[doc_index(2, 3)][..]), - ]); - - store.add_synonym("NY", SetBuf::from_dirty(vec!["new york city story"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("NY subway "), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. 
})); // new - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn cumulative_word_indices() { - let mut store = TempDatabase::from_iter(vec![ - ("NYC", &[doc_index(0, 0)][..]), - ("long", &[doc_index(0, 1)][..]), - ("subway", &[doc_index(0, 2)][..]), - ("cool", &[doc_index(0, 3)][..]), - ]); - - store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"])); - store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder - .query(&reader, Some("new york city long subway cool "), 0..20) - .unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train - assert_matches!(matches.next(), Some(SimpleMatch { query_index: 6, word_index: 6, is_exact: true, .. })); // cool - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn deunicoded_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("telephone", &[doc_index(0, 0)][..]), // meilisearch indexes the unidecoded - ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex - ("iphone", &[doc_index(1, 0)][..]), - ]); - - store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"])); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. 
})); - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("téléphone"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone | telephone - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn simple_concatenation() { - let store = TempDatabase::from_iter(vec![ - ("iphone", &[doc_index(0, 0)][..]), - ("case", &[doc_index(0, 1)][..]), - ]); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("i phone case"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone - // assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); "phone" - // but no typo on first letter ^^^^^^^ - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn exact_field_count_one_word() { - let store = TempDatabase::from_iter(vec![ - ("searchengine", &[doc_index(0, 0)][..]), - ("searchengine", &[doc_index(1, 0)][..]), - ("blue", &[doc_index(1, 1)][..]), - ("searchangine", &[doc_index(2, 0)][..]), - ("searchengine", &[doc_index(3, 0)][..]), - ]); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. 
})); // searchengine - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(3), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 1, .. })); // searchengine - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn simple_phrase_query_splitting() { - let store = TempDatabase::from_iter(vec![ - ("search", &[doc_index(0, 0)][..]), - ("engine", &[doc_index(0, 1)][..]), - ("search", &[doc_index(1, 0)][..]), - ("slow", &[doc_index(1, 1)][..]), - ("engine", &[doc_index(1, 2)][..]), - ]); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), None); - } - - #[test] - fn harder_phrase_query_splitting() { - let store = TempDatabase::from_iter(vec![ - ("search", &[doc_index(0, 0)][..]), - ("search", &[doc_index(0, 1)][..]), - ("engine", &[doc_index(0, 2)][..]), - ("search", &[doc_index(1, 0)][..]), - ("slow", &[doc_index(1, 1)][..]), - ("search", &[doc_index(1, 2)][..]), - ("engine", &[doc_index(1, 3)][..]), - ("search", &[doc_index(1, 0)][..]), - ("search", &[doc_index(1, 1)][..]), - ("slow", &[doc_index(1, 2)][..]), - ("engine", &[doc_index(1, 3)][..]), - ]); - - let db = &store.database; - let reader = db.main_read_txn().unwrap(); - - let builder = store.query_builder(); - let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); - let mut iter = documents.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search - assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 3, distance: 0, .. 
})); // engine
-            assert_matches!(iter.next(), None);
-        });
-        assert_matches!(iter.next(), None);
-    }
-}
+//#[cfg(test)]
+//mod tests {
+    //use super::*;
+
+    //use std::collections::{BTreeSet, HashMap};
+    //use std::iter::FromIterator;
+
+    //use fst::IntoStreamer;
+    //use meilisearch_schema::IndexedPos;
+    //use sdset::SetBuf;
+    //use tempfile::TempDir;
+
+    //use crate::automaton::normalize_str;
+    //use crate::bucket_sort::SimpleMatch;
+    //use crate::database::{Database, DatabaseOptions};
+    //use crate::store::Index;
+    //use crate::DocIndex;
+    //use crate::Document;
+    //use meilisearch_schema::Schema;
+
+    //fn set_from_stream<'f, I, S>(stream: I) -> fst::Set>
+    //where
+        //I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>,
+        //S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>,
+    //{
+        //let mut builder = fst::SetBuilder::memory();
+        //builder.extend_stream(stream).unwrap();
+        //builder.into_set()
+    //}
+
+    //fn insert_key>(set: &fst::Set, key: &[u8]) -> fst::Set> {
+        //let unique_key = {
+            //let mut builder = fst::SetBuilder::memory();
+            //builder.insert(key).unwrap();
+            //builder.into_set()
+        //};
+
+        //let union_ = set.op().add(unique_key.into_stream()).r#union();
+
+        //set_from_stream(union_)
+    //}
+
+    //fn sdset_into_fstset(set: &sdset::Set<&str>) -> fst::Set> {
+        //let mut builder = fst::SetBuilder::memory();
+        //let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
+        //builder.extend_iter(set.into_iter()).unwrap();
+        //builder.into_set()
+    //}
+
+    //const fn doc_index(document_id: u32, word_index: u16) -> DocIndex {
+        //DocIndex {
+            //document_id: DocumentId(document_id),
+            //attribute: 0,
+            //word_index,
+            //char_index: 0,
+            //char_length: 0,
+        //}
+    //}
+
+    //const fn doc_char_index(document_id: u32, word_index: u16, char_index: u16) -> DocIndex {
+        //DocIndex {
+            //document_id: DocumentId(document_id),
+            //attribute: 0,
+            //word_index,
+            //char_index,
+            //char_length: 0,
+        //}
+    //}
+
+    //pub struct TempDatabase {
+        //database: Database,
+        //index: Index,
+        //_tempdir: TempDir,
+    //}
+
+    //impl TempDatabase {
+        //pub fn query_builder(&self) -> QueryBuilder {
+            //self.index.query_builder()
+        //}
+
+        //pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) {
+            //let db = &self.database;
+            //let mut writer = db.main_write_txn().unwrap();
+
+            //let word = normalize_str(word);
+
+            //let alternatives = self
+                //.index
+                //.synonyms
+                //.synonyms_fst(&writer, word.as_bytes())
+                //.unwrap();
+
+            //let new = sdset_into_fstset(&new);
+            //let new_alternatives =
+                //set_from_stream(alternatives.op().add(new.into_stream()).r#union());
+            //self.index
+                //.synonyms
+                //.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives)
+                //.unwrap();
+
+            //let synonyms = self.index.main.synonyms_fst(&writer).unwrap();
+
+            //let synonyms_fst = insert_key(&synonyms, word.as_bytes());
+            //self.index
+                //.main
+                //.put_synonyms_fst(&mut writer, &synonyms_fst)
+                //.unwrap();
+
+            //writer.commit().unwrap();
+        //}
+    //}
+
+    //impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase {
+        //fn from_iter>(iter: I) -> Self {
+            //let tempdir = TempDir::new().unwrap();
+            //let database = Database::open_or_create(&tempdir, DatabaseOptions::default()).unwrap();
+            //let index = database.create_index("default").unwrap();
+
+            //let db = &database;
+            //let mut writer = db.main_write_txn().unwrap();
+
+            //let mut words_fst = BTreeSet::new();
+            //let mut postings_lists = HashMap::new();
+            //let mut fields_counts = HashMap::<_, u16>::new();
+
+            //let mut schema =
Schema::with_primary_key("id"); + + //for (word, indexes) in iter { + //let mut final_indexes = Vec::new(); + //for index in indexes { + //let name = index.attribute.to_string(); + //schema.insert(&name).unwrap(); + //let indexed_pos = schema.set_indexed(&name).unwrap().1; + //let index = DocIndex { + //attribute: indexed_pos.0, + //..*index + //}; + //final_indexes.push(index); + //} + + //let word = word.to_lowercase().into_bytes(); + //words_fst.insert(word.clone()); + //postings_lists + //.entry(word) + //.or_insert_with(Vec::new) + //.extend_from_slice(&final_indexes); + //for idx in final_indexes { + //fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1); + //} + //} + + //index.main.put_schema(&mut writer, &schema).unwrap(); + + //let words_fst = fst::Set::from_iter(words_fst).unwrap(); + + //index.main.put_words_fst(&mut writer, &words_fst).unwrap(); + + //for (word, postings_list) in postings_lists { + //let postings_list = SetBuf::from_dirty(postings_list); + //index + //.postings_lists + //.put_postings_list(&mut writer, &word, &postings_list) + //.unwrap(); + //} + + //for ((docid, attr, _), count) in fields_counts { + //let prev = index + //.documents_fields_counts + //.document_field_count(&writer, docid, IndexedPos(attr)) + //.unwrap(); + + //let prev = prev.unwrap_or(0); + + //index + //.documents_fields_counts + //.put_document_field_count(&mut writer, docid, IndexedPos(attr), prev + count) + //.unwrap(); + //} + + //writer.commit().unwrap(); + + //TempDatabase { database, index, _tempdir: tempdir } + //} + //} + + //#[test] + //fn simple() { + //let store = TempDatabase::from_iter(vec![ + //("iphone", &[doc_char_index(0, 0, 0)][..]), + //("from", &[doc_char_index(0, 1, 1)][..]), + //("apple", &[doc_char_index(0, 2, 2)][..]), + //]); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("iphone from apple"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn simple_synonyms() { + //let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); + + //store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("hello"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. 
} = builder.query(&reader, Some("bonjour"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //// #[test] + //// fn prefix_synonyms() { + //// let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); + + //// store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); + //// store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"])); + + //// let db = &store.database; + //// let reader = db.main_read_txn().unwrap(); + + //// let builder = store.query_builder(); + //// let results = builder.query(&reader, "sal", 0..20).unwrap(); + //// let mut iter = documents.into_iter(); + + //// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //// let mut matches = matches.into_iter(); + //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //// assert_matches!(matches.next(), None); + //// }); + //// assert_matches!(iter.next(), None); + + //// let builder = store.query_builder(); + //// let results = builder.query(&reader, "bonj", 0..20).unwrap(); + //// let mut iter = documents.into_iter(); + + //// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //// let mut matches = matches.into_iter(); + //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //// assert_matches!(matches.next(), None); + //// }); + //// assert_matches!(iter.next(), None); + + //// let builder = store.query_builder(); + //// let results = builder.query(&reader, "sal blabla", 0..20).unwrap(); + //// let mut iter = documents.into_iter(); + + //// assert_matches!(iter.next(), None); + + //// let builder = store.query_builder(); + //// let results = builder.query(&reader, "bonj blabla", 0..20).unwrap(); + //// let mut iter = documents.into_iter(); + + //// assert_matches!(iter.next(), None); + //// } + + //// #[test] + //// fn levenshtein_synonyms() { + //// let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); + + //// store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"])); + + //// let db = &store.database; + //// let reader = db.main_read_txn().unwrap(); + + //// let builder = store.query_builder(); + //// let results = builder.query(&reader, "salutution", 0..20).unwrap(); + //// let mut iter = documents.into_iter(); + + //// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //// let mut matches = matches.into_iter(); + //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //// assert_matches!(matches.next(), None); + //// }); + //// assert_matches!(iter.next(), None); + + //// let builder = store.query_builder(); + //// let results = builder.query(&reader, "saluttion", 0..20).unwrap(); + //// let mut iter = documents.into_iter(); + + //// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //// let mut matches = matches.into_iter(); + //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. 
})); + //// assert_matches!(matches.next(), None); + //// }); + //// assert_matches!(iter.next(), None); + //// } + + //#[test] + //fn harder_synonyms() { + //let mut store = TempDatabase::from_iter(vec![ + //("hello", &[doc_index(0, 0)][..]), + //("bonjour", &[doc_index(1, 3)]), + //("salut", &[doc_index(2, 5)]), + //]); + + //store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"])); + //store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello", "salut"])); + //store.add_synonym("salut", SetBuf::from_dirty(vec!["hello", "bonjour"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("hello"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("bonjour"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("salut"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. 
}) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + ///// Unique word has multi-word synonyms + //fn unique_to_multiword_synonyms() { + //let mut store = TempDatabase::from_iter(vec![ + //("new", &[doc_char_index(0, 0, 0)][..]), + //("york", &[doc_char_index(0, 1, 1)][..]), + //("city", &[doc_char_index(0, 2, 2)][..]), + //("subway", &[doc_char_index(0, 3, 3)][..]), + //("NY", &[doc_char_index(1, 0, 0)][..]), + //("subway", &[doc_char_index(1, 1, 1)][..]), + //]); + + //store.add_synonym( + //"NY", + //SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + //); + //store.add_synonym( + //"NYC", + //SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + //); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + //assert_matches!(iter.next(), None); // position rewritten ^ + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NY ± new + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NY ± york + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NY ± city + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + //assert_matches!(iter.next(), None); // position rewritten ^ + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NYC ± new + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NYC ± york + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NYC ± city + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn unique_to_multiword_synonyms_words_proximity() { + //let mut store = TempDatabase::from_iter(vec![ + //("new", &[doc_char_index(0, 0, 0)][..]), + //("york", &[doc_char_index(0, 1, 1)][..]), + //("city", &[doc_char_index(0, 2, 2)][..]), + //("subway", &[doc_char_index(0, 3, 3)][..]), + //("york", &[doc_char_index(1, 0, 0)][..]), + //("new", &[doc_char_index(1, 1, 1)][..]), + //("subway", &[doc_char_index(1, 2, 2)][..]), + //("NY", &[doc_char_index(2, 0, 0)][..]), + //("subway", &[doc_char_index(2, 1, 1)][..]), + //]); + + //store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("NY"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // NY ± york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // NY ± new + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // york = NY + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // new = NY + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york = NY + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new = NY + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("new york"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // york + //assert_matches!(matches.next(), None); // position rewritten ^ + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. 
})); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn unique_to_multiword_synonyms_cumulative_word_index() { + //let mut store = TempDatabase::from_iter(vec![ + //("NY", &[doc_char_index(0, 0, 0)][..]), + //("subway", &[doc_char_index(0, 1, 1)][..]), + //("new", &[doc_char_index(1, 0, 0)][..]), + //("york", &[doc_char_index(1, 1, 1)][..]), + //("subway", &[doc_char_index(1, 2, 2)][..]), + //]); + + //store.add_synonym("new york", SetBuf::from_dirty(vec!["NY"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway + //assert_matches!(matches.next(), None); + //}); + //// assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //// let mut matches = matches.into_iter(); + //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway + //// assert_matches!(matches.next(), None); + //// }); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = + //builder.query(&reader, Some("new york subway"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // subway + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + ///// Unique word has multi-word synonyms + //fn harder_unique_to_multiword_synonyms_one() { + //let mut store = TempDatabase::from_iter(vec![ + //("new", &[doc_char_index(0, 0, 0)][..]), + //("york", &[doc_char_index(0, 1, 1)][..]), + //("city", &[doc_char_index(0, 2, 2)][..]), + //("yellow", &[doc_char_index(0, 3, 3)][..]), + //("subway", &[doc_char_index(0, 4, 4)][..]), + //("broken", &[doc_char_index(0, 5, 5)][..]), + //("NY", &[doc_char_index(1, 0, 0)][..]), + //("blue", &[doc_char_index(1, 1, 1)][..]), + //("subway", &[doc_char_index(1, 2, 2)][..]), + //]); + + //store.add_synonym( + //"NY", + //SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + //); + //store.add_synonym( + //"NYC", + //SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + //); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + //assert_matches!(iter.next(), None); // position rewritten ^ + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + //assert_matches!(iter.next(), None); // position rewritten ^ + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC + //// because one-word to one-word ^^^^ + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + //assert_matches!(iter.next(), None); // position rewritten ^ + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + ///// Unique word has multi-word synonyms + //fn even_harder_unique_to_multiword_synonyms() { + //let mut store = TempDatabase::from_iter(vec![ + //("new", &[doc_char_index(0, 0, 0)][..]), + //("york", &[doc_char_index(0, 1, 1)][..]), + //("city", &[doc_char_index(0, 2, 2)][..]), + //("yellow", &[doc_char_index(0, 3, 3)][..]), + //("underground", &[doc_char_index(0, 4, 4)][..]), + //("train", &[doc_char_index(0, 5, 5)][..]), + //("broken", &[doc_char_index(0, 6, 6)][..]), + //("NY", &[doc_char_index(1, 0, 0)][..]), + //("blue", &[doc_char_index(1, 1, 1)][..]), + //("subway", &[doc_char_index(1, 2, 2)][..]), + //]); + + //store.add_synonym( + //"NY", + //SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + //); + //store.add_synonym( + //"NYC", + //SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + //); + //store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult {documents, .. } = builder.query(&reader, Some("NY subway broken"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + //assert_matches!(iter.next(), None); // position rewritten ^ + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // city = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway + //assert_matches!(iter.next(), None); // position rewritten ^ + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + //// because one-word to one-word ^^^^ + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // subway = underground + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // subway = train + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + ///// Multi-word has multi-word synonyms + //fn multiword_to_multiword_synonyms() { + //let mut store = TempDatabase::from_iter(vec![ + //("NY", &[doc_char_index(0, 0, 0)][..]), + //("subway", &[doc_char_index(0, 1, 1)][..]), + //("NYC", &[doc_char_index(1, 0, 0)][..]), + //("blue", &[doc_char_index(1, 1, 1)][..]), + //("subway", &[doc_char_index(1, 2, 2)][..]), + //("broken", &[doc_char_index(1, 3, 3)][..]), + //("new", &[doc_char_index(2, 0, 0)][..]), + //("york", &[doc_char_index(2, 1, 1)][..]), + //("underground", &[doc_char_index(2, 2, 2)][..]), + //("train", &[doc_char_index(2, 3, 3)][..]), + //("broken", &[doc_char_index(2, 4, 4)][..]), + //]); + + //store.add_synonym( + //"new york", + //SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]), + //); + //store.add_synonym( + //"new york city", + //SetBuf::from_dirty(vec!["NYC", "NY", "new york"]), + //); + //store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder + //.query(&reader, Some("new york underground train broken"), 0..20) + //.unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. 
})); // train + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder + //.query(&reader, Some("new york city underground train broken"), 0..20) + //.unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 2, is_exact: true, .. })); // underground + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // train + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 4, is_exact: true, .. })); // broken + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. 
})); // broken + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn intercrossed_multiword_synonyms() { + //let mut store = TempDatabase::from_iter(vec![ + //("new", &[doc_index(0, 0)][..]), + //("york", &[doc_index(0, 1)][..]), + //("big", &[doc_index(0, 2)][..]), + //("city", &[doc_index(0, 3)][..]), + //]); + + //store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"])); + //store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("new york big "), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let mut store = TempDatabase::from_iter(vec![ + //("NY", &[doc_index(0, 0)][..]), + //("city", &[doc_index(0, 1)][..]), + //("subway", &[doc_index(0, 2)][..]), + //("NY", &[doc_index(1, 0)][..]), + //("subway", &[doc_index(1, 1)][..]), + //("NY", &[doc_index(2, 0)][..]), + //("york", &[doc_index(2, 1)][..]), + //("city", &[doc_index(2, 2)][..]), + //("subway", &[doc_index(2, 3)][..]), + //]); + + //store.add_synonym("NY", SetBuf::from_dirty(vec!["new york city story"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("NY subway "), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. 
}) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn cumulative_word_indices() { + //let mut store = TempDatabase::from_iter(vec![ + //("NYC", &[doc_index(0, 0)][..]), + //("long", &[doc_index(0, 1)][..]), + //("subway", &[doc_index(0, 2)][..]), + //("cool", &[doc_index(0, 3)][..]), + //]); + + //store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"])); + //store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder + //.query(&reader, Some("new york city long subway cool "), 0..20) + //.unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut matches = matches.into_iter(); + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train + //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 6, word_index: 6, is_exact: true, .. 
})); // cool + //assert_matches!(matches.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn deunicoded_synonyms() { + //let mut store = TempDatabase::from_iter(vec![ + //("telephone", &[doc_index(0, 0)][..]), // meilisearch indexes the unidecoded + //("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex + //("iphone", &[doc_index(1, 0)][..]), + //]); + + //store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"])); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("téléphone"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone | telephone + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn simple_concatenation() { + //let store = TempDatabase::from_iter(vec![ + //("iphone", &[doc_index(0, 0)][..]), + //("case", &[doc_index(0, 1)][..]), + //]); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("i phone case"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone + //// assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); "phone" + //// but no typo on first letter ^^^^^^^ + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn exact_field_count_one_word() { + //let store = TempDatabase::from_iter(vec![ + //("searchengine", &[doc_index(0, 0)][..]), + //("searchengine", &[doc_index(1, 0)][..]), + //("blue", &[doc_index(1, 1)][..]), + //("searchangine", &[doc_index(2, 0)][..]), + //("searchengine", &[doc_index(3, 0)][..]), + //]); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(3), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 1, .. })); // searchengine + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn simple_phrase_query_splitting() { + //let store = TempDatabase::from_iter(vec![ + //("search", &[doc_index(0, 0)][..]), + //("engine", &[doc_index(0, 1)][..]), + //("search", &[doc_index(1, 0)][..]), + //("slow", &[doc_index(1, 1)][..]), + //("engine", &[doc_index(1, 2)][..]), + //]); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. 
})); // engine + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} + + //#[test] + //fn harder_phrase_query_splitting() { + //let store = TempDatabase::from_iter(vec![ + //("search", &[doc_index(0, 0)][..]), + //("search", &[doc_index(0, 1)][..]), + //("engine", &[doc_index(0, 2)][..]), + //("search", &[doc_index(1, 0)][..]), + //("slow", &[doc_index(1, 1)][..]), + //("search", &[doc_index(1, 2)][..]), + //("engine", &[doc_index(1, 3)][..]), + //("search", &[doc_index(1, 0)][..]), + //("search", &[doc_index(1, 1)][..]), + //("slow", &[doc_index(1, 2)][..]), + //("engine", &[doc_index(1, 3)][..]), + //]); + + //let db = &store.database; + //let reader = db.main_read_txn().unwrap(); + + //let builder = store.query_builder(); + //let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); + //let mut iter = documents.into_iter(); + + //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + //let mut iter = matches.into_iter(); + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search + //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine + //assert_matches!(iter.next(), None); + //}); + //assert_matches!(iter.next(), None); + //} +//} diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index 4b4772036..cb3921567 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -9,7 +9,7 @@ use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; use log::debug; use meilisearch_tokenizer::Token; -use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig}; +use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use sdset::{Set, SetBuf, SetOperation}; use crate::database::MainT; diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index e234ca736..dd7743e53 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -1,9 +1,10 @@ use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::convert::TryFrom; +use std::println; use meilisearch_schema::IndexedPos; -use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig}; +use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::Token; use sdset::SetBuf; @@ -14,9 +15,8 @@ const WORD_LENGTH_LIMIT: usize = 80; type Word = Vec; // TODO make it be a SmallVec -pub struct RawIndexer { +pub struct RawIndexer { word_limit: usize, // the maximum number of indexed words - stop_words: fst::Set, words_doc_indexes: BTreeMap>, docs_words: HashMap>, analyzer: Analyzer, @@ -27,28 +27,26 @@ pub struct Indexed<'a> { pub docs_words: HashMap>, } -impl RawIndexer { - pub fn new(stop_words: fst::Set) -> RawIndexer { +impl RawIndexer { + pub fn new>(stop_words: fst::Set) -> RawIndexer { RawIndexer::with_word_limit(stop_words, 1000) } - pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer { + pub fn 
with_word_limit>(stop_words: fst::Set, limit: usize) -> RawIndexer { RawIndexer { word_limit: limit, - stop_words, words_doc_indexes: BTreeMap::new(), docs_words: HashMap::new(), - analyzer: Analyzer::new(AnalyzerConfig::default()), + analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words.stream().into_strs().unwrap().into_iter().collect())) } } -} -impl> RawIndexer { pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize { let mut number_of_words = 0; let analyzed_text = self.analyzer.analyze(text); - for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() { + for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() { + print!("token: {}", token.word); let must_continue = index_token( token, token_index, @@ -56,7 +54,6 @@ impl> RawIndexer { id, indexed_pos, self.word_limit, - &self.stop_words, &mut self.words_doc_indexes, &mut self.docs_words, ); @@ -88,6 +85,7 @@ impl> RawIndexer { let tokens = analyzed_text .tokens() .enumerate() + .filter(|(_, t)| t.is_word()) .map(|(i, mut t)| { t.byte_start = t.byte_start + current_byte_offset; t.byte_end = t.byte_end + current_byte_offset; @@ -103,12 +101,11 @@ impl> RawIndexer { let must_continue = index_token( token, - token_index, word_pos, + token_index, id, indexed_pos, self.word_limit, - &self.stop_words, &mut self.words_doc_indexes, &mut self.docs_words, ); @@ -145,24 +142,23 @@ impl> RawIndexer { } } -fn index_token( +fn index_token( token: Token, position: usize, word_pos: usize, id: DocumentId, indexed_pos: IndexedPos, word_limit: usize, - stop_words: &fst::Set, words_doc_indexes: &mut BTreeMap>, docs_words: &mut HashMap>, ) -> bool -where A: AsRef<[u8]>, { - if position >= word_limit { + println!(" position {}, limit: {}", position, word_limit); + if word_pos >= word_limit { return false; } - if !stop_words.contains(&token.word.as_ref()) { + if !token.is_stopword() { match token_to_docindex(id, indexed_pos, &token, word_pos) { Some(docindex) => { let word = Vec::from(token.word.as_ref()); @@ -220,9 +216,6 @@ mod tests { assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - assert!(words_doc_indexes - .get(&"éteindre".to_owned().into_bytes()) - .is_some()); } #[test] @@ -242,9 +235,6 @@ mod tests { assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - assert!(words_doc_indexes - .get(&"éteindre".to_owned().into_bytes()) - .is_some()); } #[test] @@ -269,9 +259,6 @@ mod tests { assert!(words_doc_indexes.get(&b"ai"[..]).is_none()); assert!(words_doc_indexes.get(&b"de"[..]).is_none()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - assert!(words_doc_indexes - .get(&"éteindre".to_owned().into_bytes()) - .is_some()); } #[test] @@ -303,7 +290,7 @@ mod tests { let Indexed { words_doc_indexes, .. 
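// Illustrative sketch, not part of the patch series: how a stop-word list behaves as an
// `fst::Set`, which is what `RawIndexer::new` and `with_word_limit` above receive.
// `Set::from_iter` requires its keys in lexicographic order, and `stream().into_strs()`
// is the same conversion used above to hand plain strings to the analyzer configuration.
use fst::Set;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Keys must already be sorted; sdset::SetBuf::from_dirty is used elsewhere to ensure that.
    let stop_words = Set::from_iter(vec!["ai", "de", "j", "l"])?;

    assert!(stop_words.contains("de"));
    assert!(!stop_words.contains("aspirateur"));

    // Round-trip back to strings, as done for AnalyzerConfig::default_with_stopwords above.
    let as_strings: Vec<String> = stop_words.stream().into_strs()?;
    assert_eq!(as_strings, vec!["ai", "de", "j", "l"]);
    Ok(())
}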
} = indexer.build(); - assert!(words_doc_indexes.get(&"buffering".to_owned().into_bytes()).is_some()); + assert!(words_doc_indexes.get(&"request_buffering".to_owned().into_bytes()).is_some()); } #[test] diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index b783ae978..fc999a6cb 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -110,18 +110,17 @@ pub fn push_documents_addition( } #[allow(clippy::too_many_arguments)] -fn index_document( +fn index_document( writer: &mut heed::RwTxn, documents_fields: DocumentsFields, documents_fields_counts: DocumentsFieldsCounts, ranked_map: &mut RankedMap, - indexer: &mut RawIndexer, + indexer: &mut RawIndexer, schema: &Schema, field_id: FieldId, document_id: DocumentId, value: &Value, ) -> MResult<()> -where A: AsRef<[u8]>, { let serialized = serde_json::to_vec(value)?; documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; @@ -373,14 +372,13 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Ind Ok(()) } -pub fn write_documents_addition_index( +pub fn write_documents_addition_index( writer: &mut heed::RwTxn, index: &store::Index, ranked_map: &RankedMap, number_of_inserted_documents: usize, - indexer: RawIndexer, + indexer: RawIndexer, ) -> MResult<()> -where A: AsRef<[u8]>, { let indexed = indexer.build(); let mut delta_words_builder = SetBuilder::memory(); diff --git a/meilisearch-core/src/update/helpers.rs b/meilisearch-core/src/update/helpers.rs index 1aad1f505..951480ee1 100644 --- a/meilisearch-core/src/update/helpers.rs +++ b/meilisearch-core/src/update/helpers.rs @@ -12,13 +12,12 @@ use crate::serde::SerializerError; use crate::store::DiscoverIds; /// Returns the number of words indexed or `None` if the type is unindexable. -pub fn index_value( - indexer: &mut RawIndexer, +pub fn index_value( + indexer: &mut RawIndexer, document_id: DocumentId, indexed_pos: IndexedPos, value: &Value, ) -> Option -where A: AsRef<[u8]>, { match value { Value::Null => None, From e616b1e356417ef4daf4f263b870bbfd60aad1ce Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 26 Nov 2020 10:18:36 +0100 Subject: [PATCH 04/22] hard separator offset --- meilisearch-core/src/database.rs | 8 +++--- meilisearch-core/src/query_tree.rs | 25 ++++++++++++++----- meilisearch-core/src/raw_indexer.rs | 38 +++++++++++++++++------------ 3 files changed, 45 insertions(+), 26 deletions(-) diff --git a/meilisearch-core/src/database.rs b/meilisearch-core/src/database.rs index 94563a5a9..da8d44d6a 100644 --- a/meilisearch-core/src/database.rs +++ b/meilisearch-core/src/database.rs @@ -193,8 +193,8 @@ fn version_guard(path: &Path, create: bool) -> MResult<(u32, u32, u32)> { Err(Error::VersionMismatch(format!("{}.{}.XX", version_major, version_minor))) } else { Ok(( - version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?, - version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?, + version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?, + version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?, version_patch.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))? 
)) } @@ -212,8 +212,8 @@ fn version_guard(path: &Path, create: bool) -> MResult<(u32, u32, u32)> { current_version_patch).as_bytes())?; Ok(( - current_version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?, - current_version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?, + current_version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?, + current_version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?, current_version_patch.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))? )) } else { diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index cb3921567..f16f431fa 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::hash::{Hash, Hasher}; use std::ops::Range; use std::time::Instant; @@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once}; use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; use log::debug; -use meilisearch_tokenizer::Token; +use meilisearch_tokenizer::{Token, token::SeparatorKind}; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use sdset::{Set, SetBuf, SetOperation}; @@ -175,13 +175,20 @@ where I: IntoIterator, const MAX_NGRAM: usize = 3; -fn split_query_string(s: &str) -> Vec<(usize, String)> { +fn split_query_string(s: &str, stop_words: HashSet) -> Vec<(usize, String)> { // TODO: Use global instance instead - let analyzer = Analyzer::new(AnalyzerConfig::default()); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); analyzer .analyze(s) .tokens() - .filter(|t| !t.is_stopword()) + .scan(0, |offset, mut token| { + token.char_index += *offset; + if let Some(SeparatorKind::Hard) = token.is_separator() { + *offset += 8; + } + Some(token) + }) + .filter(|t| t.is_word()) .enumerate() .map(|(i, Token { word, .. })| (i, word.to_string())) .collect() @@ -193,7 +200,13 @@ pub fn create_query_tree( query: &str, ) -> MResult<(Operation, HashMap>)> { - let words = split_query_string(query); + // TODO: use a shared analyzer instance + let words = split_query_string(query, ctx.stop_words + .stream() + .into_strs() + .unwrap_or_default() + .into_iter(). 
+ collect()); let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w)); diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index dd7743e53..510717f4d 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -1,11 +1,10 @@ use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::convert::TryFrom; -use std::println; use meilisearch_schema::IndexedPos; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; -use meilisearch_tokenizer::Token; +use meilisearch_tokenizer::{Token, token::SeparatorKind}; use sdset::SetBuf; use crate::{DocIndex, DocumentId}; @@ -45,11 +44,18 @@ impl RawIndexer { let mut number_of_words = 0; let analyzed_text = self.analyzer.analyze(text); - for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() { - print!("token: {}", token.word); + for (word_pos, token) in analyzed_text.tokens() + .scan(0, |offset, mut token| { + token.char_index += *offset; + if let Some(SeparatorKind::Hard) = token.is_separator() { + *offset += 8; + } + Some(token) + }) + .filter(|t| t.is_word()) + .enumerate() { let must_continue = index_token( token, - token_index, word_pos, id, indexed_pos, @@ -72,37 +78,39 @@ impl RawIndexer { where I: IntoIterator, { - let mut token_index_offset = 0; let mut byte_offset = 0; let mut word_offset = 0; for s in iter.into_iter() { - let current_token_index_offset = token_index_offset; let current_byte_offset = byte_offset; let current_word_offset = word_offset; let analyzed_text = self.analyzer.analyze(s); let tokens = analyzed_text .tokens() - .enumerate() - .filter(|(_, t)| t.is_word()) - .map(|(i, mut t)| { + .scan(0, |offset, mut token| { + token.char_index += *offset; + if let Some(SeparatorKind::Hard) = token.is_separator() { + *offset += 8; + } + Some(token) + }) + .filter(|t| t.is_word()) + .map(|mut t| { t.byte_start = t.byte_start + current_byte_offset; t.byte_end = t.byte_end + current_byte_offset; - (i + current_token_index_offset, t) + t }) .enumerate() .map(|(i, t)| (i + current_word_offset, t)); - for (word_pos, (token_index, token)) in tokens { - token_index_offset = token_index + 1; + for (word_pos, token) in tokens { word_offset = word_pos + 1; byte_offset = token.byte_end + 1; let must_continue = index_token( token, word_pos, - token_index, id, indexed_pos, self.word_limit, @@ -144,7 +152,6 @@ impl RawIndexer { fn index_token( token: Token, - position: usize, word_pos: usize, id: DocumentId, indexed_pos: IndexedPos, @@ -153,7 +160,6 @@ fn index_token( docs_words: &mut HashMap>, ) -> bool { - println!(" position {}, limit: {}", position, word_limit); if word_pos >= word_limit { return false; } From 6527d3e492f0dc91966fa5a1f16853608f38a75c Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 26 Nov 2020 13:16:12 +0100 Subject: [PATCH 05/22] better separator handling --- meilisearch-core/src/query_tree.rs | 22 +++++++++++---- meilisearch-core/src/raw_indexer.rs | 42 ++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index f16f431fa..9be02d337 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once}; use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; use log::debug; -use meilisearch_tokenizer::{Token, token::SeparatorKind}; +use meilisearch_tokenizer::{Token, 
token::SeparatorKind, TokenKind}; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use sdset::{Set, SetBuf, SetOperation}; @@ -181,10 +181,22 @@ fn split_query_string(s: &str, stop_words: HashSet) -> Vec<(usize, Strin analyzer .analyze(s) .tokens() - .scan(0, |offset, mut token| { - token.char_index += *offset; - if let Some(SeparatorKind::Hard) = token.is_separator() { - *offset += 8; + .scan((0, None), |(offset, sepcat), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + if let Some(SeparatorKind::Hard) = sepcat { + *offset += 8; + } + *sepcat = None; + token.char_index += *offset; + } + TokenKind::Separator(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Hard); + } + TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Soft); + } + _ => (), } Some(token) }) diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 510717f4d..dd47ed5f2 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -4,7 +4,7 @@ use std::convert::TryFrom; use meilisearch_schema::IndexedPos; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; -use meilisearch_tokenizer::{Token, token::SeparatorKind}; +use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind}; use sdset::SetBuf; use crate::{DocIndex, DocumentId}; @@ -45,10 +45,22 @@ impl RawIndexer { let analyzed_text = self.analyzer.analyze(text); for (word_pos, token) in analyzed_text.tokens() - .scan(0, |offset, mut token| { - token.char_index += *offset; - if let Some(SeparatorKind::Hard) = token.is_separator() { - *offset += 8; + .scan((0, None), |(offset, sepcat), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + if let Some(SeparatorKind::Hard) = sepcat { + *offset += 8; + } + *sepcat = None; + token.char_index += *offset; + } + TokenKind::Separator(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Hard); + } + TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Soft); + } + _ => (), } Some(token) }) @@ -88,10 +100,22 @@ impl RawIndexer { let analyzed_text = self.analyzer.analyze(s); let tokens = analyzed_text .tokens() - .scan(0, |offset, mut token| { - token.char_index += *offset; - if let Some(SeparatorKind::Hard) = token.is_separator() { - *offset += 8; + .scan((0, None), |(offset, sepcat), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + if let Some(SeparatorKind::Hard) = sepcat { + *offset += 8; + } + *sepcat = None; + token.char_index += *offset; + } + TokenKind::Separator(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Hard); + } + TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { + *sepcat = Some(SeparatorKind::Soft); + } + _ => (), } Some(token) }) From 206308c1aa8374650df2ebeedaa674ea5f2ea9dc Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 26 Nov 2020 15:17:49 +0100 Subject: [PATCH 06/22] replace hashset with fst::Set --- meilisearch-core/src/query_tree.rs | 11 ++---- meilisearch-core/src/raw_indexer.rs | 35 ++++++++++++------- .../src/update/documents_addition.rs | 12 +++---- meilisearch-core/src/update/helpers.rs | 4 +-- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index 9be02d337..aae80e395 100644 --- 
a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::ops::Range; use std::time::Instant; @@ -175,7 +175,7 @@ where I: IntoIterator, const MAX_NGRAM: usize = 3; -fn split_query_string(s: &str, stop_words: HashSet) -> Vec<(usize, String)> { +fn split_query_string<'a, A: AsRef<[u8]>>(s: &str, stop_words: &'a fst::Set) -> Vec<(usize, String)> { // TODO: Use global instance instead let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); analyzer @@ -213,12 +213,7 @@ pub fn create_query_tree( ) -> MResult<(Operation, HashMap>)> { // TODO: use a shared analyzer instance - let words = split_query_string(query, ctx.stop_words - .stream() - .into_strs() - .unwrap_or_default() - .into_iter(). - collect()); + let words = split_query_string(query, &ctx.stop_words); let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w)); diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index dd47ed5f2..0266772f6 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -14,11 +14,14 @@ const WORD_LENGTH_LIMIT: usize = 80; type Word = Vec; // TODO make it be a SmallVec -pub struct RawIndexer { +pub struct RawIndexer<'a, A> +where + A: AsRef<[u8]> +{ word_limit: usize, // the maximum number of indexed words words_doc_indexes: BTreeMap>, docs_words: HashMap>, - analyzer: Analyzer, + analyzer: Analyzer<'a, A>, } pub struct Indexed<'a> { @@ -26,17 +29,20 @@ pub struct Indexed<'a> { pub docs_words: HashMap>, } -impl RawIndexer { - pub fn new>(stop_words: fst::Set) -> RawIndexer { +impl<'a, A> RawIndexer<'a, A> +where + A: AsRef<[u8]> +{ + pub fn new(stop_words: &'a fst::Set) -> RawIndexer<'a, A> { RawIndexer::with_word_limit(stop_words, 1000) } - pub fn with_word_limit>(stop_words: fst::Set, limit: usize) -> RawIndexer { + pub fn with_word_limit(stop_words: &'a fst::Set, limit: usize) -> RawIndexer { RawIndexer { word_limit: limit, words_doc_indexes: BTreeMap::new(), docs_words: HashMap::new(), - analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words.stream().into_strs().unwrap().into_iter().collect())) + analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)), } } @@ -231,7 +237,8 @@ mod tests { #[test] fn strange_apostrophe() { - let mut indexer = RawIndexer::new(fst::Set::default()); + let stop_words = fst::Set::default(); + let mut indexer = RawIndexer::new(&stop_words); let docid = DocumentId(0); let indexed_pos = IndexedPos(0); @@ -250,7 +257,8 @@ mod tests { #[test] fn strange_apostrophe_in_sequence() { - let mut indexer = RawIndexer::new(fst::Set::default()); + let stop_words = fst::Set::default(); + let mut indexer = RawIndexer::new(&stop_words); let docid = DocumentId(0); let indexed_pos = IndexedPos(0); @@ -272,7 +280,7 @@ mod tests { let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]); let stop_words = fst::Set::from_iter(stop_words).unwrap(); - let mut indexer = RawIndexer::new(stop_words); + let mut indexer = RawIndexer::new(&stop_words); let docid = DocumentId(0); let indexed_pos = IndexedPos(0); @@ -293,7 +301,8 @@ mod tests { #[test] fn no_empty_unidecode() { - let mut indexer = RawIndexer::new(fst::Set::default()); + let stop_words = fst::Set::default(); + let mut indexer = RawIndexer::new(&stop_words); let docid = DocumentId(0); let indexed_pos = IndexedPos(0); @@ 
-312,7 +321,8 @@ mod tests { #[test] // test sample from 807 fn very_long_text() { - let mut indexer = RawIndexer::new(fst::Set::default()); + let stop_words = fst::Set::default(); + let mut indexer = RawIndexer::new(&stop_words); let indexed_pos = IndexedPos(0); let docid = DocumentId(0); let text = " The locations block is the most powerful, and potentially most involved, section of the .platform.app.yaml file. It allows you to control how the application container responds to incoming requests at a very fine-grained level. Common patterns also vary between language containers due to the way PHP-FPM handles incoming requests.\nEach entry of the locations block is an absolute URI path (with leading /) and its value includes the configuration directives for how the web server should handle matching requests. That is, if your domain is example.com then '/' means “requests for example.com/”, while '/admin' means “requests for example.com/admin”. If multiple blocks could match an incoming request then the most-specific will apply.\nweb:locations:'/':# Rules for all requests that don't otherwise match....'/sites/default/files':# Rules for any requests that begin with /sites/default/files....The simplest possible locations configuration is one that simply passes all requests on to your application unconditionally:\nweb:locations:'/':passthru:trueThat is, all requests to /* should be forwarded to the process started by web.commands.start above. Note that for PHP containers the passthru key must specify what PHP file the request should be forwarded to, and must also specify a docroot under which the file lives. For example:\nweb:locations:'/':root:'web'passthru:'/app.php'This block will serve requests to / from the web directory in the application, and if a file doesn’t exist on disk then the request will be forwarded to the /app.php script.\nA full list of the possible subkeys for locations is below.\n root: The folder from which to serve static assets for this location relative to the application root. The application root is the directory in which the .platform.app.yaml file is located. Typical values for this property include public or web. Setting it to '' is not recommended, and its behavior may vary depending on the type of application. Absolute paths are not supported.\n passthru: Whether to forward disallowed and missing resources from this location to the application and can be true, false or an absolute URI path (with leading /). The default value is false. For non-PHP applications it will generally be just true or false. In a PHP application this will typically be the front controller such as /index.php or /app.php. This entry works similar to mod_rewrite under Apache. Note: If the value of passthru does not begin with the same value as the location key it is under, the passthru may evaluate to another entry. That may be useful when you want different cache settings for different paths, for instance, but want missing files in all of them to map back to the same front controller. See the example block below.\n index: The files to consider when serving a request for a directory: an array of file names or null. (typically ['index.html']). Note that in order for this to work, access to the static files named must be allowed by the allow or rules keys for this location.\n expires: How long to allow static assets from this location to be cached (this enables the Cache-Control and Expires headers) and can be a time or -1 for no caching (default). 
Times can be suffixed with “ms” (milliseconds), “s” (seconds), “m” (minutes), “h” (hours), “d” (days), “w” (weeks), “M” (months, 30d) or “y” (years, 365d).\n scripts: Whether to allow loading scripts in that location (true or false). This directive is only meaningful on PHP.\n allow: Whether to allow serving files which don’t match a rule (true or false, default: true).\n headers: Any additional headers to apply to static assets. This section is a mapping of header names to header values. Responses from the application aren’t affected, to avoid overlap with the application’s own ability to include custom headers in the response.\n rules: Specific overrides for a specific location. The key is a PCRE (regular expression) that is matched against the full request path.\n request_buffering: Most application servers do not support chunked requests (e.g. fpm, uwsgi), so Platform.sh enables request_buffering by default to handle them. That default configuration would look like this if it was present in .platform.app.yaml:\nweb:locations:'/':passthru:truerequest_buffering:enabled:truemax_request_size:250mIf the application server can already efficiently handle chunked requests, the request_buffering subkey can be modified to disable it entirely (enabled: false). Additionally, applications that frequently deal with uploads greater than 250MB in size can update the max_request_size key to the application’s needs. Note that modifications to request_buffering will need to be specified at each location where it is desired.\n "; @@ -325,7 +335,8 @@ mod tests { #[test] fn words_over_index_1000_not_indexed() { - let mut indexer = RawIndexer::new(fst::Set::default()); + let stop_words = fst::Set::default(); + let mut indexer = RawIndexer::new(&stop_words); let indexed_pos = IndexedPos(0); let docid = DocumentId(0); let mut text = String::with_capacity(5000); diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index fc999a6cb..00fdd5122 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -110,12 +110,12 @@ pub fn push_documents_addition( } #[allow(clippy::too_many_arguments)] -fn index_document( +fn index_document>( writer: &mut heed::RwTxn, documents_fields: DocumentsFields, documents_fields_counts: DocumentsFieldsCounts, ranked_map: &mut RankedMap, - indexer: &mut RawIndexer, + indexer: &mut RawIndexer, schema: &Schema, field_id: FieldId, document_id: DocumentId, @@ -221,7 +221,7 @@ pub fn apply_addition( let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?; - let mut indexer = RawIndexer::new(stop_words); + let mut indexer = RawIndexer::new(&stop_words); // For each document in this update for (document_id, document) in &documents_additions { @@ -316,7 +316,7 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Ind .unwrap(); let number_of_inserted_documents = documents_ids_to_reindex.len(); - let mut indexer = RawIndexer::new(stop_words); + let mut indexer = RawIndexer::new(&stop_words); let mut ram_store = HashMap::new(); if let Some(ref attributes_for_facetting) = index.main.attributes_for_faceting(writer)? 
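// Illustrative sketch, not part of the patch series: the `A: AsRef<[u8]>` bound that this
// commit threads through `index_document`, `RawIndexer` and `write_documents_addition_index`.
// `fst::Set` is generic over its byte storage, so a function bounded this way accepts an
// in-memory `Set<Vec<u8>>` as well as, in principle, a memory-mapped set, without copying
// the stop words.
use fst::Set;

fn count_stop_words<A: AsRef<[u8]>>(stop_words: &Set<A>) -> usize {
    stop_words.len()
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let in_memory: Set<Vec<u8>> = Set::from_iter(vec!["ai", "de", "la"])?;
    assert_eq!(count_stop_words(&in_memory), 3);
    Ok(())
}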
{ @@ -372,12 +372,12 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Ind Ok(()) } -pub fn write_documents_addition_index( +pub fn write_documents_addition_index>( writer: &mut heed::RwTxn, index: &store::Index, ranked_map: &RankedMap, number_of_inserted_documents: usize, - indexer: RawIndexer, + indexer: RawIndexer, ) -> MResult<()> { let indexed = indexer.build(); diff --git a/meilisearch-core/src/update/helpers.rs b/meilisearch-core/src/update/helpers.rs index 951480ee1..8d9ff633c 100644 --- a/meilisearch-core/src/update/helpers.rs +++ b/meilisearch-core/src/update/helpers.rs @@ -12,8 +12,8 @@ use crate::serde::SerializerError; use crate::store::DiscoverIds; /// Returns the number of words indexed or `None` if the type is unindexable. -pub fn index_value( - indexer: &mut RawIndexer, +pub fn index_value>( + indexer: &mut RawIndexer, document_id: DocumentId, indexed_pos: IndexedPos, value: &Value, From c6434f609c37b20eca87cd4036fa97ebf99b0cbe Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 26 Nov 2020 20:01:53 +0100 Subject: [PATCH 07/22] fix indexing length --- meilisearch-core/src/query_tree.rs | 18 +++---- meilisearch-core/src/raw_indexer.rs | 56 ++++++++++---------- meilisearch-http/tests/placeholder_search.rs | 2 + 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index aae80e395..d9473f301 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -181,27 +181,25 @@ fn split_query_string<'a, A: AsRef<[u8]>>(s: &str, stop_words: &'a fst::Set) analyzer .analyze(s) .tokens() - .scan((0, None), |(offset, sepcat), mut token| { + .scan((0, false), |(offset, is_hard_sep), mut token| { match token.kind { TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { - if let Some(SeparatorKind::Hard) = sepcat { + if *is_hard_sep { *offset += 8; + } else { + *offset += 1; } - *sepcat = None; + *is_hard_sep = false; token.char_index += *offset; } TokenKind::Separator(SeparatorKind::Hard) => { - *sepcat = Some(SeparatorKind::Hard); - } - TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { - *sepcat = Some(SeparatorKind::Soft); + *is_hard_sep = true; } _ => (), } - Some(token) + Some((*offset, token)) }) - .filter(|t| t.is_word()) - .enumerate() + .filter(|(_, t)| t.is_word()) .map(|(i, Token { word, .. 
})| (i, word.to_string())) .collect() } diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 0266772f6..fd8a68a43 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -50,31 +50,32 @@ where let mut number_of_words = 0; let analyzed_text = self.analyzer.analyze(text); - for (word_pos, token) in analyzed_text.tokens() - .scan((0, None), |(offset, sepcat), mut token| { + for (token_pos, (word_pos, token)) in analyzed_text + .tokens() + .scan((0, false), |(offset, is_hard_sep), mut token| { match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { - if let Some(SeparatorKind::Hard) = sepcat { + TokenKind::Word => { + if *is_hard_sep { *offset += 8; + } else { + *offset += 1; } - *sepcat = None; + *is_hard_sep = false; token.char_index += *offset; } TokenKind::Separator(SeparatorKind::Hard) => { - *sepcat = Some(SeparatorKind::Hard); - } - TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { - *sepcat = Some(SeparatorKind::Soft); + *is_hard_sep = true; } _ => (), } - Some(token) + Some((*offset, token)) }) - .filter(|t| t.is_word()) + .filter(|(_, t)| t.is_word()) .enumerate() { let must_continue = index_token( token, word_pos, + token_pos, id, indexed_pos, self.word_limit, @@ -106,41 +107,41 @@ where let analyzed_text = self.analyzer.analyze(s); let tokens = analyzed_text .tokens() - .scan((0, None), |(offset, sepcat), mut token| { + .scan((0, false), |(offset, is_hard_sep), mut token| { match token.kind { TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { - if let Some(SeparatorKind::Hard) = sepcat { + if *is_hard_sep { *offset += 8; + } else { + *offset += 1; } - *sepcat = None; + *is_hard_sep = false; token.char_index += *offset; } TokenKind::Separator(SeparatorKind::Hard) => { - *sepcat = Some(SeparatorKind::Hard); - } - TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => { - *sepcat = Some(SeparatorKind::Soft); + *is_hard_sep = true; } _ => (), } - Some(token) + Some((*offset, token)) }) - .filter(|t| t.is_word()) - .map(|mut t| { + .filter(|(_, t)| t.is_word()) + .map(|(i, mut t)| { t.byte_start = t.byte_start + current_byte_offset; t.byte_end = t.byte_end + current_byte_offset; - t + (i, t) }) - .enumerate() - .map(|(i, t)| (i + current_word_offset, t)); + .map(|(i, t)| (i + current_word_offset, t)) + .enumerate(); - for (word_pos, token) in tokens { + for (token_pos, (word_pos, token)) in tokens { word_offset = word_pos + 1; byte_offset = token.byte_end + 1; let must_continue = index_token( token, word_pos, + token_pos, id, indexed_pos, self.word_limit, @@ -183,6 +184,7 @@ where fn index_token( token: Token, word_pos: usize, + token_pos: usize, id: DocumentId, indexed_pos: IndexedPos, word_limit: usize, @@ -190,7 +192,7 @@ fn index_token( docs_words: &mut HashMap>, ) -> bool { - if word_pos >= word_limit { + if token_pos >= word_limit { return false; } @@ -330,7 +332,7 @@ mod tests { let Indexed { words_doc_indexes, .. 
} = indexer.build(); - assert!(words_doc_indexes.get(&"request_buffering".to_owned().into_bytes()).is_some()); + assert!(words_doc_indexes.get(&"request".to_owned().into_bytes()).is_some()); } #[test] diff --git a/meilisearch-http/tests/placeholder_search.rs b/meilisearch-http/tests/placeholder_search.rs index 048ab7f8b..fb1286248 100644 --- a/meilisearch-http/tests/placeholder_search.rs +++ b/meilisearch-http/tests/placeholder_search.rs @@ -102,6 +102,8 @@ async fn placeholder_search_witch_crop() { "cropLength": 20 }); + println!("here"); + test_post_get_search!(server, query, |response, status_code| { assert_eq!(status_code, 200); From b574960755e5c006c763ef036f20bfb49fa62b8b Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 26 Nov 2020 20:19:15 +0100 Subject: [PATCH 08/22] fix split_query_string --- meilisearch-core/src/database.rs | 1 + meilisearch-core/src/query_tree.rs | 23 +++-------------------- meilisearch-core/src/raw_indexer.rs | 4 ++-- 3 files changed, 6 insertions(+), 22 deletions(-) diff --git a/meilisearch-core/src/database.rs b/meilisearch-core/src/database.rs index da8d44d6a..53faacb50 100644 --- a/meilisearch-core/src/database.rs +++ b/meilisearch-core/src/database.rs @@ -882,6 +882,7 @@ mod tests { // even try to search for a document let reader = db.main_read_txn().unwrap(); + println!("here"); let SortResult {documents, .. } = index.query_builder().query(&reader, Some("21 "), 0..20).unwrap(); assert_matches!(documents.len(), 1); diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index d9473f301..9ecd38f0f 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -181,26 +181,9 @@ fn split_query_string<'a, A: AsRef<[u8]>>(s: &str, stop_words: &'a fst::Set) analyzer .analyze(s) .tokens() - .scan((0, false), |(offset, is_hard_sep), mut token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { - if *is_hard_sep { - *offset += 8; - } else { - *offset += 1; - } - *is_hard_sep = false; - token.char_index += *offset; - } - TokenKind::Separator(SeparatorKind::Hard) => { - *is_hard_sep = true; - } - _ => (), - } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) - .map(|(i, Token { word, .. })| (i, word.to_string())) + .filter(|t| t.is_word()) + .map(| Token { word, .. 
}| word.to_string()) + .enumerate() .collect() } diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index fd8a68a43..a6bff7f0c 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -55,13 +55,13 @@ where .scan((0, false), |(offset, is_hard_sep), mut token| { match token.kind { TokenKind::Word => { + token.char_index += *offset; if *is_hard_sep { *offset += 8; } else { *offset += 1; } *is_hard_sep = false; - token.char_index += *offset; } TokenKind::Separator(SeparatorKind::Hard) => { *is_hard_sep = true; @@ -110,13 +110,13 @@ where .scan((0, false), |(offset, is_hard_sep), mut token| { match token.kind { TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + token.char_index += *offset; if *is_hard_sep { *offset += 8; } else { *offset += 1; } *is_hard_sep = false; - token.char_index += *offset; } TokenKind::Separator(SeparatorKind::Hard) => { *is_hard_sep = true; From db64e19b8d79ac3b535011ebff17d170969104ea Mon Sep 17 00:00:00 2001 From: mpostma Date: Wed, 2 Dec 2020 15:21:24 +0100 Subject: [PATCH 09/22] all tests pass --- meilisearch-core/src/query_tree.rs | 2 +- meilisearch-core/src/raw_indexer.rs | 83 ++++++++++++++--------------- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index 9ecd38f0f..e1485566e 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once}; use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; use log::debug; -use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind}; +use meilisearch_tokenizer::Token; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use sdset::{Set, SetBuf, SetOperation}; diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index a6bff7f0c..8ed709324 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -46,32 +46,12 @@ where } } + pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize { let mut number_of_words = 0; let analyzed_text = self.analyzer.analyze(text); - for (token_pos, (word_pos, token)) in analyzed_text - .tokens() - .scan((0, false), |(offset, is_hard_sep), mut token| { - match token.kind { - TokenKind::Word => { - token.char_index += *offset; - if *is_hard_sep { - *offset += 8; - } else { - *offset += 1; - } - *is_hard_sep = false; - } - TokenKind::Separator(SeparatorKind::Hard) => { - *is_hard_sep = true; - } - _ => (), - } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) - .enumerate() { + for (token_pos, (word_pos, token)) in process_tokens(analyzed_text.tokens()).enumerate() { let must_continue = index_token( token, word_pos, @@ -105,27 +85,7 @@ where let current_word_offset = word_offset; let analyzed_text = self.analyzer.analyze(s); - let tokens = analyzed_text - .tokens() - .scan((0, false), |(offset, is_hard_sep), mut token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { - token.char_index += *offset; - if *is_hard_sep { - *offset += 8; - } else { - *offset += 1; - } - *is_hard_sep = false; - } - TokenKind::Separator(SeparatorKind::Hard) => { - *is_hard_sep = true; - } - _ => (), - } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) + let tokens = process_tokens(analyzed_text.tokens()) .map(|(i, mut t)| { t.byte_start = t.byte_start + current_byte_offset; 
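// Illustrative sketch, not part of the patch series: the word-position rule that the
// `process_tokens` helper introduced just below implements, reproduced on a simplified
// token type (the real meilisearch-tokenizer `Token`/`TokenKind` carry more fields).
// A soft separator advances the position by 1, a hard separator by 8, and consecutive
// separators count once with the strongest kind winning, so downstream proximity ranking
// treats words on either side of a sentence boundary as far apart. The indexer then pairs
// these positions with a plain enumeration (`token_pos`) to enforce the indexed-word limit.
#[derive(Debug, Clone, Copy)]
enum Sep { Soft, Hard }

#[derive(Debug, Clone, Copy)]
enum Kind { Word, Separator(Sep) }

fn word_positions<'a>(tokens: &[(&'a str, Kind)]) -> Vec<(usize, &'a str)> {
    tokens
        .iter()
        .scan((0usize, None), |(pos, pending), &(text, kind)| {
            let emitted = match kind {
                Kind::Word => {
                    // Apply the pending separator, if any, then emit the word.
                    *pos += match pending.take() {
                        Some(Sep::Hard) => 8,
                        Some(Sep::Soft) => 1,
                        None => 0,
                    };
                    Some((*pos, text))
                }
                Kind::Separator(Sep::Hard) => {
                    *pending = Some(Sep::Hard);
                    None
                }
                Kind::Separator(Sep::Soft) => {
                    if pending.is_none() {
                        *pending = Some(Sep::Soft);
                    }
                    None
                }
            };
            Some(emitted)
        })
        .flatten()
        .collect()
}

fn main() {
    // "new york. subway": the period pushes "subway" eight positions past "york".
    let tokens = [
        ("new", Kind::Word),
        (" ", Kind::Separator(Sep::Soft)),
        ("york", Kind::Word),
        (".", Kind::Separator(Sep::Hard)),
        (" ", Kind::Separator(Sep::Soft)),
        ("subway", Kind::Word),
    ];
    assert_eq!(word_positions(&tokens), vec![(0, "new"), (1, "york"), (9, "subway")]);
}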
t.byte_end = t.byte_end + current_byte_offset; @@ -181,6 +141,31 @@ where } } +fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator)> { + tokens + .scan((0, None), |(offset, sepkind), token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + *offset += match *sepkind { + Some(SeparatorKind::Hard) => 8, + Some(SeparatorKind::Soft) => 1, + None => 0, + }; + *sepkind = None; + } + TokenKind::Separator(SeparatorKind::Hard) => { + *sepkind = Some(SeparatorKind::Hard); + } + TokenKind::Separator(SeparatorKind::Soft) if sepkind.is_none() => { + *sepkind = Some(SeparatorKind::Soft); + } + _ => (), + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} + fn index_token( token: Token, word_pos: usize, @@ -236,6 +221,18 @@ fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, wor mod tests { use super::*; use meilisearch_schema::IndexedPos; + use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; + use fst::Set; + + #[test] + fn test_process_token() { + let text = " Zut, l’aspirateur, j’ai oublié de l’éteindre !"; + let stopwords = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords)); + let analyzer = analyzer.analyze(text); + let tokens: Vec<_> = process_tokens(analyzer.tokens()).collect(); + println!("tokens: {:?}", tokens); + } #[test] fn strange_apostrophe() { From a7c88c7951e8e07a3366f3fdbbd2fe429ed4870b Mon Sep 17 00:00:00 2001 From: mpostma Date: Wed, 2 Dec 2020 18:55:39 +0100 Subject: [PATCH 10/22] restore synonyms tests --- meilisearch-core/src/database.rs | 1 - meilisearch-core/src/query_builder.rs | 2289 +++++++++++++------------ meilisearch-core/src/raw_indexer.rs | 10 +- 3 files changed, 1165 insertions(+), 1135 deletions(-) diff --git a/meilisearch-core/src/database.rs b/meilisearch-core/src/database.rs index 53faacb50..da8d44d6a 100644 --- a/meilisearch-core/src/database.rs +++ b/meilisearch-core/src/database.rs @@ -882,7 +882,6 @@ mod tests { // even try to search for a document let reader = db.main_read_txn().unwrap(); - println!("here"); let SortResult {documents, .. 
} = index.query_builder().query(&reader, Some("21 "), 0..20).unwrap(); assert_matches!(documents.len(), 1); diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index f8c55189e..201610d44 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -282,1132 +282,1163 @@ impl<'c, 'f, 'd, 'i> QueryBuilder<'c, 'f, 'd, 'i> { } } -//#[cfg(test)] -//mod tests { - //use super::*; - - //use std::collections::{BTreeSet, HashMap}; - //use std::iter::FromIterator; - - //use fst::IntoStreamer; - //use meilisearch_schema::IndexedPos; - //use sdset::SetBuf; - //use tempfile::TempDir; - - //use crate::automaton::normalize_str; - //use crate::bucket_sort::SimpleMatch; - //use crate::database::{Database, DatabaseOptions}; - //use crate::store::Index; - //use crate::DocIndex; - //use crate::Document; - //use meilisearch_schema::Schema; - - //fn set_from_stream<'f, I, S>(stream: I) -> fst::Set> - //where - //I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>, - //S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>, - //{ - //let mut builder = fst::SetBuilder::memory(); - //builder.extend_stream(stream).unwrap(); - //builder.into_set() - //} - - //fn insert_key>(set: &fst::Set, key: &[u8]) -> fst::Set> { - //let unique_key = { - //let mut builder = fst::SetBuilder::memory(); - //builder.insert(key).unwrap(); - //builder.into_set() - //}; - - //let union_ = set.op().add(unique_key.into_stream()).r#union(); - - //set_from_stream(union_) - //} - - //fn sdset_into_fstset(set: &sdset::Set<&str>) -> fst::Set> { - //let mut builder = fst::SetBuilder::memory(); - //let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect()); - //builder.extend_iter(set.into_iter()).unwrap(); - //builder.into_set() - //} - - //const fn doc_index(document_id: u32, word_index: u16) -> DocIndex { - //DocIndex { - //document_id: DocumentId(document_id), - //attribute: 0, - //word_index, - //char_index: 0, - //char_length: 0, - //} - //} - - //const fn doc_char_index(document_id: u32, word_index: u16, char_index: u16) -> DocIndex { - //DocIndex { - //document_id: DocumentId(document_id), - //attribute: 0, - //word_index, - //char_index, - //char_length: 0, - //} - //} - - //pub struct TempDatabase { - //database: Database, - //index: Index, - //_tempdir: TempDir, - //} - - //impl TempDatabase { - //pub fn query_builder(&self) -> QueryBuilder { - //self.index.query_builder() - //} - - //pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) { - //let db = &self.database; - //let mut writer = db.main_write_txn().unwrap(); - - //let word = normalize_str(word); - - //let alternatives = self - //.index - //.synonyms - //.synonyms_fst(&writer, word.as_bytes()) - //.unwrap(); - - //let new = sdset_into_fstset(&new); - //let new_alternatives = - //set_from_stream(alternatives.op().add(new.into_stream()).r#union()); - //self.index - //.synonyms - //.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives) - //.unwrap(); - - //let synonyms = self.index.main.synonyms_fst(&writer).unwrap(); - - //let synonyms_fst = insert_key(&synonyms, word.as_bytes()); - //self.index - //.main - //.put_synonyms_fst(&mut writer, &synonyms_fst) - //.unwrap(); - - //writer.commit().unwrap(); - //} - //} - - //impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase { - //fn from_iter>(iter: I) -> Self { - //let tempdir = TempDir::new().unwrap(); - //let database = Database::open_or_create(&tempdir, DatabaseOptions::default()).unwrap(); - //let 
index = database.create_index("default").unwrap(); - - //let db = &database; - //let mut writer = db.main_write_txn().unwrap(); - - //let mut words_fst = BTreeSet::new(); - //let mut postings_lists = HashMap::new(); - //let mut fields_counts = HashMap::<_, u16>::new(); - - //let mut schema = Schema::with_primary_key("id"); - - //for (word, indexes) in iter { - //let mut final_indexes = Vec::new(); - //for index in indexes { - //let name = index.attribute.to_string(); - //schema.insert(&name).unwrap(); - //let indexed_pos = schema.set_indexed(&name).unwrap().1; - //let index = DocIndex { - //attribute: indexed_pos.0, - //..*index - //}; - //final_indexes.push(index); - //} - - //let word = word.to_lowercase().into_bytes(); - //words_fst.insert(word.clone()); - //postings_lists - //.entry(word) - //.or_insert_with(Vec::new) - //.extend_from_slice(&final_indexes); - //for idx in final_indexes { - //fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1); - //} - //} - - //index.main.put_schema(&mut writer, &schema).unwrap(); - - //let words_fst = fst::Set::from_iter(words_fst).unwrap(); - - //index.main.put_words_fst(&mut writer, &words_fst).unwrap(); - - //for (word, postings_list) in postings_lists { - //let postings_list = SetBuf::from_dirty(postings_list); - //index - //.postings_lists - //.put_postings_list(&mut writer, &word, &postings_list) - //.unwrap(); - //} - - //for ((docid, attr, _), count) in fields_counts { - //let prev = index - //.documents_fields_counts - //.document_field_count(&writer, docid, IndexedPos(attr)) - //.unwrap(); - - //let prev = prev.unwrap_or(0); - - //index - //.documents_fields_counts - //.put_document_field_count(&mut writer, docid, IndexedPos(attr), prev + count) - //.unwrap(); - //} - - //writer.commit().unwrap(); - - //TempDatabase { database, index, _tempdir: tempdir } - //} - //} - - //#[test] - //fn simple() { - //let store = TempDatabase::from_iter(vec![ - //("iphone", &[doc_char_index(0, 0, 0)][..]), - //("from", &[doc_char_index(0, 1, 1)][..]), - //("apple", &[doc_char_index(0, 2, 2)][..]), - //]); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("iphone from apple"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn simple_synonyms() { - //let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); - - //store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("hello"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. 
})); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("bonjour"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //// #[test] - //// fn prefix_synonyms() { - //// let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); - - //// store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); - //// store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"])); - - //// let db = &store.database; - //// let reader = db.main_read_txn().unwrap(); - - //// let builder = store.query_builder(); - //// let results = builder.query(&reader, "sal", 0..20).unwrap(); - //// let mut iter = documents.into_iter(); - - //// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //// let mut matches = matches.into_iter(); - //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //// assert_matches!(matches.next(), None); - //// }); - //// assert_matches!(iter.next(), None); - - //// let builder = store.query_builder(); - //// let results = builder.query(&reader, "bonj", 0..20).unwrap(); - //// let mut iter = documents.into_iter(); - - //// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //// let mut matches = matches.into_iter(); - //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //// assert_matches!(matches.next(), None); - //// }); - //// assert_matches!(iter.next(), None); - - //// let builder = store.query_builder(); - //// let results = builder.query(&reader, "sal blabla", 0..20).unwrap(); - //// let mut iter = documents.into_iter(); - - //// assert_matches!(iter.next(), None); - - //// let builder = store.query_builder(); - //// let results = builder.query(&reader, "bonj blabla", 0..20).unwrap(); - //// let mut iter = documents.into_iter(); - - //// assert_matches!(iter.next(), None); - //// } - - //// #[test] - //// fn levenshtein_synonyms() { - //// let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); - - //// store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"])); - - //// let db = &store.database; - //// let reader = db.main_read_txn().unwrap(); - - //// let builder = store.query_builder(); - //// let results = builder.query(&reader, "salutution", 0..20).unwrap(); - //// let mut iter = documents.into_iter(); - - //// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //// let mut matches = matches.into_iter(); - //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //// assert_matches!(matches.next(), None); - //// }); - //// assert_matches!(iter.next(), None); - - //// let builder = store.query_builder(); - //// let results = builder.query(&reader, "saluttion", 0..20).unwrap(); - //// let mut iter = documents.into_iter(); - - //// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { - //// let mut matches = matches.into_iter(); - //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //// assert_matches!(matches.next(), None); - //// }); - //// assert_matches!(iter.next(), None); - //// } - - //#[test] - //fn harder_synonyms() { - //let mut store = TempDatabase::from_iter(vec![ - //("hello", &[doc_index(0, 0)][..]), - //("bonjour", &[doc_index(1, 3)]), - //("salut", &[doc_index(2, 5)]), - //]); - - //store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"])); - //store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello", "salut"])); - //store.add_synonym("salut", SetBuf::from_dirty(vec!["hello", "bonjour"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("hello"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("bonjour"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("salut"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. 
}) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - ///// Unique word has multi-word synonyms - //fn unique_to_multiword_synonyms() { - //let mut store = TempDatabase::from_iter(vec![ - //("new", &[doc_char_index(0, 0, 0)][..]), - //("york", &[doc_char_index(0, 1, 1)][..]), - //("city", &[doc_char_index(0, 2, 2)][..]), - //("subway", &[doc_char_index(0, 3, 3)][..]), - //("NY", &[doc_char_index(1, 0, 0)][..]), - //("subway", &[doc_char_index(1, 1, 1)][..]), - //]); - - //store.add_synonym( - //"NY", - //SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), - //); - //store.add_synonym( - //"NYC", - //SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), - //); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway - //assert_matches!(iter.next(), None); // position rewritten ^ - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NY ± new - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NY ± york - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NY ± city - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // city = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway - //assert_matches!(iter.next(), None); // position rewritten ^ - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NYC ± new - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NYC ± york - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NYC ± city - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn unique_to_multiword_synonyms_words_proximity() { - //let mut store = TempDatabase::from_iter(vec![ - //("new", &[doc_char_index(0, 0, 0)][..]), - //("york", &[doc_char_index(0, 1, 1)][..]), - //("city", &[doc_char_index(0, 2, 2)][..]), - //("subway", &[doc_char_index(0, 3, 3)][..]), - //("york", &[doc_char_index(1, 0, 0)][..]), - //("new", &[doc_char_index(1, 1, 1)][..]), - //("subway", &[doc_char_index(1, 2, 2)][..]), - //("NY", &[doc_char_index(2, 0, 0)][..]), - //("subway", &[doc_char_index(2, 1, 1)][..]), - //]); - - //store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("NY"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // NY ± york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // NY ± new - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // york = NY - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // new = NY - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york = NY - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new = NY - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("new york"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. 
})); // york - //assert_matches!(matches.next(), None); // position rewritten ^ - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn unique_to_multiword_synonyms_cumulative_word_index() { - //let mut store = TempDatabase::from_iter(vec![ - //("NY", &[doc_char_index(0, 0, 0)][..]), - //("subway", &[doc_char_index(0, 1, 1)][..]), - //("new", &[doc_char_index(1, 0, 0)][..]), - //("york", &[doc_char_index(1, 1, 1)][..]), - //("subway", &[doc_char_index(1, 2, 2)][..]), - //]); - - //store.add_synonym("new york", SetBuf::from_dirty(vec!["NY"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway - //assert_matches!(matches.next(), None); - //}); - //// assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //// let mut matches = matches.into_iter(); - //// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway - //// assert_matches!(matches.next(), None); - //// }); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = - //builder.query(&reader, Some("new york subway"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // subway - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - ///// Unique word has multi-word synonyms - //fn harder_unique_to_multiword_synonyms_one() { - //let mut store = TempDatabase::from_iter(vec![ - //("new", &[doc_char_index(0, 0, 0)][..]), - //("york", &[doc_char_index(0, 1, 1)][..]), - //("city", &[doc_char_index(0, 2, 2)][..]), - //("yellow", &[doc_char_index(0, 3, 3)][..]), - //("subway", &[doc_char_index(0, 4, 4)][..]), - //("broken", &[doc_char_index(0, 5, 5)][..]), - //("NY", &[doc_char_index(1, 0, 0)][..]), - //("blue", &[doc_char_index(1, 1, 1)][..]), - //("subway", &[doc_char_index(1, 2, 2)][..]), - //]); - - //store.add_synonym( - //"NY", - //SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), - //); - //store.add_synonym( - //"NYC", - //SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), - //); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - //assert_matches!(iter.next(), None); // position rewritten ^ - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - //assert_matches!(iter.next(), None); // position rewritten ^ - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC - //// because one-word to one-word ^^^^ - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - //assert_matches!(iter.next(), None); // position rewritten ^ - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - ///// Unique word has multi-word synonyms - //fn even_harder_unique_to_multiword_synonyms() { - //let mut store = TempDatabase::from_iter(vec![ - //("new", &[doc_char_index(0, 0, 0)][..]), - //("york", &[doc_char_index(0, 1, 1)][..]), - //("city", &[doc_char_index(0, 2, 2)][..]), - //("yellow", &[doc_char_index(0, 3, 3)][..]), - //("underground", &[doc_char_index(0, 4, 4)][..]), - //("train", &[doc_char_index(0, 5, 5)][..]), - //("broken", &[doc_char_index(0, 6, 6)][..]), - //("NY", &[doc_char_index(1, 0, 0)][..]), - //("blue", &[doc_char_index(1, 1, 1)][..]), - //("subway", &[doc_char_index(1, 2, 2)][..]), - //]); - - //store.add_synonym( - //"NY", - //SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), - //); - //store.add_synonym( - //"NYC", - //SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), - //); - //store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult {documents, .. } = builder.query(&reader, Some("NY subway broken"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken - //assert_matches!(iter.next(), None); // position rewritten ^ - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // city = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway - //assert_matches!(iter.next(), None); // position rewritten ^ - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - //// because one-word to one-word ^^^^ - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // subway = underground - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // subway = train - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - ///// Multi-word has multi-word synonyms - //fn multiword_to_multiword_synonyms() { - //let mut store = TempDatabase::from_iter(vec![ - //("NY", &[doc_char_index(0, 0, 0)][..]), - //("subway", &[doc_char_index(0, 1, 1)][..]), - //("NYC", &[doc_char_index(1, 0, 0)][..]), - //("blue", &[doc_char_index(1, 1, 1)][..]), - //("subway", &[doc_char_index(1, 2, 2)][..]), - //("broken", &[doc_char_index(1, 3, 3)][..]), - //("new", &[doc_char_index(2, 0, 0)][..]), - //("york", &[doc_char_index(2, 1, 1)][..]), - //("underground", &[doc_char_index(2, 2, 2)][..]), - //("train", &[doc_char_index(2, 3, 3)][..]), - //("broken", &[doc_char_index(2, 4, 4)][..]), - //]); - - //store.add_synonym( - //"new york", - //SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]), - //); - //store.add_synonym( - //"new york city", - //SetBuf::from_dirty(vec!["NYC", "NY", "new york"]), - //); - //store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder - //.query(&reader, Some("new york underground train broken"), 0..20) - //.unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. 
})); // train - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder - //.query(&reader, Some("new york city underground train broken"), 0..20) - //.unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 2, is_exact: true, .. })); // underground - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // train - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 4, is_exact: true, .. })); // broken - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. 
})); // broken - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn intercrossed_multiword_synonyms() { - //let mut store = TempDatabase::from_iter(vec![ - //("new", &[doc_index(0, 0)][..]), - //("york", &[doc_index(0, 1)][..]), - //("big", &[doc_index(0, 2)][..]), - //("city", &[doc_index(0, 3)][..]), - //]); - - //store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"])); - //store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("new york big "), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let mut store = TempDatabase::from_iter(vec![ - //("NY", &[doc_index(0, 0)][..]), - //("city", &[doc_index(0, 1)][..]), - //("subway", &[doc_index(0, 2)][..]), - //("NY", &[doc_index(1, 0)][..]), - //("subway", &[doc_index(1, 1)][..]), - //("NY", &[doc_index(2, 0)][..]), - //("york", &[doc_index(2, 1)][..]), - //("city", &[doc_index(2, 2)][..]), - //("subway", &[doc_index(2, 3)][..]), - //]); - - //store.add_synonym("NY", SetBuf::from_dirty(vec!["new york city story"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("NY subway "), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. 
}) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn cumulative_word_indices() { - //let mut store = TempDatabase::from_iter(vec![ - //("NYC", &[doc_index(0, 0)][..]), - //("long", &[doc_index(0, 1)][..]), - //("subway", &[doc_index(0, 2)][..]), - //("cool", &[doc_index(0, 3)][..]), - //]); - - //store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"])); - //store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder - //.query(&reader, Some("new york city long subway cool "), 0..20) - //.unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut matches = matches.into_iter(); - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train - //assert_matches!(matches.next(), Some(SimpleMatch { query_index: 6, word_index: 6, is_exact: true, .. 
})); // cool - //assert_matches!(matches.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn deunicoded_synonyms() { - //let mut store = TempDatabase::from_iter(vec![ - //("telephone", &[doc_index(0, 0)][..]), // meilisearch indexes the unidecoded - //("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex - //("iphone", &[doc_index(1, 0)][..]), - //]); - - //store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"])); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("téléphone"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone | telephone - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn simple_concatenation() { - //let store = TempDatabase::from_iter(vec![ - //("iphone", &[doc_index(0, 0)][..]), - //("case", &[doc_index(0, 1)][..]), - //]); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("i phone case"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone - //// assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); "phone" - //// but no typo on first letter ^^^^^^^ - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn exact_field_count_one_word() { - //let store = TempDatabase::from_iter(vec![ - //("searchengine", &[doc_index(0, 0)][..]), - //("searchengine", &[doc_index(1, 0)][..]), - //("blue", &[doc_index(1, 1)][..]), - //("searchangine", &[doc_index(2, 0)][..]), - //("searchengine", &[doc_index(3, 0)][..]), - //]); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(3), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 1, .. })); // searchengine - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn simple_phrase_query_splitting() { - //let store = TempDatabase::from_iter(vec![ - //("search", &[doc_index(0, 0)][..]), - //("engine", &[doc_index(0, 1)][..]), - //("search", &[doc_index(1, 0)][..]), - //("slow", &[doc_index(1, 1)][..]), - //("engine", &[doc_index(1, 2)][..]), - //]); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. 
})); // engine - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} - - //#[test] - //fn harder_phrase_query_splitting() { - //let store = TempDatabase::from_iter(vec![ - //("search", &[doc_index(0, 0)][..]), - //("search", &[doc_index(0, 1)][..]), - //("engine", &[doc_index(0, 2)][..]), - //("search", &[doc_index(1, 0)][..]), - //("slow", &[doc_index(1, 1)][..]), - //("search", &[doc_index(1, 2)][..]), - //("engine", &[doc_index(1, 3)][..]), - //("search", &[doc_index(1, 0)][..]), - //("search", &[doc_index(1, 1)][..]), - //("slow", &[doc_index(1, 2)][..]), - //("engine", &[doc_index(1, 3)][..]), - //]); - - //let db = &store.database; - //let reader = db.main_read_txn().unwrap(); - - //let builder = store.query_builder(); - //let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); - //let mut iter = documents.into_iter(); - - //assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - //let mut iter = matches.into_iter(); - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search - //assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine - //assert_matches!(iter.next(), None); - //}); - //assert_matches!(iter.next(), None); - //} -//} +#[cfg(test)] +mod tests { + use super::*; + + use std::collections::{BTreeSet, HashMap}; + use std::iter::FromIterator; + + use fst::IntoStreamer; + use meilisearch_schema::IndexedPos; + use sdset::SetBuf; + use tempfile::TempDir; + + use crate::bucket_sort::SimpleMatch; + use crate::database::{Database, DatabaseOptions}; + use crate::store::Index; + use crate::DocIndex; + use crate::Document; + use meilisearch_schema::Schema; + + fn is_cjk(c: char) -> bool { + (c >= '\u{1100}' && c <= '\u{11ff}') // Hangul Jamo + || (c >= '\u{2e80}' && c <= '\u{2eff}') // CJK Radicals Supplement + || (c >= '\u{2f00}' && c <= '\u{2fdf}') // Kangxi radical + || (c >= '\u{3000}' && c <= '\u{303f}') // Japanese-style punctuation + || (c >= '\u{3040}' && c <= '\u{309f}') // Japanese Hiragana + || (c >= '\u{30a0}' && c <= '\u{30ff}') // Japanese Katakana + || (c >= '\u{3100}' && c <= '\u{312f}') + || (c >= '\u{3130}' && c <= '\u{318F}') // Hangul Compatibility Jamo + || (c >= '\u{3200}' && c <= '\u{32ff}') // Enclosed CJK Letters and Months + || (c >= '\u{3400}' && c <= '\u{4dbf}') // CJK Unified Ideographs Extension A + || (c >= '\u{4e00}' && c <= '\u{9fff}') // CJK Unified Ideographs + || (c >= '\u{a960}' && c <= '\u{a97f}') // Hangul Jamo Extended-A + || (c >= '\u{ac00}' && c <= '\u{d7a3}') // Hangul Syllables + || (c >= '\u{d7b0}' && c <= '\u{d7ff}') // Hangul Jamo Extended-B + || (c >= '\u{f900}' && c <= '\u{faff}') // CJK Compatibility Ideographs + || (c >= '\u{ff00}' && c <= '\u{ffef}') // Full-width roman characters and half-width katakana + } + + fn normalize_str(string: &str) -> String { + let mut string = string.to_lowercase(); + + if !string.contains(is_cjk) { + string = deunicode::deunicode_with_tofu(&string, ""); + } + + string + } + + fn 
set_from_stream<'f, I, S>(stream: I) -> fst::Set<Vec<u8>> + where + I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>, + { + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(stream).unwrap(); + builder.into_set() + } + + fn insert_key<A: AsRef<[u8]>>(set: &fst::Set<A>, key: &[u8]) -> fst::Set<Vec<u8>> { + let unique_key = { + let mut builder = fst::SetBuilder::memory(); + builder.insert(key).unwrap(); + builder.into_set() + }; + + let union_ = set.op().add(unique_key.into_stream()).r#union(); + + set_from_stream(union_) + } + + fn sdset_into_fstset(set: &sdset::Set<&str>) -> fst::Set<Vec<u8>> { + let mut builder = fst::SetBuilder::memory(); + let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect()); + builder.extend_iter(set.into_iter()).unwrap(); + builder.into_set() + } + + const fn doc_index(document_id: u32, word_index: u16) -> DocIndex { + DocIndex { + document_id: DocumentId(document_id), + attribute: 0, + word_index, + char_index: 0, + char_length: 0, + } + } + + const fn doc_char_index(document_id: u32, word_index: u16, char_index: u16) -> DocIndex { + DocIndex { + document_id: DocumentId(document_id), + attribute: 0, + word_index, + char_index, + char_length: 0, + } + } + + pub struct TempDatabase { + database: Database, + index: Index, + _tempdir: TempDir, + } + + impl TempDatabase { + pub fn query_builder(&self) -> QueryBuilder { + self.index.query_builder() + } + + pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) { + let db = &self.database; + let mut writer = db.main_write_txn().unwrap(); + + let word = normalize_str(word); + println!("synonym: {}", word); + + let alternatives = self + .index + .synonyms + .synonyms_fst(&writer, word.as_bytes()) + .unwrap(); + + let new = sdset_into_fstset(&new); + let new_alternatives = + set_from_stream(alternatives.op().add(new.into_stream()).r#union()); + self.index + .synonyms + .put_synonyms(&mut writer, word.as_bytes(), &new_alternatives) + .unwrap(); + + let synonyms = self.index.main.synonyms_fst(&writer).unwrap(); + + let synonyms_fst = insert_key(&synonyms, word.as_bytes()); + self.index + .main + .put_synonyms_fst(&mut writer, &synonyms_fst) + .unwrap(); + + writer.commit().unwrap(); + } + } + + impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase { + fn from_iter<I: IntoIterator<Item = (&'a str, &'a [DocIndex])>>(iter: I) -> Self { + let tempdir = TempDir::new().unwrap(); + let database = Database::open_or_create(&tempdir, DatabaseOptions::default()).unwrap(); + let index = database.create_index("default").unwrap(); + + let db = &database; + let mut writer = db.main_write_txn().unwrap(); + + let mut words_fst = BTreeSet::new(); + let mut postings_lists = HashMap::new(); + let mut fields_counts = HashMap::<_, u16>::new(); + + let mut schema = Schema::with_primary_key("id"); + + for (word, indexes) in iter { + let mut final_indexes = Vec::new(); + for index in indexes { + let name = index.attribute.to_string(); + schema.insert(&name).unwrap(); + let indexed_pos = schema.set_indexed(&name).unwrap().1; + let index = DocIndex { + attribute: indexed_pos.0, + ..*index + }; + final_indexes.push(index); + } + + let word = word.to_lowercase().into_bytes(); + words_fst.insert(word.clone()); + postings_lists + .entry(word) + .or_insert_with(Vec::new) + .extend_from_slice(&final_indexes); + for idx in final_indexes { + fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1); + } + } + + index.main.put_schema(&mut writer, &schema).unwrap(); + + let words_fst = 
fst::Set::from_iter(words_fst).unwrap(); + + index.main.put_words_fst(&mut writer, &words_fst).unwrap(); + + for (word, postings_list) in postings_lists { + let postings_list = SetBuf::from_dirty(postings_list); + index + .postings_lists + .put_postings_list(&mut writer, &word, &postings_list) + .unwrap(); + } + + for ((docid, attr, _), count) in fields_counts { + let prev = index + .documents_fields_counts + .document_field_count(&writer, docid, IndexedPos(attr)) + .unwrap(); + + let prev = prev.unwrap_or(0); + + index + .documents_fields_counts + .put_document_field_count(&mut writer, docid, IndexedPos(attr), prev + count) + .unwrap(); + } + + writer.commit().unwrap(); + + TempDatabase { database, index, _tempdir: tempdir } + } + } + + #[test] + fn simple() { + let store = TempDatabase::from_iter(vec![ + ("iphone", &[doc_char_index(0, 0, 0)][..]), + ("from", &[doc_char_index(0, 1, 1)][..]), + ("apple", &[doc_char_index(0, 2, 2)][..]), + ]); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("iphone from apple"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn simple_synonyms() { + let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); + + store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("hello"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("bonjour"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + + // #[test] + // fn prefix_synonyms() { + // let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); + + // store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); + // store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"])); + + // let db = &store.database; + // let reader = db.main_read_txn().unwrap(); + + // let builder = store.query_builder(); + // let results = builder.query(&reader, "sal", 0..20).unwrap(); + // let mut iter = documents.into_iter(); + + // assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { + // let mut matches = matches.into_iter(); + // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + // assert_matches!(matches.next(), None); + // }); + // assert_matches!(iter.next(), None); + + // let builder = store.query_builder(); + // let results = builder.query(&reader, "bonj", 0..20).unwrap(); + // let mut iter = documents.into_iter(); + + // assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + // let mut matches = matches.into_iter(); + // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + // assert_matches!(matches.next(), None); + // }); + // assert_matches!(iter.next(), None); + + // let builder = store.query_builder(); + // let results = builder.query(&reader, "sal blabla", 0..20).unwrap(); + // let mut iter = documents.into_iter(); + + // assert_matches!(iter.next(), None); + + // let builder = store.query_builder(); + // let results = builder.query(&reader, "bonj blabla", 0..20).unwrap(); + // let mut iter = documents.into_iter(); + + // assert_matches!(iter.next(), None); + // } + + // #[test] + // fn levenshtein_synonyms() { + // let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); + + // store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"])); + + // let db = &store.database; + // let reader = db.main_read_txn().unwrap(); + + // let builder = store.query_builder(); + // let results = builder.query(&reader, "salutution", 0..20).unwrap(); + // let mut iter = documents.into_iter(); + + // assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + // let mut matches = matches.into_iter(); + // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + // assert_matches!(matches.next(), None); + // }); + // assert_matches!(iter.next(), None); + + // let builder = store.query_builder(); + // let results = builder.query(&reader, "saluttion", 0..20).unwrap(); + // let mut iter = documents.into_iter(); + + // assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + // let mut matches = matches.into_iter(); + // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + // assert_matches!(matches.next(), None); + // }); + // assert_matches!(iter.next(), None); + // } + + #[test] + fn harder_synonyms() { + let mut store = TempDatabase::from_iter(vec![ + ("hello", &[doc_index(0, 0)][..]), + ("bonjour", &[doc_index(1, 3)]), + ("salut", &[doc_index(2, 5)]), + ]); + + store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"])); + store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello", "salut"])); + store.add_synonym("salut", SetBuf::from_dirty(vec!["hello", "bonjour"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("hello"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. 
})); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("bonjour"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("salut"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + /// Unique word has multi-word synonyms + fn unique_to_multiword_synonyms() { + let mut store = TempDatabase::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("subway", &[doc_char_index(0, 3, 3)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("subway", &[doc_char_index(1, 1, 1)][..]), + ]); + + store.add_synonym( + "NY", + SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + ); + store.add_synonym( + "NYC", + SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + ); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. 
})); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NY ± new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NY ± york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NY ± city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NYC ± new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NYC ± york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NYC ± city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn unique_to_multiword_synonyms_words_proximity() { + let mut store = TempDatabase::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("subway", &[doc_char_index(0, 3, 3)][..]), + ("york", &[doc_char_index(1, 0, 0)][..]), + ("new", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ("NY", &[doc_char_index(2, 0, 0)][..]), + ("subway", &[doc_char_index(2, 1, 1)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("NY"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. 
}) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // NY ± york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // NY ± new + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // new = NY + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new = NY + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("new york"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // york + assert_matches!(matches.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn unique_to_multiword_synonyms_cumulative_word_index() { + let mut store = TempDatabase::from_iter(vec![ + ("NY", &[doc_char_index(0, 0, 0)][..]), + ("subway", &[doc_char_index(0, 1, 1)][..]), + ("new", &[doc_char_index(1, 0, 0)][..]), + ("york", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ]); + + store.add_synonym("new york", SetBuf::from_dirty(vec!["NY"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + // assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + // let mut matches = matches.into_iter(); + // assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. 
})); // subway + // assert_matches!(matches.next(), None); + // }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = + builder.query(&reader, Some("new york subway"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + /// Unique word has multi-word synonyms + fn harder_unique_to_multiword_synonyms_one() { + let mut store = TempDatabase::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("yellow", &[doc_char_index(0, 3, 3)][..]), + ("subway", &[doc_char_index(0, 4, 4)][..]), + ("broken", &[doc_char_index(0, 5, 5)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ]); + + store.add_synonym( + "NY", + SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + ); + store.add_synonym( + "NYC", + SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + ); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("NY subway"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. 
})); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC + // because one-word to one-word ^^^^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), None); + } + + #[test] + /// Unique word has multi-word synonyms + fn even_harder_unique_to_multiword_synonyms() { + let mut store = TempDatabase::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("yellow", &[doc_char_index(0, 3, 3)][..]), + ("underground", &[doc_char_index(0, 4, 4)][..]), + ("train", &[doc_char_index(0, 5, 5)][..]), + ("broken", &[doc_char_index(0, 6, 6)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ]); + + store.add_synonym( + "NY", + SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + ); + store.add_synonym( + "NYC", + SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + ); + store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult {documents, .. } = builder.query(&reader, Some("NY subway broken"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. 
})); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("NYC subway"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + // because one-word to one-word ^^^^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // subway = train + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + /// Multi-word has multi-word synonyms + fn multiword_to_multiword_synonyms() { + let mut store = TempDatabase::from_iter(vec![ + ("NY", &[doc_char_index(0, 0, 0)][..]), + ("subway", &[doc_char_index(0, 1, 1)][..]), + ("NYC", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ("broken", &[doc_char_index(1, 3, 3)][..]), + ("new", &[doc_char_index(2, 0, 0)][..]), + ("york", &[doc_char_index(2, 1, 1)][..]), + ("underground", &[doc_char_index(2, 2, 2)][..]), + ("train", &[doc_char_index(2, 3, 3)][..]), + ("broken", &[doc_char_index(2, 4, 4)][..]), + ]); + + store.add_synonym( + "new york", + SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]), + ); + store.add_synonym( + "new york city", + SetBuf::from_dirty(vec!["NYC", "NY", "new york"]), + ); + store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder + .query(&reader, Some("new york underground train broken"), 0..20) + .unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. 
}) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder + .query(&reader, Some("new york city underground train broken"), 0..20) + .unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 2, is_exact: true, .. })); // underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 4, is_exact: true, .. })); // broken + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. 
})); // NYC = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn intercrossed_multiword_synonyms() { + let mut store = TempDatabase::from_iter(vec![ + ("new", &[doc_index(0, 0)][..]), + ("york", &[doc_index(0, 1)][..]), + ("big", &[doc_index(0, 2)][..]), + ("city", &[doc_index(0, 3)][..]), + ]); + + store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"])); + store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("new york big "), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + + let mut store = TempDatabase::from_iter(vec![ + ("NY", &[doc_index(0, 0)][..]), + ("city", &[doc_index(0, 1)][..]), + ("subway", &[doc_index(0, 2)][..]), + ("NY", &[doc_index(1, 0)][..]), + ("subway", &[doc_index(1, 1)][..]), + ("NY", &[doc_index(2, 0)][..]), + ("york", &[doc_index(2, 1)][..]), + ("city", &[doc_index(2, 2)][..]), + ("subway", &[doc_index(2, 3)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["new york city story"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("NY subway "), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn cumulative_word_indices() { + let mut store = TempDatabase::from_iter(vec![ + ("NYC", &[doc_index(0, 0)][..]), + ("long", &[doc_index(0, 1)][..]), + ("subway", &[doc_index(0, 2)][..]), + ("cool", &[doc_index(0, 3)][..]), + ]); + + store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"])); + store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder + .query(&reader, Some("new york city long subway cool "), 0..20) + .unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 6, word_index: 6, is_exact: true, .. 
})); // cool + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn deunicoded_synonyms() { + let mut store = TempDatabase::from_iter(vec![ + ("telephone", &[doc_index(0, 0)][..]), // meilisearch indexes the unidecoded + ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex + ("iphone", &[doc_index(1, 0)][..]), + ]); + + store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"])); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap(); + println!("documents: {:#?}", documents); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("téléphone"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + // this test was in the opposite order, I am not sure why... + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn simple_concatenation() { + let store = TempDatabase::from_iter(vec![ + ("iphone", &[doc_index(0, 0)][..]), + ("case", &[doc_index(0, 1)][..]), + ]); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("i phone case"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone + // assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); "phone" + // but no typo on first letter ^^^^^^^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn exact_field_count_one_word() { + let store = TempDatabase::from_iter(vec![ + ("searchengine", &[doc_index(0, 0)][..]), + ("searchengine", &[doc_index(1, 0)][..]), + ("blue", &[doc_index(1, 1)][..]), + ("searchangine", &[doc_index(2, 0)][..]), + ("searchengine", &[doc_index(3, 0)][..]), + ]); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(3), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 1, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn simple_phrase_query_splitting() { + let store = TempDatabase::from_iter(vec![ + ("search", &[doc_index(0, 0)][..]), + ("engine", &[doc_index(0, 1)][..]), + ("search", &[doc_index(1, 0)][..]), + ("slow", &[doc_index(1, 1)][..]), + ("engine", &[doc_index(1, 2)][..]), + ]); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. 
})); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn harder_phrase_query_splitting() { + let store = TempDatabase::from_iter(vec![ + ("search", &[doc_index(0, 0)][..]), + ("search", &[doc_index(0, 1)][..]), + ("engine", &[doc_index(0, 2)][..]), + ("search", &[doc_index(1, 0)][..]), + ("slow", &[doc_index(1, 1)][..]), + ("search", &[doc_index(1, 2)][..]), + ("engine", &[doc_index(1, 3)][..]), + ("search", &[doc_index(1, 0)][..]), + ("search", &[doc_index(1, 1)][..]), + ("slow", &[doc_index(1, 2)][..]), + ("engine", &[doc_index(1, 3)][..]), + ]); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let SortResult { documents, .. } = builder.query(&reader, Some("searchengine"), 0..20).unwrap(); + let mut iter = documents.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } +} diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 8ed709324..6e9afc677 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -147,17 +147,17 @@ fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator< match token.kind { TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { *offset += match *sepkind { - Some(SeparatorKind::Hard) => 8, - Some(SeparatorKind::Soft) => 1, + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, None => 0, }; - *sepkind = None; + *sepkind = Some(token.kind) } TokenKind::Separator(SeparatorKind::Hard) => { - *sepkind = Some(SeparatorKind::Hard); + *sepkind = Some(token.kind); } TokenKind::Separator(SeparatorKind::Soft) if sepkind.is_none() => { - *sepkind = Some(SeparatorKind::Soft); + *sepkind = Some(token.kind); } _ => (), } From 8b149c9aa3139ae35c8a98818140ba832e28c442 Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 3 Dec 2020 12:34:08 +0100 Subject: [PATCH 11/22] update tokenizer dep to release --- Cargo.lock | 16 ++++++++++++++++ meilisearch-core/Cargo.toml | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index f48920953..ced0c6de5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1696,6 +1696,22 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "meilisearch-tokenizer" +version = "0.1.0" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.0#32b2aee7e4a819478226eab5525232f09dd61bf5" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + [[package]] name = "meilisearch-types" version = "0.17.0" diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index ecfecfdc1..3f074587f 100644 --- 
a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -26,7 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } log = "0.4.11" meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" } -meilisearch-tokenizer = { path = "../../Tokenizer" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.0" } meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" } once_cell = "1.5.2" ordered-float = { version = "2.0.1", features = ["serde"] } From 8e64a24d1923a97e20b85a05913edfc22c3322f7 Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 3 Dec 2020 12:34:22 +0100 Subject: [PATCH 12/22] fix suggestions --- meilisearch-core/src/query_builder.rs | 3 -- meilisearch-core/src/raw_indexer.rs | 33 +++++++++----------- meilisearch-http/tests/placeholder_search.rs | 2 -- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 201610d44..b8cbe0ac6 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -395,7 +395,6 @@ mod tests { let mut writer = db.main_write_txn().unwrap(); let word = normalize_str(word); - println!("synonym: {}", word); let alternatives = self .index @@ -1261,7 +1260,6 @@ mod tests { let builder = store.query_builder(); let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap(); - println!("documents: {:#?}", documents); let mut iter = documents.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -1297,7 +1295,6 @@ mod tests { let builder = store.query_builder(); let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap(); let mut iter = documents.into_iter(); - // this test was in the opposite order, I am not sure why... assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. 
})); diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 6e9afc677..3a7519c90 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -14,10 +14,7 @@ const WORD_LENGTH_LIMIT: usize = 80; type Word = Vec; // TODO make it be a SmallVec -pub struct RawIndexer<'a, A> -where - A: AsRef<[u8]> -{ +pub struct RawIndexer<'a, A> { word_limit: usize, // the maximum number of indexed words words_doc_indexes: BTreeMap>, docs_words: HashMap>, @@ -73,25 +70,24 @@ where number_of_words } - pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I) + pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, text_iter: I) where I: IntoIterator, { let mut byte_offset = 0; let mut word_offset = 0; - for s in iter.into_iter() { + for text in text_iter.into_iter() { let current_byte_offset = byte_offset; let current_word_offset = word_offset; - let analyzed_text = self.analyzer.analyze(s); + let analyzed_text = self.analyzer.analyze(text); let tokens = process_tokens(analyzed_text.tokens()) .map(|(i, mut t)| { t.byte_start = t.byte_start + current_byte_offset; t.byte_end = t.byte_end + current_byte_offset; - (i, t) + (i + current_word_offset, t) }) - .map(|(i, t)| (i + current_word_offset, t)) .enumerate(); for (token_pos, (word_pos, token)) in tokens { @@ -143,21 +139,22 @@ where fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator)> { tokens - .scan((0, None), |(offset, sepkind), token| { + .scan((0, None), |(offset, prev_kind), token| { match token.kind { TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { - *offset += match *sepkind { + *offset += match *prev_kind { Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, Some(_) => 1, None => 0, }; - *sepkind = Some(token.kind) + *prev_kind = Some(token.kind) } TokenKind::Separator(SeparatorKind::Hard) => { - *sepkind = Some(token.kind); + *prev_kind = Some(token.kind); } - TokenKind::Separator(SeparatorKind::Soft) if sepkind.is_none() => { - *sepkind = Some(token.kind); + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => { + *prev_kind = Some(token.kind); } _ => (), } @@ -226,12 +223,12 @@ mod tests { #[test] fn test_process_token() { - let text = " Zut, l’aspirateur, j’ai oublié de l’éteindre !"; + let text = " 為一包含一千多萬目詞的帶標記平衡語料庫"; let stopwords = Set::default(); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords)); let analyzer = analyzer.analyze(text); - let tokens: Vec<_> = process_tokens(analyzer.tokens()).collect(); - println!("tokens: {:?}", tokens); + let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect(); + assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目", "词", "的", "带", "标记", "平衡", "语料库"]); } #[test] diff --git a/meilisearch-http/tests/placeholder_search.rs b/meilisearch-http/tests/placeholder_search.rs index fb1286248..048ab7f8b 100644 --- a/meilisearch-http/tests/placeholder_search.rs +++ b/meilisearch-http/tests/placeholder_search.rs @@ -102,8 +102,6 @@ async fn placeholder_search_witch_crop() { "cropLength": 20 }); - println!("here"); - test_post_get_search!(server, query, |response, status_code| { assert_eq!(status_code, 200); From 398577f116bbadc682ecc3ab3960c70a021c5bbf Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 8 Dec 2020 12:38:45 +0100 Subject: [PATCH 13/22] bump tokenizer --- Cargo.lock | 4 ++-- meilisearch-core/Cargo.toml | 2 +- 2 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ced0c6de5..a5f65b5fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1698,8 +1698,8 @@ dependencies = [ [[package]] name = "meilisearch-tokenizer" -version = "0.1.0" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.0#32b2aee7e4a819478226eab5525232f09dd61bf5" +version = "0.1.1" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.1#dedea5df4b52d94216a65091f237ac64673bab09" dependencies = [ "character_converter", "cow-utils", diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 3f074587f..8fbcba67b 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -26,7 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } log = "0.4.11" meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.0" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.1" } meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" } once_cell = "1.5.2" ordered-float = { version = "2.0.1", features = ["serde"] } From 808be4678ab9f1485971aeba079a8b30b0da4faa Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 8 Dec 2020 16:03:05 +0100 Subject: [PATCH 14/22] fix style --- meilisearch-core/src/query_builder.rs | 4 +++- meilisearch-core/src/query_tree.rs | 6 ++---- meilisearch-core/src/raw_indexer.rs | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index b8cbe0ac6..64203a6c7 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -185,7 +185,9 @@ impl<'c, 'f, 'd, 'i> QueryBuilder<'c, 'f, 'd, 'i> { None => { match self.index.main.sorted_document_ids_cache(reader)? { // build result from cached document ids - Some(docids) => { let mut sort_result = self.sort_result_from_docids(&docids, range); + Some(docids) => { + let mut sort_result = self.sort_result_from_docids(&docids, range); + if let Some(f) = self.facet_count_docids(reader)? { sort_result.exhaustive_facets_count = Some(true); // document ids are not sorted in natural order, we need to construct a new set diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index e1485566e..c2f43818f 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -8,7 +8,6 @@ use std::{cmp, fmt, iter::once}; use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; use log::debug; -use meilisearch_tokenizer::Token; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use sdset::{Set, SetBuf, SetOperation}; @@ -177,12 +176,11 @@ const MAX_NGRAM: usize = 3; fn split_query_string<'a, A: AsRef<[u8]>>(s: &str, stop_words: &'a fst::Set) -> Vec<(usize, String)> { // TODO: Use global instance instead - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); - analyzer + Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)) .analyze(s) .tokens() .filter(|t| t.is_word()) - .map(| Token { word, .. 
}| word.to_string()) + .map(|t| t.word.to_string()) .enumerate() .collect() } diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 3a7519c90..f54210ed6 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -43,7 +43,6 @@ where } } - pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize { let mut number_of_words = 0; From 748a8240dde06c92088586d4f6d0301e65694673 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 14 Dec 2020 18:23:13 +0100 Subject: [PATCH 15/22] fix highlight shifting bug --- meilisearch-core/src/raw_indexer.rs | 4 ++-- meilisearch-http/src/helpers/meilisearch.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index f54210ed6..c78d1f4ff 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -199,8 +199,8 @@ fn index_token( fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, word_index: usize) -> Option { let word_index = u16::try_from(word_index).ok()?; - let char_index = u16::try_from(token.char_index).ok()?; - let char_length = u16::try_from(token.word.chars().count()).ok()?; + let char_index = u16::try_from(token.byte_start).ok()?; + let char_length = u16::try_from(token.word.len()).ok()?; let docindex = DocIndex { document_id: id, diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index dd5e2c79f..2db6472ca 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ b/meilisearch-http/src/helpers/meilisearch.rs @@ -479,7 +479,7 @@ fn calculate_highlights( for (attribute, matches) in matches.iter() { if attributes_to_highlight.contains(attribute) { if let Some(Value::String(value)) = document.get(attribute) { - let value: Vec<_> = value.chars().collect(); + let value = value.clone(); let mut highlighted_value = String::new(); let mut index = 0; @@ -492,16 +492,16 @@ fn calculate_highlights( let before = value.get(index..m.start); let highlighted = value.get(m.start..(m.start + m.length)); if let (Some(before), Some(highlighted)) = (before, highlighted) { - highlighted_value.extend(before); + highlighted_value.push_str(before); highlighted_value.push_str(""); - highlighted_value.extend(highlighted); + highlighted_value.push_str(highlighted); highlighted_value.push_str(""); index = m.start + m.length; } else { error!("value: {:?}; index: {:?}, match: {:?}", value, index, m); } } - highlighted_value.extend(value[index..].iter()); + highlighted_value.push_str(&value[index..]); highlight_result.insert(attribute.to_string(), Value::String(highlighted_value)); }; } From 0447594e027cdd559f98849bb8480f28501617ac Mon Sep 17 00:00:00 2001 From: many Date: Thu, 17 Dec 2020 15:13:58 +0100 Subject: [PATCH 16/22] add search test on chinese scripts --- meilisearch-http/src/helpers/meilisearch.rs | 3 +- meilisearch-http/tests/assets/test_set.json | 14 ++--- meilisearch-http/tests/search.rs | 60 +++++++++++++++++++++ 3 files changed, 69 insertions(+), 8 deletions(-) diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index 2db6472ca..57186a2d8 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ b/meilisearch-http/src/helpers/meilisearch.rs @@ -597,7 +597,7 @@ mod tests { let mut m = Vec::new(); m.push(MatchPosition { - start: 510, + start: 529, length: 9, }); matches.insert("description".to_string(), m); 
@@ -613,6 +613,7 @@ mod tests { assert_eq!(result, result_expected); } + #[test] fn highlight_longest_match() { let data = r#"{ diff --git a/meilisearch-http/tests/assets/test_set.json b/meilisearch-http/tests/assets/test_set.json index 63534c896..b035ac334 100644 --- a/meilisearch-http/tests/assets/test_set.json +++ b/meilisearch-http/tests/assets/test_set.json @@ -1590,18 +1590,18 @@ "tags": [] }, { - "id": 76, + "id": 77, "isActive": false, "balance": "$1,274.29", "picture": "http://placehold.it/32x32", "age": 25, - "color": "Green", - "name": "Clarice Gardner", - "gender": "female", - "email": "claricegardner@chorizon.com", + "color": "Red", + "name": "孙武", + "gender": "male", + "email": "SunTzu@chorizon.com", "phone": "+1 (810) 407-3258", - "address": "894 Brooklyn Road, Utting, New Hampshire, 6404", - "about": "Elit occaecat aute ea adipisicing mollit cupidatat aliquip excepteur veniam minim. Sunt quis dolore in commodo aute esse quis. Lorem in cillum commodo eu anim commodo mollit. Adipisicing enim sunt adipisicing cupidatat adipisicing eiusmod eu do sit nisi.\r\n", + "address": "吴国", + "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安孙氏始祖,次子孙明为富春孙氏始祖。\r\n", "registered": "2014-10-20T10:13:32 -02:00", "latitude": 17.11935, "longitude": 65.38197, diff --git a/meilisearch-http/tests/search.rs b/meilisearch-http/tests/search.rs index 6a496809e..82804a019 100644 --- a/meilisearch-http/tests/search.rs +++ b/meilisearch-http/tests/search.rs @@ -358,6 +358,66 @@ async fn search_with_attribute_to_highlight_wildcard() { }); } +#[actix_rt::test] +async fn search_with_attribute_to_highlight_wildcard_chinese() { + let mut server = common::Server::test_server().await; + + let query = json!({ + "q": "子孙", + "limit": 1, + "attributesToHighlight": ["*"] + }); + + let expected = json!([ + { + "id": 77, + "isActive": false, + "balance": "$1,274.29", + "picture": "http://placehold.it/32x32", + "age": 25, + "color": "Red", + "name": "孙武", + "gender": "male", + "email": "SunTzu@chorizon.com", + "phone": "+1 (810) 407-3258", + "address": "吴国", + "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安孙氏始祖,次子孙明为富春孙氏始祖。\r\n", + "registered": "2014-10-20T10:13:32 -02:00", + "latitude": 17.11935, + "longitude": 65.38197, + "tags": [ + "new issue", + "wontfix" + ], + "_formatted": { + "id": 77, + "isActive": false, + "balance": "$1,274.29", + "picture": "http://placehold.it/32x32", + "age": 25, + "color": "Red", + "name": "孙武", + "gender": "male", + "email": "SunTzu@chorizon.com", + "phone": "+1 (810) 407-3258", + "address": "吴国", + "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安氏始祖,次子孙明为富春氏始祖。\r\n", + "registered": "2014-10-20T10:13:32 -02:00", + "latitude": 17.11935, + "longitude": 65.38197, + "tags": [ + "new issue", + "wontfix" + ] + } + } + ]); + + test_post_get_search!(server, query, |response, _status_code| { + assert_json_eq!(expected.clone(), response["hits"].clone(), ordered: false); + }); +} + #[actix_rt::test] async fn search_with_attribute_to_highlight_1() { let mut server = common::Server::test_server().await; From 2852349e68b8b03a8567ba2cf2547f3626c03924 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 17 Dec 2020 16:31:31 +0100 Subject: [PATCH 17/22] update tokenizer version --- Cargo.lock | 2 +- meilisearch-core/Cargo.toml | 2 +- meilisearch-core/src/raw_indexer.rs | 4 ++-- 
meilisearch-http/tests/search.rs | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a5f65b5fb..50bb4e9d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1699,7 +1699,7 @@ dependencies = [ [[package]] name = "meilisearch-tokenizer" version = "0.1.1" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.1#dedea5df4b52d94216a65091f237ac64673bab09" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.2#8d91cd52f30aa4b651a085c15056938f7b599646" dependencies = [ "character_converter", "cow-utils", diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 8fbcba67b..dbf706e2c 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -26,7 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } log = "0.4.11" meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.1" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.2" } meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" } once_cell = "1.5.2" ordered-float = { version = "2.0.1", features = ["serde"] } diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index c78d1f4ff..d83c02a28 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -140,7 +140,7 @@ fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator< tokens .scan((0, None), |(offset, prev_kind), token| { match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { *offset += match *prev_kind { Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, Some(_) => 1, @@ -227,7 +227,7 @@ mod tests { let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords)); let analyzer = analyzer.analyze(text); let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect(); - assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目", "词", "的", "带", "标记", "平衡", "语料库"]); + assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目词", "的", "带", "标记", "平衡", "语料库"]); } #[test] diff --git a/meilisearch-http/tests/search.rs b/meilisearch-http/tests/search.rs index 82804a019..cd1fae4bd 100644 --- a/meilisearch-http/tests/search.rs +++ b/meilisearch-http/tests/search.rs @@ -401,7 +401,7 @@ async fn search_with_attribute_to_highlight_wildcard_chinese() { "email": "SunTzu@chorizon.com", "phone": "+1 (810) 407-3258", "address": "吴国", - "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安氏始祖,次子孙明为富春氏始祖。\r\n", + "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安孙氏始祖,次子孙明为富春孙氏始祖。\r\n", "registered": "2014-10-20T10:13:32 -02:00", "latitude": 17.11935, "longitude": 65.38197, From aeb676e757af7412567ee25b7533f7c880f57747 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 17 Dec 2020 17:18:11 +0100 Subject: [PATCH 18/22] skip indexation while token is not a word --- meilisearch-core/src/raw_indexer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index d83c02a28..a61a6f89c 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ 
b/meilisearch-core/src/raw_indexer.rs @@ -138,6 +138,7 @@ where fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> { tokens + .skip_while(|token| !token.is_word()) .scan((0, None), |(offset, prev_kind), token| { match token.kind { TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { From 2a145e288c608f59348beafe943a3f9aed5775a7 Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 8 Dec 2020 16:03:05 +0100 Subject: [PATCH 19/22] fix style --- meilisearch-core/src/raw_indexer.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index a61a6f89c..16fad6604 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -83,8 +83,8 @@ where let analyzed_text = self.analyzer.analyze(text); let tokens = process_tokens(analyzed_text.tokens()) .map(|(i, mut t)| { - t.byte_start = t.byte_start + current_byte_offset; - t.byte_end = t.byte_end + current_byte_offset; + t.byte_start += current_byte_offset; + t.byte_end += current_byte_offset; (i + current_word_offset, t) }) .enumerate(); @@ -163,6 +163,7 @@ fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator< .filter(|(_, t)| t.is_word()) } +#[allow(clippy::too_many_arguments)] fn index_token( token: Token, word_pos: usize, From c290719984e461f8a9354b797d5d45e90ec88ef9 Mon Sep 17 00:00:00 2001 From: mpostma Date: Mon, 4 Jan 2021 14:54:19 +0100 Subject: [PATCH 20/22] remove byte offset in index_seq --- meilisearch-core/src/raw_indexer.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index 16fad6604..3aded1ca5 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -73,25 +73,18 @@ where where I: IntoIterator, { - let mut byte_offset = 0; let mut word_offset = 0; for text in text_iter.into_iter() { - let current_byte_offset = byte_offset; let current_word_offset = word_offset; let analyzed_text = self.analyzer.analyze(text); let tokens = process_tokens(analyzed_text.tokens()) - .map(|(i, mut t)| { - t.byte_start += current_byte_offset; - t.byte_end += current_byte_offset; - (i + current_word_offset, t) - }) + .map(|(i, t)| (i + current_word_offset, t)) .enumerate(); for (token_pos, (word_pos, token)) in tokens { word_offset = word_pos + 1; - byte_offset = token.byte_end + 1; let must_continue = index_token( token, From 07319713006587cef28ab9503efcba64756ca413 Mon Sep 17 00:00:00 2001 From: mpostma Date: Mon, 4 Jan 2021 15:18:32 +0100 Subject: [PATCH 21/22] fix style --- meilisearch-core/src/query_builder.rs | 32 ++++++++++----------- meilisearch-http/src/helpers/meilisearch.rs | 3 +- meilisearch-http/src/helpers/mod.rs | 32 ++++++++++----------- 3 files changed, 33 insertions(+), 34 deletions(-) diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 64203a6c7..bd02e7281 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -304,22 +304,22 @@ mod tests { use meilisearch_schema::Schema; fn is_cjk(c: char) -> bool { - (c >= '\u{1100}' && c <= '\u{11ff}') // Hangul Jamo - || (c >= '\u{2e80}' && c <= '\u{2eff}') // CJK Radicals Supplement - || (c >= '\u{2f00}' && c <= '\u{2fdf}') // Kangxi radical - || (c >= '\u{3000}' && c <= '\u{303f}') // Japanese-style punctuation - || (c >= '\u{3040}' && c <= '\u{309f}') // Japanese Hiragana - || (c >= '\u{30a0}' && c <= '\u{30ff}') // Japanese 
Katakana - || (c >= '\u{3100}' && c <= '\u{312f}') - || (c >= '\u{3130}' && c <= '\u{318F}') // Hangul Compatibility Jamo - || (c >= '\u{3200}' && c <= '\u{32ff}') // Enclosed CJK Letters and Months - || (c >= '\u{3400}' && c <= '\u{4dbf}') // CJK Unified Ideographs Extension A - || (c >= '\u{4e00}' && c <= '\u{9fff}') // CJK Unified Ideographs - || (c >= '\u{a960}' && c <= '\u{a97f}') // Hangul Jamo Extended-A - || (c >= '\u{ac00}' && c <= '\u{d7a3}') // Hangul Syllables - || (c >= '\u{d7b0}' && c <= '\u{d7ff}') // Hangul Jamo Extended-B - || (c >= '\u{f900}' && c <= '\u{faff}') // CJK Compatibility Ideographs - || (c >= '\u{ff00}' && c <= '\u{ffef}') // Full-width roman characters and half-width katakana + ('\u{1100}'..='\u{11ff}').contains(&c) // Hangul Jamo + || ('\u{2e80}'..='\u{2eff}').contains(&c) // CJK Radicals Supplement + || ('\u{2f00}'..='\u{2fdf}').contains(&c) // Kangxi radical + || ('\u{3000}'..='\u{303f}').contains(&c) // Japanese-style punctuation + || ('\u{3040}'..='\u{309f}').contains(&c) // Japanese Hiragana + || ('\u{30a0}'..='\u{30ff}').contains(&c) // Japanese Katakana + || ('\u{3100}'..='\u{312f}').contains(&c) + || ('\u{3130}'..='\u{318F}').contains(&c) // Hangul Compatibility Jamo + || ('\u{3200}'..='\u{32ff}').contains(&c) // Enclosed CJK Letters and Months + || ('\u{3400}'..='\u{4dbf}').contains(&c) // CJK Unified Ideographs Extension A + || ('\u{4e00}'..='\u{9fff}').contains(&c) // CJK Unified Ideographs + || ('\u{a960}'..='\u{a97f}').contains(&c) // Hangul Jamo Extended-A + || ('\u{ac00}'..='\u{d7a3}').contains(&c) // Hangul Syllables + || ('\u{d7b0}'..='\u{d7ff}').contains(&c) // Hangul Jamo Extended-B + || ('\u{f900}'..='\u{faff}').contains(&c) // CJK Compatibility Ideographs + || ('\u{ff00}'..='\u{ffef}').contains(&c) // Full-width roman characters and half-width katakana } fn normalize_str(string: &str) -> String { diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index 57186a2d8..1cf25e315 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ b/meilisearch-http/src/helpers/meilisearch.rs @@ -479,7 +479,7 @@ fn calculate_highlights( for (attribute, matches) in matches.iter() { if attributes_to_highlight.contains(attribute) { if let Some(Value::String(value)) = document.get(attribute) { - let value = value.clone(); + let value = value; let mut highlighted_value = String::new(); let mut index = 0; @@ -613,7 +613,6 @@ mod tests { assert_eq!(result, result_expected); } - #[test] fn highlight_longest_match() { let data = r#"{ diff --git a/meilisearch-http/src/helpers/mod.rs b/meilisearch-http/src/helpers/mod.rs index 9ba62a3a7..9a78e6b71 100644 --- a/meilisearch-http/src/helpers/mod.rs +++ b/meilisearch-http/src/helpers/mod.rs @@ -7,20 +7,20 @@ pub use authentication::Authentication; pub use normalize_path::NormalizePath; pub fn is_cjk(c: char) -> bool { - (c >= '\u{1100}' && c <= '\u{11ff}') // Hangul Jamo - || (c >= '\u{2e80}' && c <= '\u{2eff}') // CJK Radicals Supplement - || (c >= '\u{2f00}' && c <= '\u{2fdf}') // Kangxi radical - || (c >= '\u{3000}' && c <= '\u{303f}') // Japanese-style punctuation - || (c >= '\u{3040}' && c <= '\u{309f}') // Japanese Hiragana - || (c >= '\u{30a0}' && c <= '\u{30ff}') // Japanese Katakana - || (c >= '\u{3100}' && c <= '\u{312f}') - || (c >= '\u{3130}' && c <= '\u{318F}') // Hangul Compatibility Jamo - || (c >= '\u{3200}' && c <= '\u{32ff}') // Enclosed CJK Letters and Months - || (c >= '\u{3400}' && c <= '\u{4dbf}') // CJK Unified Ideographs Extension A - || (c >= '\u{4e00}' && 
c <= '\u{9fff}') // CJK Unified Ideographs - || (c >= '\u{a960}' && c <= '\u{a97f}') // Hangul Jamo Extended-A - || (c >= '\u{ac00}' && c <= '\u{d7a3}') // Hangul Syllables - || (c >= '\u{d7b0}' && c <= '\u{d7ff}') // Hangul Jamo Extended-B - || (c >= '\u{f900}' && c <= '\u{faff}') // CJK Compatibility Ideographs - || (c >= '\u{ff00}' && c <= '\u{ffef}') // Full-width roman characters and half-width katakana + ('\u{1100}'..='\u{11ff}').contains(&c) // Hangul Jamo + || ('\u{2e80}'..='\u{2eff}').contains(&c) // CJK Radicals Supplement + || ('\u{2f00}'..='\u{2fdf}').contains(&c) // Kangxi radical + || ('\u{3000}'..='\u{303f}').contains(&c) // Japanese-style punctuation + || ('\u{3040}'..='\u{309f}').contains(&c) // Japanese Hiragana + || ('\u{30a0}'..='\u{30ff}').contains(&c) // Japanese Katakana + || ('\u{3100}'..='\u{312f}').contains(&c) + || ('\u{3130}'..='\u{318F}').contains(&c) // Hangul Compatibility Jamo + || ('\u{3200}'..='\u{32ff}').contains(&c) // Enclosed CJK Letters and Months + || ('\u{3400}'..='\u{4dbf}').contains(&c) // CJK Unified Ideographs Extension A + || ('\u{4e00}'..='\u{9fff}').contains(&c) // CJK Unified Ideographs + || ('\u{a960}'..='\u{a97f}').contains(&c) // Hangul Jamo Extended-A + || ('\u{ac00}'..='\u{d7a3}').contains(&c) // Hangul Syllables + || ('\u{d7b0}'..='\u{d7ff}').contains(&c) // Hangul Jamo Extended-B + || ('\u{f900}'..='\u{faff}').contains(&c) // CJK Compatibility Ideographs + || ('\u{ff00}'..='\u{ffef}').contains(&c) // Full-width roman characters and half-width katakana } From 677627586c070e4a4710d6cbe43384c5bea4a67d Mon Sep 17 00:00:00 2001 From: many Date: Tue, 5 Jan 2021 15:27:24 +0100 Subject: [PATCH 22/22] fix test set fix dump tests --- Cargo.lock | 183 +++++++++++++----- meilisearch-core/src/query_builder.rs | 2 +- .../assets/dumps/v1/test/documents.jsonl | 2 +- meilisearch-http/tests/assets/test_set.json | 6 +- meilisearch-http/tests/search.rs | 12 +- 5 files changed, 147 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50bb4e9d6..53ad2c3d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -210,7 +210,7 @@ dependencies = [ "rustls 0.18.1", "tokio-rustls", "webpki", - "webpki-roots", + "webpki-roots 0.20.0", ] [[package]] @@ -299,6 +299,12 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" +[[package]] +name = "ahash" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" + [[package]] name = "ahash" version = "0.4.7" @@ -332,7 +338,7 @@ checksum = "d4d7d63395147b81a9e570bcc6243aaf71c017bd666d4909cfef0085bdda8d73" [[package]] name = "assert-json-diff" version = "1.0.1" -source = "git+https://github.com/qdequele/assert-json-diff?branch=master#9012a0c8866d0f2db0ef9a6242e4a19d1e8c67e4" +source = "git+https://github.com/qdequele/assert-json-diff#9012a0c8866d0f2db0ef9a6242e4a19d1e8c67e4" dependencies = [ "serde", "serde_json", @@ -383,7 +389,7 @@ dependencies = [ "actix-rt", "actix-service", "base64 0.13.0", - "bytes", + "bytes 0.5.6", "cfg-if 1.0.0", "derive_more", "futures-core", @@ -570,6 +576,15 @@ version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" +[[package]] +name = "cedarwood" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d" +dependencies = [ + "smallvec", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -582,6 +597,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "character_converter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" +dependencies = [ + "bincode", +] + [[package]] name = "chrono" version = "0.4.19" @@ -790,12 +814,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "data-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "993a608597367c6377b258c25d7120740f00ed23a2252b729b1932dd7866f908" - [[package]] name = "debugid" version = "0.7.2" @@ -968,6 +986,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "fs_extra" version = "1.2.0" @@ -1179,13 +1207,23 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d36fab90f82edc3c747f9d438e06cf0a491055896f2a279638bb5beed6c40177" +[[package]] +name = "hashbrown" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" +dependencies = [ + "ahash 0.3.8", + "autocfg", +] + [[package]] name = "hashbrown" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" dependencies = [ - "ahash", + "ahash 0.4.7", "serde", ] @@ -1225,9 +1263,9 @@ checksum = "b328f6260a7e51bdb0ca6b68e6ea27ee3d11fba5dee930896ee7ff6ad5fc072c" [[package]] name = "heed-types" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72fc61caee13e85ea330eabf0c6c7098c511ff173bcb57a760b1eda3bba9f6eb" +checksum = "e628efb08beaee58355f80dc4adba79d644940ea9eef60175ea17dc218aab405" dependencies = [ "bincode", "heed-traits", @@ -1376,7 +1414,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.9.1", "serde", ] @@ -1470,6 +1508,21 @@ dependencies = [ "libc", ] +[[package]] +name = "jieba-rs" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a" +dependencies = [ + "cedarwood", + "fxhash", + "hashbrown 0.9.1", + "lazy_static", + "phf", + "phf_codegen", + "regex", +] + [[package]] name = "js-sys" version = "0.3.46" @@ -1512,9 +1565,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.80" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" +checksum = "1482821306169ec4d07f6aca392a4681f66c75c9918aa49641a2595db64053cb" [[package]] name = "linked-hash-map" @@ -1602,7 +1655,7 @@ dependencies = [ "either", "env_logger 0.8.2", "fst", - "hashbrown", + "hashbrown 0.9.1", "heed", "indexmap", 
"intervaltree", @@ -1612,6 +1665,7 @@ dependencies = [ "log", "meilisearch-error", "meilisearch-schema", + "meilisearch-tokenizer", "meilisearch-types", "once_cell", "ordered-float", @@ -1987,6 +2041,44 @@ dependencies = [ "sha-1 0.8.2", ] +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared", + "rand 0.7.3", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "0.4.27" @@ -2153,6 +2245,7 @@ dependencies = [ "rand_chacha 0.2.2", "rand_core 0.5.1", "rand_hc 0.2.0", + "rand_pcg", ] [[package]] @@ -2238,6 +2331,15 @@ dependencies = [ "rand_core 0.6.1", ] +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xoshiro" version = "0.4.0" @@ -2355,7 +2457,6 @@ dependencies = [ "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-bindgen-test", "web-sys", "webpki-roots 0.20.0", "winreg 0.7.0", @@ -2462,12 +2563,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "scoped-tls" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" - [[package]] name = "scopeguard" version = "1.1.0" @@ -3397,30 +3492,6 @@ name = "web-sys" version = "0.3.46" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "222b1ef9334f92a21d3fb53dc3fd80f30836959a90f9274a626d7e06315ba3c3" -dependencies = [ - "console_error_panic_hook", - "js-sys", - "scoped-tls", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-bindgen-test-macro", -] - -[[package]] -name = "webpki" -version = "0.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" -dependencies = [ - "proc-macro2", - "quote", -] - -[[package]] -name = "webpki-roots" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f20dea7535251981a9670857150d571846545088359b28e4951d350bdaf179f" dependencies = [ "js-sys", "wasm-bindgen", @@ -3436,6 +3507,15 @@ dependencies = [ "untrusted", ] +[[package]] +name = "webpki-roots" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f20dea7535251981a9670857150d571846545088359b28e4951d350bdaf179f" +dependencies = [ + "webpki", +] + [[package]] name = "webpki-roots" version = "0.21.0" @@ -3445,6 +3525,15 @@ dependencies = [ "webpki", ] +[[package]] +name = "whatlang" +version = "0.9.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075" +dependencies = [ + "hashbrown 0.7.2", +] + [[package]] name = "whoami" version = "1.0.3" diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index bd02e7281..41acaeb7a 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -444,7 +444,7 @@ mod tests { for index in indexes { let name = index.attribute.to_string(); schema.insert(&name).unwrap(); - let indexed_pos = schema.set_indexed(&name).unwrap().1; + let indexed_pos = schema.insert_with_position(&name).unwrap().1; let index = DocIndex { attribute: indexed_pos.0, ..*index diff --git a/meilisearch-http/tests/assets/dumps/v1/test/documents.jsonl b/meilisearch-http/tests/assets/dumps/v1/test/documents.jsonl index 7af80f342..19539cedd 100644 --- a/meilisearch-http/tests/assets/dumps/v1/test/documents.jsonl +++ b/meilisearch-http/tests/assets/dumps/v1/test/documents.jsonl @@ -74,4 +74,4 @@ {"id":73,"isActive":false,"balance":"$1,239.74","picture":"http://placehold.it/32x32","age":38,"color":"blue","name":"Eleanor Shepherd","gender":"female","email":"eleanorshepherd@chorizon.com","phone":"+1 (894) 567-2617","address":"670 Lafayette Walk, Darlington, Palau, 8803","about":"Adipisicing ad incididunt id veniam magna cupidatat et labore eu deserunt mollit. Lorem voluptate exercitation elit eu aliquip cupidatat occaecat anim excepteur reprehenderit est est. Ipsum excepteur ea mollit qui nisi laboris ex qui. Cillum velit culpa culpa commodo laboris nisi Lorem non elit deserunt incididunt. Officia quis velit nulla sint incididunt duis mollit tempor adipisicing qui officia eu nisi Lorem. Do proident pariatur ex enim nostrud eu aute esse deserunt eu velit quis culpa exercitation. Occaecat ad cupidatat ullamco consequat duis anim deserunt occaecat aliqua sunt consectetur ipsum magna.\r\n","registered":"2020-02-29T12:15:28 -01:00","latitude":35.749621,"longitude":-94.40842,"tags":["good first issue","new issue","new issue","bug"]} {"id":74,"isActive":true,"balance":"$1,180.90","picture":"http://placehold.it/32x32","age":36,"color":"Green","name":"Stark Wong","gender":"male","email":"starkwong@chorizon.com","phone":"+1 (805) 575-3055","address":"522 Bond Street, Bawcomville, Wisconsin, 324","about":"Aute qui sit incididunt eu adipisicing exercitation sunt nostrud. Id laborum incididunt proident ipsum est cillum esse. Officia ullamco eu ut Lorem do minim ea dolor consequat sit eu est voluptate. Id commodo cillum enim culpa aliquip ullamco nisi Lorem cillum ipsum cupidatat anim officia eu. Dolore sint elit labore pariatur. Officia duis nulla voluptate et nulla ut voluptate laboris eu commodo veniam qui veniam.\r\n","registered":"2020-01-25T10:47:48 -01:00","latitude":-80.452139,"longitude":160.72546,"tags":["wontfix"]} {"id":75,"isActive":false,"balance":"$1,913.42","picture":"http://placehold.it/32x32","age":24,"color":"Green","name":"Emma Jacobs","gender":"female","email":"emmajacobs@chorizon.com","phone":"+1 (899) 554-3847","address":"173 Tapscott Street, Esmont, Maine, 7450","about":"Laboris consequat consectetur tempor labore ullamco ullamco voluptate quis quis duis ut ad. In est irure quis amet sunt nulla ad ut sit labore ut eu quis duis. Nostrud cupidatat aliqua sunt occaecat minim id consequat officia deserunt laborum. 
Ea dolor reprehenderit laborum veniam exercitation est nostrud excepteur laborum minim id qui et.\r\n","registered":"2019-03-29T06:24:13 -01:00","latitude":-35.53722,"longitude":155.703874,"tags":[]} -{"id":76,"isActive":false,"balance":"$1,274.29","picture":"http://placehold.it/32x32","age":25,"color":"Green","name":"Clarice Gardner","gender":"female","email":"claricegardner@chorizon.com","phone":"+1 (810) 407-3258","address":"894 Brooklyn Road, Utting, New Hampshire, 6404","about":"Elit occaecat aute ea adipisicing mollit cupidatat aliquip excepteur veniam minim. Sunt quis dolore in commodo aute esse quis. Lorem in cillum commodo eu anim commodo mollit. Adipisicing enim sunt adipisicing cupidatat adipisicing eiusmod eu do sit nisi.\r\n","registered":"2014-10-20T10:13:32 -02:00","latitude":17.11935,"longitude":65.38197,"tags":["new issue","wontfix"]} \ No newline at end of file +{"id":77,"isActive":false,"balance":"$1,274.29","picture":"http://placehold.it/32x32","age":25,"color":"Red","name":"孫武","gender":"male","email":"SunTzu@chorizon.com","phone":"+1 (810) 407-3258","address":"吴國","about":"孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孫子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为樂安孫氏始祖,次子孙明为富春孫氏始祖。\r\n","registered":"2014-10-20T10:13:32 -02:00","latitude":17.11935,"longitude":65.38197,"tags":["new issue","wontfix"]} diff --git a/meilisearch-http/tests/assets/test_set.json b/meilisearch-http/tests/assets/test_set.json index b035ac334..cd3ed9633 100644 --- a/meilisearch-http/tests/assets/test_set.json +++ b/meilisearch-http/tests/assets/test_set.json @@ -1596,12 +1596,12 @@ "picture": "http://placehold.it/32x32", "age": 25, "color": "Red", - "name": "孙武", + "name": "孫武", "gender": "male", "email": "SunTzu@chorizon.com", "phone": "+1 (810) 407-3258", - "address": "吴国", - "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安孙氏始祖,次子孙明为富春孙氏始祖。\r\n", + "address": "吴國", + "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孫子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为樂安孫氏始祖,次子孙明为富春孫氏始祖。\r\n", "registered": "2014-10-20T10:13:32 -02:00", "latitude": 17.11935, "longitude": 65.38197, diff --git a/meilisearch-http/tests/search.rs b/meilisearch-http/tests/search.rs index cd1fae4bd..9da6b964e 100644 --- a/meilisearch-http/tests/search.rs +++ b/meilisearch-http/tests/search.rs @@ -376,12 +376,12 @@ async fn search_with_attribute_to_highlight_wildcard_chinese() { "picture": "http://placehold.it/32x32", "age": 25, "color": "Red", - "name": "孙武", + "name": "孫武", "gender": "male", "email": "SunTzu@chorizon.com", "phone": "+1 (810) 407-3258", - "address": "吴国", - "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安孙氏始祖,次子孙明为富春孙氏始祖。\r\n", + "address": "吴國", + "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孫子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为樂安孫氏始祖,次子孙明为富春孫氏始祖。\r\n", "registered": "2014-10-20T10:13:32 -02:00", "latitude": 17.11935, "longitude": 65.38197, @@ -396,12 +396,12 @@ async fn search_with_attribute_to_highlight_wildcard_chinese() { "picture": "http://placehold.it/32x32", "age": 25, "color": "Red", - "name": "孙武", + "name": "孫武", "gender": "male", "email": "SunTzu@chorizon.com", "phone": "+1 (810) 407-3258", - "address": "吴国", - "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安孙氏始祖,次子孙明为富春孙氏始祖。\r\n", + "address": "吴國", + "about": 
"孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孫子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为樂安孫氏始祖,次子孙明为富春孫氏始祖。\r\n", "registered": "2014-10-20T10:13:32 -02:00", "latitude": 17.11935, "longitude": 65.38197,