mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-23 21:20:24 +01:00
Merge pull request #272 from meilisearch/fix-long-words
Ignore words that are too long
This commit is contained in:
commit
8a36571a74
@ -387,6 +387,56 @@ mod tests {
|
|||||||
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
|
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn ignored_words_to_long() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
|
||||||
|
let database = Database::open_or_create(dir.path()).unwrap();
|
||||||
|
let env = &database.env;
|
||||||
|
|
||||||
|
let (sender, receiver) = mpsc::sync_channel(100);
|
||||||
|
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
|
||||||
|
let index = database.create_index("test").unwrap();
|
||||||
|
|
||||||
|
let done = database.set_update_callback("test", Box::new(update_fn));
|
||||||
|
assert!(done, "could not set the index update function");
|
||||||
|
|
||||||
|
let schema = {
|
||||||
|
let data = r#"
|
||||||
|
identifier = "id"
|
||||||
|
|
||||||
|
[attributes."name"]
|
||||||
|
displayed = true
|
||||||
|
indexed = true
|
||||||
|
"#;
|
||||||
|
toml::from_str(data).unwrap()
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut writer = env.write_txn().unwrap();
|
||||||
|
let _update_id = index.schema_update(&mut writer, schema).unwrap();
|
||||||
|
writer.commit().unwrap();
|
||||||
|
|
||||||
|
let mut additions = index.documents_addition();
|
||||||
|
|
||||||
|
let doc1 = serde_json::json!({
|
||||||
|
"id": 123,
|
||||||
|
"name": "s̷̡̢̡̧̺̜̞͕͉͉͕̜͔̟̼̥̝͍̟̖͔͔̪͉̲̹̝̣̖͎̞̤̥͓͎̭̩͕̙̩̿̀̋̅̈́̌́̏̍̄̽͂̆̾̀̿̕̚̚͜͠͠ͅͅļ̵̨̨̨̰̦̻̳̖̳͚̬̫͚̦͖͈̲̫̣̩̥̻̙̦̱̼̠̖̻̼̘̖͉̪̜̠̙͖̙̩͔̖̯̩̲̿̽͋̔̿̍̓͂̍̿͊͆̃͗̔̎͐͌̾̆͗́̆̒̔̾̅̚̚͜͜ͅͅī̵̛̦̅̔̓͂͌̾́͂͛̎̋͐͆̽̂̋̋́̾̀̉̓̏̽́̑̀͒̇͋͛̈́̃̉̏͊̌̄̽̿̏̇͘̕̚̕p̶̧̛̛̖̯̗͕̝̗̭̱͙̖̗̟̟̐͆̊̂͐̋̓̂̈́̓͊̆͌̾̾͐͋͗͌̆̿̅͆̈́̈́̉͋̍͊͗̌̓̅̈̎̇̃̎̈́̉̐̋͑̃͘̕͘d̴̢̨̛͕̘̯͖̭̮̝̝̐̊̈̅̐̀͒̀́̈́̀͌̽͛͆͑̀̽̿͛̃̋̇̎̀́̂́͘͠͝ǫ̵̨̛̮̩̘͚̬̯̖̱͍̼͑͑̓̐́̑̿̈́̔͌̂̄͐͝ģ̶̧̜͇̣̭̺̪̺̖̻͖̮̭̣̙̻͒͊͗̓̓͒̀̀ͅ",
|
||||||
|
});
|
||||||
|
|
||||||
|
additions.update_document(doc1);
|
||||||
|
|
||||||
|
let mut writer = env.write_txn().unwrap();
|
||||||
|
let update_id = additions.finalize(&mut writer).unwrap();
|
||||||
|
writer.commit().unwrap();
|
||||||
|
|
||||||
|
// block until the transaction is processed
|
||||||
|
let _ = receiver.into_iter().find(|id| *id == update_id);
|
||||||
|
|
||||||
|
let reader = env.read_txn().unwrap();
|
||||||
|
let result = index.update_status(&reader, update_id).unwrap();
|
||||||
|
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn add_schema_attributes_at_end() {
|
fn add_schema_attributes_at_end() {
|
||||||
let dir = tempfile::tempdir().unwrap();
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
@ -7,6 +7,8 @@ use meilidb_schema::SchemaAttr;
|
|||||||
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
|
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
|
|
||||||
|
const WORD_LENGTH_LIMIT: usize = 80;
|
||||||
|
|
||||||
type Word = Vec<u8>; // TODO make it be a SmallVec
|
type Word = Vec<u8>; // TODO make it be a SmallVec
|
||||||
|
|
||||||
pub struct RawIndexer {
|
pub struct RawIndexer {
|
||||||
@ -128,21 +130,26 @@ fn index_token(
|
|||||||
match token_to_docindex(id, attr, token) {
|
match token_to_docindex(id, attr, token) {
|
||||||
Some(docindex) => {
|
Some(docindex) => {
|
||||||
let word = Vec::from(token.word);
|
let word = Vec::from(token.word);
|
||||||
words_doc_indexes
|
|
||||||
.entry(word.clone())
|
|
||||||
.or_insert_with(Vec::new)
|
|
||||||
.push(docindex);
|
|
||||||
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
|
||||||
|
|
||||||
if !lower.contains(is_cjk) {
|
if word.len() <= WORD_LENGTH_LIMIT {
|
||||||
let unidecoded = deunicode_with_tofu(&lower, "");
|
words_doc_indexes
|
||||||
if unidecoded != lower && !unidecoded.is_empty() {
|
.entry(word.clone())
|
||||||
let word = Vec::from(unidecoded);
|
.or_insert_with(Vec::new)
|
||||||
words_doc_indexes
|
.push(docindex);
|
||||||
.entry(word.clone())
|
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
||||||
.or_insert_with(Vec::new)
|
|
||||||
.push(docindex);
|
if !lower.contains(is_cjk) {
|
||||||
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
let unidecoded = deunicode_with_tofu(&lower, "");
|
||||||
|
if unidecoded != lower && !unidecoded.is_empty() {
|
||||||
|
let word = Vec::from(unidecoded);
|
||||||
|
if word.len() <= WORD_LENGTH_LIMIT {
|
||||||
|
words_doc_indexes
|
||||||
|
.entry(word.clone())
|
||||||
|
.or_insert_with(Vec::new)
|
||||||
|
.push(docindex);
|
||||||
|
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user