Merge pull request #272 from meilisearch/fix-long-words

Ignore words that are too long
Clément Renault 2019-11-10 20:07:22 +01:00 committed by GitHub
commit 8a36571a74
2 changed files with 71 additions and 14 deletions
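
Before the two diffs, a quick standalone sketch (not part of this commit; only the constant name is taken from the patch below) of why the combining-mark-heavy document in the new test trips the limit: the check compares UTF-8 byte length, not rendered glyphs or char count.

// Hypothetical reproduction of the new guard; illustration only.
const WORD_LENGTH_LIMIT: usize = 80;

fn main() {
    // An ordinary word stays far under the limit.
    assert!("slipdog".len() <= WORD_LENGTH_LIMIT);

    // One base letter followed by 50 combining overlays renders as a single
    // glyph but occupies 1 + 50 * 2 = 101 UTF-8 bytes, so it is now skipped.
    let zalgo: String = std::iter::once('s')
        .chain(std::iter::repeat('\u{0335}').take(50))
        .collect();
    assert!(zalgo.len() > WORD_LENGTH_LIMIT);
    assert_eq!(zalgo.chars().count(), 51);
}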


@@ -387,6 +387,56 @@ mod tests {
        assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
    }
    #[test]
    fn ignored_words_too_long() {
        let dir = tempfile::tempdir().unwrap();

        let database = Database::open_or_create(dir.path()).unwrap();
        let env = &database.env;

        let (sender, receiver) = mpsc::sync_channel(100);
        let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
        let index = database.create_index("test").unwrap();

        let done = database.set_update_callback("test", Box::new(update_fn));
        assert!(done, "could not set the index update function");

        let schema = {
            let data = r#"
                identifier = "id"

                [attributes."name"]
                displayed = true
                indexed = true
            "#;
            toml::from_str(data).unwrap()
        };

        let mut writer = env.write_txn().unwrap();
        let _update_id = index.schema_update(&mut writer, schema).unwrap();
        writer.commit().unwrap();

        let mut additions = index.documents_addition();

        // a single "word" drowned in combining marks, well over 80 bytes of UTF-8
        let doc1 = serde_json::json!({
            "id": 123,
            "name": "s̷̡̢̡̧̺̜̞͕͉͉͕̜͔̟̼̥̝͍̟̖͔͔̪͉̲̹̝̣̖͎̞̤̥͓͎̭̩͕̙̩̿̀̋̅̈́̌́̏̍̄̽͂̆̾̀̿̕̚̚͜͠͠ͅͅļ̵̨̨̨̰̦̻̳̖̳͚̬̫͚̦͖͈̲̫̣̩̥̻̙̦̱̼̠̖̻̼̘̖͉̪̜̠̙͖̙̩͔̖̯̩̲̿̽͋̔̿̍̓͂̍̿͊͆̃͗̔̎͐͌̾̆͗́̆̒̔̾̅̚̚͜͜ͅͅī̵̛̦̅̔̓͂͌̾́͂͛̎̋͐͆̽̂̋̋́̾̀̉̓̏̽́̑̀͒̇͋͛̈́̃̉̏͊̌̄̽̿̏̇͘̕̚̕p̶̧̛̛̖̯̗͕̝̗̭̱͙̖̗̟̟̐͆̊̂͐̋̓̂̈́̓͊̆͌̾̾͐͋͗͌̆̿̅͆̈́̈́̉͋̍͊͗̌̓̅̈̎̇̃̎̈́̉̐̋͑̃͘̕͘d̴̢̨̛͕̘̯͖̭̮̝̝̐̊̈̅̐̀͒̀́̈́̀͌̽͛͆͑̀̽̿͛̃̋̇̎̀́̂́͘͠͝ǫ̵̨̛̮̩̘͚̬̯̖̱͍̼͑͑̓̐́̑̿̈́̔͌̂̄͐͝ģ̶̧̜͇̣̭̺̪̺̖̻͖̮̭̣̙̻͒͊͗̓̓͒̀̀ͅ"
        });

        additions.update_document(doc1);

        let mut writer = env.write_txn().unwrap();
        let update_id = additions.finalize(&mut writer).unwrap();
        writer.commit().unwrap();

        // block until the transaction is processed
        let _ = receiver.into_iter().find(|id| *id == update_id);

        // the update must succeed even though the word was not indexable
        let reader = env.read_txn().unwrap();
        let result = index.update_status(&reader, update_id).unwrap();
        assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
    }
    #[test]
    fn add_schema_attributes_at_end() {
        let dir = tempfile::tempdir().unwrap();
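
To run only the new test locally, cargo's test-name filter applies (assuming a checkout of this repository; the crate path inside the workspace is not shown in this diff):

cargo test ignored_words_too_long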


@@ -7,6 +7,8 @@ use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;

// words longer than this many UTF-8 bytes are skipped at indexing time
const WORD_LENGTH_LIMIT: usize = 80;

type Word = Vec<u8>; // TODO make it be a SmallVec

pub struct RawIndexer {
@@ -128,6 +130,8 @@ fn index_token(
    match token_to_docindex(id, attr, token) {
        Some(docindex) => {
            let word = Vec::from(token.word);

            if word.len() <= WORD_LENGTH_LIMIT {
                words_doc_indexes
                    .entry(word.clone())
                    .or_insert_with(Vec::new)
@@ -138,6 +142,7 @@ fn index_token(
                let unidecoded = deunicode_with_tofu(&lower, "");
                if unidecoded != lower && !unidecoded.is_empty() {
                    let word = Vec::from(unidecoded);

                    if word.len() <= WORD_LENGTH_LIMIT {
                        words_doc_indexes
                            .entry(word.clone())
                            .or_insert_with(Vec::new)
@@ -146,6 +151,8 @@ fn index_token(
                        }
                    }
                }
            }
        }
        None => return false,
    }
}
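
Condensed, both insertion sites now share the same gate; a hedged paraphrase (stand-in types, names taken from the diff, the rest assumed):

use std::collections::BTreeMap;

const WORD_LENGTH_LIMIT: usize = 80;

type Word = Vec<u8>;
struct DocIndex; // stand-in for the crate's real DocIndex

// Both the raw word and its deunicoded variant funnel through this same
// byte-length check; an over-long word is silently left out of the index,
// while the document that contains it is still processed successfully.
fn record(words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>, word: Word, docindex: DocIndex) {
    if word.len() <= WORD_LENGTH_LIMIT {
        words_doc_indexes
            .entry(word)
            .or_insert_with(Vec::new)
            .push(docindex);
    }
}

fn main() {
    let mut index = BTreeMap::new();
    record(&mut index, b"slipdog".to_vec(), DocIndex);
    record(&mut index, vec![b'x'; 101], DocIndex); // over the limit: dropped
    assert_eq!(index.len(), 1);
}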