mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-26 21:27:28 +01:00
feat: Index unidecoded words
This commit is contained in:
parent
7035f76077
commit
068f1bc202
@ -18,6 +18,7 @@ serde = { version = "1.0.90", features = ["derive"] }
|
||||
serde_json = { version = "1.0.39", features = ["preserve_order"] }
|
||||
sled = "0.23.0"
|
||||
toml = { version = "0.5.0", features = ["preserve_order"] }
|
||||
deunicode = "1.0.0"
|
||||
|
||||
[dependencies.rmp-serde]
|
||||
git = "https://github.com/3Hren/msgpack-rust.git"
|
||||
|
@ -1,13 +1,14 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::convert::TryFrom;
|
||||
|
||||
use deunicode::deunicode_with_tofu;
|
||||
use meilidb_core::{DocumentId, DocIndex};
|
||||
use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
|
||||
use meilidb_tokenizer::{Tokenizer, SeqTokenizer, Token};
|
||||
use crate::SchemaAttr;
|
||||
|
||||
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::SchemaAttr;
|
||||
|
||||
type Word = Vec<u8>; // TODO make it be a SmallVec
|
||||
|
||||
pub struct Indexer {
|
||||
@ -32,18 +33,8 @@ impl Indexer {
|
||||
|
||||
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
|
||||
for token in Tokenizer::new(text) {
|
||||
if token.word_index >= self.word_limit { break }
|
||||
|
||||
let lower = token.word.to_lowercase();
|
||||
let token = Token { word: &lower, ..token };
|
||||
|
||||
let docindex = match token_to_docindex(id, attr, token) {
|
||||
Some(docindex) => docindex,
|
||||
None => break,
|
||||
};
|
||||
|
||||
let word = Vec::from(token.word);
|
||||
self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
|
||||
let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
|
||||
if !must_continue { break }
|
||||
}
|
||||
}
|
||||
|
||||
@ -52,18 +43,8 @@ impl Indexer {
|
||||
{
|
||||
let iter = iter.into_iter();
|
||||
for token in SeqTokenizer::new(iter) {
|
||||
if token.word_index >= self.word_limit { break }
|
||||
|
||||
let lower = token.word.to_lowercase();
|
||||
let token = Token { word: &lower, ..token };
|
||||
|
||||
let docindex = match token_to_docindex(id, attr, token) {
|
||||
Some(docindex) => docindex,
|
||||
None => break,
|
||||
};
|
||||
|
||||
let word = Vec::from(token.word);
|
||||
self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
|
||||
let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
|
||||
if !must_continue { break }
|
||||
}
|
||||
}
|
||||
|
||||
@ -82,7 +63,44 @@ impl Indexer {
|
||||
}
|
||||
}
|
||||
|
||||
fn token_to_docindex<'a>(id: DocumentId, attr: SchemaAttr, token: Token<'a>) -> Option<DocIndex> {
|
||||
fn index_token(
|
||||
token: Token,
|
||||
id: DocumentId,
|
||||
attr: SchemaAttr,
|
||||
word_limit: usize,
|
||||
indexed: &mut BTreeMap<Word, Vec<DocIndex>>,
|
||||
) -> bool
|
||||
{
|
||||
if token.word_index >= word_limit { return false }
|
||||
|
||||
let lower = token.word.to_lowercase();
|
||||
let token = Token { word: &lower, ..token };
|
||||
match token_to_docindex(id, attr, token) {
|
||||
Some(docindex) => {
|
||||
let word = Vec::from(token.word);
|
||||
indexed.entry(word).or_insert_with(Vec::new).push(docindex);
|
||||
},
|
||||
None => return false,
|
||||
}
|
||||
|
||||
if !lower.contains(is_cjk) {
|
||||
let unidecoded = deunicode_with_tofu(&lower, "");
|
||||
if unidecoded != lower {
|
||||
let token = Token { word: &unidecoded, ..token };
|
||||
match token_to_docindex(id, attr, token) {
|
||||
Some(docindex) => {
|
||||
let word = Vec::from(token.word);
|
||||
indexed.entry(word).or_insert_with(Vec::new).push(docindex);
|
||||
},
|
||||
None => return false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
|
||||
let word_index = u16::try_from(token.word_index).ok()?;
|
||||
let char_index = u16::try_from(token.char_index).ok()?;
|
||||
let char_length = u16::try_from(token.word.chars().count()).ok()?;
|
||||
|
Loading…
x
Reference in New Issue
Block a user