Mirror of https://github.com/meilisearch/MeiliSearch, synced 2024-11-26 23:04:26 +01:00
feat: Index unidecoded words

commit 068f1bc202 (parent 7035f76077)

This commit teaches the indexer to additionally index an ASCII-transliterated ("unidecoded") copy of every non-CJK word whose transliteration differs from its lowercased form.
In the crate manifest (Cargo.toml), a dependency on deunicode is added:

@@ -18,6 +18,7 @@ serde = { version = "1.0.90", features = ["derive"] }
 serde_json = { version = "1.0.39", features = ["preserve_order"] }
 sled = "0.23.0"
 toml = { version = "0.5.0", features = ["preserve_order"] }
+deunicode = "1.0.0"
 
 [dependencies.rmp-serde]
 git = "https://github.com/3Hren/msgpack-rust.git"
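For context, a quick sketch of what the new dependency does; the outputs below reflect my understanding of deunicode's transliteration and are not asserted by this commit. deunicode_with_tofu(s, placeholder) transliterates Unicode text to ASCII, substituting the placeholder for any codepoint it cannot translate:

    use deunicode::deunicode_with_tofu;

    fn main() {
        // Accented Latin characters map to their ASCII base letters.
        assert_eq!(deunicode_with_tofu("café", ""), "cafe");
        // Ligatures and special letters expand to ASCII sequences.
        assert_eq!(deunicode_with_tofu("Æneid", ""), "AEneid");
    }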
In the indexer module, the imports gain deunicode_with_tofu and is_cjk, and the crate-local import moves into its own group:

@@ -1,13 +1,14 @@
 use std::collections::BTreeMap;
 use std::convert::TryFrom;
 
+use deunicode::deunicode_with_tofu;
 use meilidb_core::{DocumentId, DocIndex};
 use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
-use meilidb_tokenizer::{Tokenizer, SeqTokenizer, Token};
-use crate::SchemaAttr;
-
+use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
 use sdset::Set;
 
+use crate::SchemaAttr;
+
 type Word = Vec<u8>; // TODO make it be a SmallVec
 
 pub struct Indexer {
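A small illustrative sketch of the new is_cjk import. From its use further down as a str::contains pattern, it is assumed to be a fn(char) -> bool predicate over the CJK Unicode blocks; the sample values are assumptions about its behavior, not taken from the crate's tests:

    use meilidb_tokenizer::is_cjk;

    fn main() {
        // Latin text contains no CJK codepoints, so transliteration applies.
        assert!(!"léo".contains(is_cjk));
        // Ideographic text is detected and skipped by the new branch.
        assert!("東京".contains(is_cjk));
    }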
The body of index_text is replaced by a call to a new index_token helper, which reports whether iteration should continue:

@@ -32,18 +33,8 @@ impl Indexer {
 
     pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
         for token in Tokenizer::new(text) {
-            if token.word_index >= self.word_limit { break }
-
-            let lower = token.word.to_lowercase();
-            let token = Token { word: &lower, ..token };
-
-            let docindex = match token_to_docindex(id, attr, token) {
-                Some(docindex) => docindex,
-                None => break,
-            };
-
-            let word = Vec::from(token.word);
-            self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
+            let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
+            if !must_continue { break }
         }
     }
 
The sequence-based variant gets the same treatment:

@@ -52,18 +43,8 @@ impl Indexer {
     {
         let iter = iter.into_iter();
         for token in SeqTokenizer::new(iter) {
-            if token.word_index >= self.word_limit { break }
-
-            let lower = token.word.to_lowercase();
-            let token = Token { word: &lower, ..token };
-
-            let docindex = match token_to_docindex(id, attr, token) {
-                Some(docindex) => docindex,
-                None => break,
-            };
-
-            let word = Vec::from(token.word);
-            self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
+            let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
+            if !must_continue { break }
         }
     }
 
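The refactor follows the usual "continue flag" shape: a helper shared by two loops cannot break out of its caller, so it returns false to request termination. A minimal self-contained sketch of the pattern, with toy types rather than the crate's:

    // Toy stand-in for index_token: refuses items past the limit.
    fn process(item: u32, limit: u32, out: &mut Vec<u32>) -> bool {
        if item >= limit { return false } // signal the caller to stop
        out.push(item);
        true
    }

    fn main() {
        let mut out = Vec::new();
        for item in vec![1, 2, 3, 50, 4] {
            let must_continue = process(item, 10, &mut out);
            if !must_continue { break }
        }
        assert_eq!(out, vec![1, 2, 3]); // iteration ended at the first rejected item
    }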
Finally, token_to_docindex drops its now-unneeded explicit lifetime, and the new index_token function indexes the lowercased word and, when the word is not CJK and its transliteration differs, the unidecoded form as well:

@@ -82,7 +63,44 @@ impl Indexer {
     }
 }
 
-fn token_to_docindex<'a>(id: DocumentId, attr: SchemaAttr, token: Token<'a>) -> Option<DocIndex> {
+fn index_token(
+    token: Token,
+    id: DocumentId,
+    attr: SchemaAttr,
+    word_limit: usize,
+    indexed: &mut BTreeMap<Word, Vec<DocIndex>>,
+) -> bool
+{
+    if token.word_index >= word_limit { return false }
+
+    let lower = token.word.to_lowercase();
+    let token = Token { word: &lower, ..token };
+    match token_to_docindex(id, attr, token) {
+        Some(docindex) => {
+            let word = Vec::from(token.word);
+            indexed.entry(word).or_insert_with(Vec::new).push(docindex);
+        },
+        None => return false,
+    }
+
+    if !lower.contains(is_cjk) {
+        let unidecoded = deunicode_with_tofu(&lower, "");
+        if unidecoded != lower {
+            let token = Token { word: &unidecoded, ..token };
+            match token_to_docindex(id, attr, token) {
+                Some(docindex) => {
+                    let word = Vec::from(token.word);
+                    indexed.entry(word).or_insert_with(Vec::new).push(docindex);
+                },
+                None => return false,
+            }
+        }
+    }
+
+    true
+}
+
+fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
     let word_index = u16::try_from(token.word_index).ok()?;
     let char_index = u16::try_from(token.char_index).ok()?;
     let char_length = u16::try_from(token.word.chars().count()).ok()?;
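To see the net effect, here is a self-contained sketch of the two insertions index_token can now perform for a single token. The key and posting types are simplified (u32 stands in for DocIndex) and is_cjk is stubbed out; none of this is the crate's code:

    use std::collections::BTreeMap;

    use deunicode::deunicode_with_tofu;

    // Stand-in for meilidb_tokenizer::is_cjk; always false for this Latin sample.
    fn is_cjk_stub(_c: char) -> bool { false }

    fn main() {
        // Simplified index: word bytes -> postings.
        let mut indexed: BTreeMap<Vec<u8>, Vec<u32>> = BTreeMap::new();
        let (word, docindex) = ("Café", 0u32);

        // First insertion: the lowercased word, exactly as before this commit.
        let lower = word.to_lowercase();
        indexed.entry(lower.clone().into_bytes()).or_insert_with(Vec::new).push(docindex);

        // Second insertion: the unidecoded form, only for non-CJK words
        // whose transliteration actually differs.
        if !lower.contains(is_cjk_stub) {
            let unidecoded = deunicode_with_tofu(&lower, "");
            if unidecoded != lower {
                indexed.entry(unidecoded.into_bytes()).or_insert_with(Vec::new).push(docindex);
            }
        }

        // A query for either "café" or "cafe" can now find the document.
        assert_eq!(indexed.len(), 2);
    }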