mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 19:57:30 +01:00
Replace the token filter by a filter mapper
This commit is contained in:
parent
d21c80b865
commit
e5adfaade0
@ -22,7 +22,7 @@ use roaring::RoaringBitmap;
|
||||
use structopt::StructOpt;
|
||||
|
||||
use milli::heed_codec::{CsvStringRecordCodec, ByteorderXRoaringBitmapCodec};
|
||||
use milli::tokenizer::{simple_tokenizer, only_words};
|
||||
use milli::tokenizer::{simple_tokenizer, only_token};
|
||||
use milli::{SmallVec32, Index, DocumentId, BEU32};
|
||||
|
||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||
@ -290,7 +290,7 @@ impl Store {
|
||||
|
||||
let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
|
||||
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
|
||||
for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) {
|
||||
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
|
||||
let word = token.to_lowercase();
|
||||
let position = (attr * MAX_POSITION + pos) as u32;
|
||||
self.insert_word_docid(&word, document_id)?;
|
||||
|
@ -16,6 +16,6 @@ pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn only_words((t, _): &(TokenType, &str)) -> bool {
|
||||
*t == TokenType::Word
|
||||
pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> {
|
||||
if t == TokenType::Word { Some(w) } else { None }
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user