Replace the token filter with a `filter_map` adaptor

This commit is contained in:
Clément Renault 2020-09-22 10:24:31 +02:00
parent d21c80b865
commit e5adfaade0
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 4 additions and 4 deletions

View File

@ -22,7 +22,7 @@ use roaring::RoaringBitmap;
use structopt::StructOpt;
use milli::heed_codec::{CsvStringRecordCodec, ByteorderXRoaringBitmapCodec};
use milli::tokenizer::{simple_tokenizer, only_words};
use milli::tokenizer::{simple_tokenizer, only_token};
use milli::{SmallVec32, Index, DocumentId, BEU32};
const LMDB_MAX_KEY_LENGTH: usize = 511;
@ -290,7 +290,7 @@ impl Store {
let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) {
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr * MAX_POSITION + pos) as u32;
self.insert_word_docid(&word, document_id)?;

View File

@ -16,6 +16,6 @@ pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
})
}
pub fn only_words((t, _): &(TokenType, &str)) -> bool {
*t == TokenType::Word
/// Keeps the text of word tokens and drops everything else.
///
/// Takes a `(token type, token text)` pair by value and returns `Some(text)`
/// when the type equals `TokenType::Word`, or `None` for any other type.
/// Returning an `Option` lets this function be passed directly to
/// `Iterator::filter_map`, which both filters out non-word tokens and strips
/// the token type from the yielded item.
pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> {
    let is_word = t == TokenType::Word;
    if !is_word {
        return None;
    }
    Some(w)
}