fix: Always lowercase indexed tokens

This commit is contained in:
Clément Renault 2019-04-22 18:43:00 +02:00
parent 7dbf5d6319
commit f0268d49fe
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE

View File

@ -33,6 +33,10 @@ impl Indexer {
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
for token in Tokenizer::new(text) {
if token.word_index >= self.word_limit { break }
let lower = token.word.to_lowercase();
let token = Token { word: &lower, ..token };
let docindex = match token_to_docindex(id, attr, token) {
Some(docindex) => docindex,
None => break,
@ -49,6 +53,10 @@ impl Indexer {
let iter = iter.into_iter();
for token in SeqTokenizer::new(iter) {
if token.word_index >= self.word_limit { break }
let lower = token.word.to_lowercase();
let token = Token { word: &lower, ..token };
let docindex = match token_to_docindex(id, attr, token) {
Some(docindex) => docindex,
None => break,