Add a test to ensure that the indexer supports stop words

This commit is contained in:
Clément Renault 2019-10-29 16:04:48 +01:00
parent ff7dde7522
commit e9dce3ce81

View File

@ -170,20 +170,18 @@ fn index_token(
return false; return false;
} }
if stop_words.contains(&token.word) { if !stop_words.contains(&token.word) {
return false; match token_to_docindex(id, attr, token) {
} Some(docindex) => {
let word = Vec::from(token.word);
match token_to_docindex(id, attr, token) { words_doc_indexes
Some(docindex) => { .entry(word.clone())
let word = Vec::from(token.word); .or_insert_with(Vec::new)
words_doc_indexes .push(docindex);
.entry(word.clone()) docs_words.entry(id).or_insert_with(Vec::new).push(word);
.or_insert_with(Vec::new) }
.push(docindex); None => return false,
docs_words.entry(id).or_insert_with(Vec::new).push(word);
} }
None => return false,
} }
true true
@ -256,4 +254,33 @@ mod tests {
.get(&"léteindre".to_owned().into_bytes()) .get(&"léteindre".to_owned().into_bytes())
.is_some()); .is_some());
} }
/// Checks that words listed as stop words are left out of the index while
/// the other words of the same text are still indexed.
///
/// The stop-word set is `l`, `j`, `ai`, `de`; the indexed text is a French
/// sentence in which elided articles are attached with the typographic
/// apostrophe (U+2019).
#[test]
fn basic_stop_words() {
    // `from_dirty` is handed an unsorted list; the resulting set is what
    // gets fed to `fst::Set::from_iter`.
    let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
    let stop_words = fst::Set::from_iter(stop_words).unwrap();

    let mut indexer = RawIndexer::new(stop_words);

    let docid = DocumentId(0);
    let attr = SchemaAttr(0);
    // The apostrophes must be present: the assertions below look up "l",
    // "j", "ai", "aspirateur" and "eteindre" as words of their own, which
    // only makes sense if the elisions are written out in the input.
    let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
    indexer.index_text(docid, attr, text);

    let Indexed {
        words_doc_indexes, ..
    } = indexer.build();

    // Stop words must be absent from the index...
    assert!(words_doc_indexes.get(&b"l"[..]).is_none());
    assert!(words_doc_indexes.get(&b"j"[..]).is_none());
    assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
    assert!(words_doc_indexes.get(&b"de"[..]).is_none());

    // ...while the remaining words are expected under their ASCII form.
    assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
    assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());

    // with the ugly apostrophe...
    // The raw spelling, typographic apostrophe included, is expected to be
    // indexed as well.
    assert!(words_doc_indexes.get("l’éteindre".as_bytes()).is_some());
}
} }