Make the RawIndexer support stop words

This commit is contained in:
Clément Renault 2019-10-29 15:53:45 +01:00
parent a226fd23c3
commit ff7dde7522
2 changed files with 28 additions and 13 deletions

View File

@@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
 pub struct RawIndexer {
     word_limit: usize, // the maximum number of indexed words
+    stop_words: fst::Set,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
 }
@@ -21,13 +22,14 @@ pub struct Indexed {
 }

 impl RawIndexer {
-    pub fn new() -> RawIndexer {
-        RawIndexer::with_word_limit(1000)
+    pub fn new(stop_words: fst::Set) -> RawIndexer {
+        RawIndexer::with_word_limit(stop_words, 1000)
     }

-    pub fn with_word_limit(limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
         RawIndexer {
             word_limit: limit,
+            stop_words,
             words_doc_indexes: BTreeMap::new(),
             docs_words: HashMap::new(),
         }
@@ -56,6 +58,7 @@ impl RawIndexer {
             id,
             attr,
             self.word_limit,
+            &self.stop_words,
             &mut self.words_doc_indexes,
             &mut self.docs_words,
         );
@@ -87,6 +90,7 @@ impl RawIndexer {
             id,
             attr,
             self.word_limit,
+            &self.stop_words,
             &mut self.words_doc_indexes,
             &mut self.docs_words,
         );
@@ -118,6 +122,7 @@ impl RawIndexer {
             id,
             attr,
             self.word_limit,
+            &self.stop_words,
             &mut self.words_doc_indexes,
             &mut self.docs_words,
         );
@@ -152,17 +157,12 @@ impl RawIndexer {
     }
 }

-impl Default for RawIndexer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 fn index_token(
     token: Token,
     id: DocumentId,
     attr: SchemaAttr,
     word_limit: usize,
+    stop_words: &fst::Set,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
@@ -170,6 +170,10 @@ fn index_token(
         return false;
     }

+    if stop_words.contains(&token.word) {
+        return false;
+    }
+
     match token_to_docindex(id, attr, token) {
         Some(docindex) => {
             let word = Vec::from(token.word);
@@ -207,7 +211,7 @@ mod tests {
     #[test]
     fn strange_apostrophe() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());

         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
@@ -231,7 +235,7 @@ mod tests {
     #[test]
     fn strange_apostrophe_in_sequence() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());

         let docid = DocumentId(0);
         let attr = SchemaAttr(0);

View File

@@ -87,7 +87,6 @@ pub fn apply_documents_addition(
     addition: Vec<serde_json::Value>,
 ) -> MResult<()> {
     let mut documents_additions = HashMap::new();
-    let mut indexer = RawIndexer::new();

     let schema = match main_store.schema(writer)? {
         Some(schema) => schema,
@@ -124,7 +123,14 @@ pub fn apply_documents_addition(
         None => RankedMap::default(),
     };

+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. index the documents fields in the stores
+    let mut indexer = RawIndexer::new(stop_words);
+
     for (document_id, document) in documents_additions {
         let serializer = Serializer {
             txn: writer,
@@ -180,8 +186,13 @@ pub fn reindex_all_documents(
     postings_lists_store.clear(writer)?;
     docs_words_store.clear(writer)?;

+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
-    let mut indexer = RawIndexer::new();
+    let mut indexer = RawIndexer::new(stop_words);
     let mut ram_store = HashMap::new();

     for document_id in documents_ids_to_reindex {