feat: Reintroduce stopwords for the serializer

Clément Renault 2019-01-06 18:03:47 +01:00
parent a2f5e8aa25
commit 32f8908d71
7 changed files with 276 additions and 35 deletions
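This commit threads a borrowed stop-word set from UpdateBuilder::update_document down through the document serializers, so any token whose lowercased form is in the set is skipped at indexing time. Below is a minimal caller-side sketch, assuming update_path, schema and document are already built as in the tests further down; only the extra &HashSet<String> parameter comes from this commit, and the stop words themselves are made up:

    use hashbrown::HashSet;

    // Hypothetical stop-word set; the tests in this commit simply use HashSet::new().
    let mut stop_words = HashSet::new();
    stop_words.insert("the".to_string());
    stop_words.insert("of".to_string());

    let tokenizer_builder = DefaultBuilder::new();
    let mut builder = UpdateBuilder::new(update_path, schema);
    let docid = builder.update_document(&document, &tokenizer_builder, &stop_words)?;
    let update = builder.build()?;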

@@ -185,6 +185,7 @@ mod tests {
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use hashbrown::HashSet;
use tempfile::tempdir;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
@@ -194,6 +195,7 @@ mod tests {
#[test]
fn ingest_one_update_file() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
@@ -237,8 +239,8 @@ mod tests {
let tokenizer_builder = DefaultBuilder::new();
let mut builder = UpdateBuilder::new(update_path, schema);
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
@@ -258,6 +260,7 @@ mod tests {
#[test]
fn ingest_two_update_files() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
@@ -312,8 +315,8 @@ mod tests {
let update_path = dir.path().join("update-000.sst");
let mut builder = UpdateBuilder::new(update_path, schema.clone());
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
@@ -325,8 +328,8 @@ mod tests {
let update_path = dir.path().join("update-001.sst");
let mut builder = UpdateBuilder::new(update_path, schema);
docid2 = builder.update_document(&doc2, &tokenizer_builder)?;
docid3 = builder.update_document(&doc3, &tokenizer_builder)?;
docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
builder.build()?
};
@@ -364,8 +367,9 @@ mod bench {
use rand::distributions::Alphanumeric;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use rand::seq::SliceRandom;
use serde_derive::Serialize;
use rand::seq::SliceRandom;
use hashbrown::HashSet;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::UpdateBuilder;
@@ -394,6 +398,7 @@ mod bench {
#[bench]
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -421,7 +426,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -440,6 +445,7 @@ mod bench {
#[bench]
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -467,7 +473,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -487,6 +493,7 @@ mod bench {
#[ignore]
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -514,7 +521,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -533,6 +540,7 @@ mod bench {
#[bench]
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -560,7 +568,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -579,6 +587,7 @@ mod bench {
#[bench]
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -606,7 +615,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -626,6 +635,7 @@ mod bench {
#[ignore]
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -653,7 +663,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;

@@ -5,6 +5,7 @@ use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{DocumentId, DocIndex, Attribute, WordArea};
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
@@ -13,6 +14,7 @@ pub struct IndexerSerializer<'a, B> {
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
@@ -48,6 +50,7 @@ where B: TokenizerBuilder
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
// FIXME must u32::try_from instead
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
Ok(attribute) => attribute,
@@ -69,6 +72,8 @@ where B: TokenizerBuilder
// insert the exact representation
let word_lower = word.to_lowercase();
if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
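In serialize_str each token is lowercased and checked against the stop-word set before anything is written to the update; matching tokens are skipped with a continue, so they never reach the index. A simplified, self-contained sketch of that filtering step (the real method body is only partially visible in the hunk above, so this is an illustration rather than the exact code):

    use hashbrown::HashSet;

    // Keeps only the lowercased tokens that are not stop words.
    fn keep_indexable(tokens: &[&str], stop_words: &HashSet<String>) -> Vec<String> {
        let mut kept = Vec::new();
        for word in tokens {
            let word_lower = word.to_lowercase();
            if stop_words.contains(&word_lower) {
                continue; // stop words are dropped before indexing
            }
            kept.push(word_lower);
        }
        kept
    }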

@@ -1,3 +1,4 @@
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
@@ -14,6 +15,7 @@ pub struct Serializer<'a, B> {
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
@@ -139,6 +141,7 @@ where B: TokenizerBuilder
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
current_key_name: None,
})
}
@@ -154,6 +157,7 @@ where B: TokenizerBuilder
update: self.update,
document_id: self.document_id,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
})
}
@@ -174,6 +178,7 @@ pub struct MapSerializer<'a, B> {
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
pub current_key_name: Option<String>,
}
@@ -219,6 +224,7 @@ where B: TokenizerBuilder
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
@@ -237,6 +243,7 @@ pub struct StructSerializer<'a, B> {
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
@@ -264,6 +271,7 @@ where B: TokenizerBuilder
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
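The stop-word set is passed through Serializer, MapSerializer, StructSerializer and IndexerSerializer as a shared reference with the same lifetime 'a as the other borrowed fields, so each nested serializer copies a pointer and the set itself is never cloned per document. A hypothetical minimal illustration of that borrowing pattern (Outer and Inner are made-up names standing in for the nested serializer structs):

    use hashbrown::HashSet;

    struct Outer<'a> {
        stop_words: &'a HashSet<String>,
    }

    struct Inner<'a> {
        stop_words: &'a HashSet<String>,
    }

    impl<'a> Outer<'a> {
        fn inner(&self) -> Inner<'a> {
            // Copying the &HashSet is cheap; the set outlives both structs for 'a.
            Inner { stop_words: self.stop_words }
        }
    }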

@@ -1,6 +1,7 @@
use std::path::PathBuf;
use std::error::Error;
use hashbrown::HashSet;
use serde::Serialize;
use crate::database::serde::serializer::Serializer;
@@ -28,6 +29,7 @@ impl UpdateBuilder {
&mut self,
document: T,
tokenizer_builder: &B,
stop_words: &HashSet<String>,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
@@ -40,6 +42,7 @@ impl UpdateBuilder {
document_id: document_id,
tokenizer_builder: tokenizer_builder,
update: update,
stop_words: stop_words,
};
document.serialize(serializer)?;