feat: Reintroduce stopwords for the serializer

Clément Renault 2019-01-06 18:03:47 +01:00
parent a2f5e8aa25
commit 32f8908d71
7 changed files with 276 additions and 35 deletions

@@ -1,12 +1,13 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::error::Error;
use std::borrow::Cow;
use std::fs::File;
use hashbrown::HashMap;
use hashbrown::{HashMap, HashSet};
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;
@@ -26,6 +27,13 @@ pub struct Opt {
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The path to the list of stop words (one per line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words_path: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
#[derive(Serialize, Deserialize)]
@@ -34,37 +42,75 @@ struct Document<'a> (
HashMap<Cow<'a, str>, Cow<'a, str>>
);
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
let database = Database::create(database_path, &schema)?;
println!("start indexing...");
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
while rdr.read_record(&mut raw_record)? {
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
let mut i = 0;
let mut end_of_file = false;
update.update_document(&document, &tokenizer_builder)?;
while !end_of_file {
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(&document, &tokenizer_builder, &stop_words)?;
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
}
println!();
println!("building update...");
let update = update.build()?;
println!("ingesting update...");
database.ingest_update_file(update)?;
}
let update = update.build()?;
database.ingest_update_file(update)?;
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
@@ -74,8 +120,13 @@ fn main() -> Result<(), Box<Error>> {
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words_path {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let (elapsed, result) = elapsed::measure_time(|| {
index(schema, &opt.database_path, &opt.csv_data_path)
index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
});
if let Err(e) = result {
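The grouping logic added to index() is easier to follow in isolation: the outer loop starts a fresh update, the inner loop consumes records until the input is exhausted or update_group_size documents have been added, and each group is built and ingested before the next one starts. Below is a minimal sketch of that control flow, with hypothetical names and plain integers standing in for CSV records and the database:

fn batched(mut records: impl Iterator<Item = u32>, group_size: Option<usize>) {
    let mut i = 0;
    let mut end_of_input = false;

    while !end_of_input {
        // stands in for a fresh UpdateBuilder per group
        let mut group = Vec::new();

        loop {
            match records.next() {
                Some(record) => group.push(record),
                None => { end_of_input = true; break }
            }

            i += 1;
            if let Some(size) = group_size {
                if i % size == 0 { break }
            }
        }

        // stands in for update.build() and database.ingest_update_file(update)
        println!("ingesting a group of {} documents", group.len());
    }
}

When --update-group-size is not given, the inner loop only exits at end of input, so the example behaves like the previous single-update flow.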

@@ -95,7 +95,8 @@ or
other
ought
our
ours ourselves
ours
ourselves
out
over
own

misc/fr.stopwords.txt (new file, 163 lines)

@@ -0,0 +1,163 @@
au
aux
avec
ce
ces
dans
de
des
du
elle
en
et
eux
il
je
la
le
leur
lui
ma
mais
me
même
mes
moi
mon
ne
nos
notre
nous
on
ou
par
pas
pour
qu
que
qui
sa
se
ses
son
sur
ta
te
tes
toi
ton
tu
un
une
vos
votre
vous
c
d
j
l
à
m
n
s
t
y
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
ceci
celà
cet
cette
ici
ils
les
leurs
quel
quels
quelle
quelles
sans
soi
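Both stop-word lists are plain one-word-per-line files, so they can be consumed by the retrieve_stop_words helper added to the example above and passed via --stop-words. A hedged, standalone sketch of loading the French list (the helper is reproduced here under a hypothetical name so the snippet compiles on its own):

use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::path::Path;

use hashbrown::HashSet;

// same shape as the retrieve_stop_words helper from the example binary
fn load_stop_words(path: &Path) -> io::Result<HashSet<String>> {
    let reader = BufReader::new(File::open(path)?);
    let mut words = HashSet::new();
    for line in reader.lines() {
        words.insert(line?.trim().to_string());
    }
    Ok(words)
}

fn main() -> io::Result<()> {
    let words = load_stop_words(Path::new("misc/fr.stopwords.txt"))?;
    assert!(words.contains("le") && words.contains("été"));
    println!("{} French stop words loaded", words.len());
    Ok(())
}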

@@ -185,6 +185,7 @@ mod tests {
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use hashbrown::HashSet;
use tempfile::tempdir;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
@@ -194,6 +195,7 @@ mod tests {
#[test]
fn ingest_one_update_file() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
@@ -237,8 +239,8 @@
let tokenizer_builder = DefaultBuilder::new();
let mut builder = UpdateBuilder::new(update_path, schema);
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
@@ -258,6 +260,7 @@
#[test]
fn ingest_two_update_files() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
@@ -312,8 +315,8 @@
let update_path = dir.path().join("update-000.sst");
let mut builder = UpdateBuilder::new(update_path, schema.clone());
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
@@ -325,8 +328,8 @@
let update_path = dir.path().join("update-001.sst");
let mut builder = UpdateBuilder::new(update_path, schema);
docid2 = builder.update_document(&doc2, &tokenizer_builder)?;
docid3 = builder.update_document(&doc3, &tokenizer_builder)?;
docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
builder.build()?
};
@@ -364,8 +367,9 @@ mod bench {
use rand::distributions::Alphanumeric;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use rand::seq::SliceRandom;
use serde_derive::Serialize;
use rand::seq::SliceRandom;
use hashbrown::HashSet;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::UpdateBuilder;
@@ -394,6 +398,7 @@ mod bench {
#[bench]
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -421,7 +426,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -440,6 +445,7 @@ mod bench {
#[bench]
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -467,7 +473,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -487,6 +493,7 @@ mod bench {
#[ignore]
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -514,7 +521,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -533,6 +540,7 @@ mod bench {
#[bench]
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -560,7 +568,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -579,6 +587,7 @@ mod bench {
#[bench]
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -606,7 +615,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
@@ -626,6 +635,7 @@ mod bench {
#[ignore]
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
@@ -653,7 +663,7 @@ mod bench {
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder)?;
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
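All the tests and benches above pass an empty HashSet, which keeps their indexing behaviour unchanged. A test that wanted to exercise the filtering could build a non-empty set inline; a small hedged sketch (word choice arbitrary):

use hashbrown::HashSet;

fn english_stop_words() -> HashSet<String> {
    ["the", "of", "a"].iter().map(|w| w.to_string()).collect()
}

The resulting set would be bound to a local and passed by reference to the same update_document calls.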

@@ -5,6 +5,7 @@ use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{DocumentId, DocIndex, Attribute, WordArea};
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
@@ -13,6 +14,7 @@ pub struct IndexerSerializer<'a, B> {
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
@@ -48,6 +50,7 @@ where B: TokenizerBuilder
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
// FIXME: use u32::try_from instead
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
Ok(attribute) => attribute,
@@ -69,6 +72,8 @@ where B: TokenizerBuilder
// insert the exact representation
let word_lower = word.to_lowercase();
if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
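The stop-word check in serialize_str is a single membership test: each token is lowercased and, when the lowercased form is in the set, the serializer skips it before inserting either the exact or the unidecoded representation. A standalone sketch of that rule, with a slice of strings standing in for the tokenizer's Token stream (names hypothetical):

use hashbrown::HashSet;

fn indexed_words(words: &[&str], stop_words: &HashSet<String>) -> Vec<String> {
    words
        .iter()
        .map(|word| word.to_lowercase())
        .filter(|lower| !stop_words.contains(lower))
        .collect()
}

Note that the test is made on the lowercased but not unidecoded form, so accented entries such as "été" in the French list are matched as written.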

@@ -1,3 +1,4 @@
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
@@ -14,6 +15,7 @@ pub struct Serializer<'a, B> {
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
@@ -139,6 +141,7 @@ where B: TokenizerBuilder
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
current_key_name: None,
})
}
@@ -154,6 +157,7 @@ where B: TokenizerBuilder
update: self.update,
document_id: self.document_id,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
})
}
@@ -174,6 +178,7 @@ pub struct MapSerializer<'a, B> {
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
pub current_key_name: Option<String>,
}
@@ -219,6 +224,7 @@ where B: TokenizerBuilder
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
@@ -237,6 +243,7 @@ pub struct StructSerializer<'a, B> {
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
@ -264,6 +271,7 @@ where B: TokenizerBuilder
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
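Every serializer in this module gains the same stop_words: &'a HashSet<String> field, and each nested serializer is constructed by copying that borrow, so a single set is shared across the whole document without cloning. The shape of the plumbing, reduced to two hypothetical structs:

use hashbrown::HashSet;

struct Outer<'a> {
    stop_words: &'a HashSet<String>,
}

struct Inner<'a> {
    stop_words: &'a HashSet<String>,
}

impl<'a> Outer<'a> {
    fn nested(&self) -> Inner<'a> {
        // copies the shared reference, not the set itself
        Inner { stop_words: self.stop_words }
    }
}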

@@ -1,6 +1,7 @@
use std::path::PathBuf;
use std::error::Error;
use hashbrown::HashSet;
use serde::Serialize;
use crate::database::serde::serializer::Serializer;
@@ -28,6 +29,7 @@ impl UpdateBuilder {
&mut self,
document: T,
tokenizer_builder: &B,
stop_words: &HashSet<String>,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
@@ -40,6 +42,7 @@ impl UpdateBuilder {
document_id: document_id,
tokenizer_builder: tokenizer_builder,
update: update,
stop_words: stop_words,
};
document.serialize(serializer)?;
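For callers, the only visible change is the extra parameter: build the set once and hand a reference to every update_document call. A condensed, hedged sketch based on the tests above (the document type, its fields, and the Schema import path are illustrative assumptions):

use std::error::Error;
use std::path::PathBuf;

use hashbrown::HashSet;
use serde_derive::Serialize;

use crate::database::schema::Schema;
use crate::database::update::UpdateBuilder;
use crate::tokenizer::DefaultBuilder;

#[derive(Serialize)]
struct SimpleDoc {
    id: u64,
    title: String,
}

fn build_update(schema: Schema, update_path: PathBuf) -> Result<(), Box<Error>> {
    let stop_words: HashSet<String> =
        ["the", "of", "a"].iter().map(|w| w.to_string()).collect();

    let tokenizer_builder = DefaultBuilder::new();
    let mut builder = UpdateBuilder::new(update_path, schema);

    let doc = SimpleDoc { id: 0, title: String::from("the history of the sea") };
    builder.update_document(&doc, &tokenizer_builder, &stop_words)?;

    let _update_file = builder.build()?;
    Ok(())
}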