mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-22 12:54:26 +01:00)
feat: Reintroduce stopwords for the serializer
This commit is contained in:
  parent a2f5e8aa25
  commit 32f8908d71
@@ -1,12 +1,13 @@
 #[global_allocator]
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 
+use std::io::{self, BufRead, BufReader};
 use std::path::{Path, PathBuf};
 use std::error::Error;
 use std::borrow::Cow;
 use std::fs::File;
 
-use hashbrown::HashMap;
+use hashbrown::{HashMap, HashSet};
 use serde_derive::{Serialize, Deserialize};
 use structopt::StructOpt;
 
@@ -26,6 +27,13 @@ pub struct Opt {
     /// The path to the schema.
     #[structopt(long = "schema", parse(from_os_str))]
     pub schema_path: PathBuf,
+
+    /// The path to the list of stop words (one by line).
+    #[structopt(long = "stop-words", parse(from_os_str))]
+    pub stop_words_path: Option<PathBuf>,
+
+    #[structopt(long = "update-group-size")]
+    pub update_group_size: Option<usize>,
 }
 
 #[derive(Serialize, Deserialize)]
@@ -34,20 +42,32 @@ struct Document<'a> (
     HashMap<Cow<'a, str>, Cow<'a, str>>
 );
 
-fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
+fn index(
+    schema: Schema,
+    database_path: &Path,
+    csv_data_path: &Path,
+    update_group_size: Option<usize>,
+    stop_words: &HashSet<String>,
+) -> Result<Database, Box<Error>>
+{
     let database = Database::create(database_path, &schema)?;
 
     println!("start indexing...");
 
-    let tokenizer_builder = DefaultBuilder::new();
-    let update_path = tempfile::NamedTempFile::new()?;
-    let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);
-
     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();
     let headers = rdr.headers()?.clone();
 
-    while rdr.read_record(&mut raw_record)? {
+    let mut i = 0;
+    let mut end_of_file = false;
+
+    while !end_of_file {
+        let tokenizer_builder = DefaultBuilder::new();
+        let update_path = tempfile::NamedTempFile::new()?;
+        let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
+
+        loop {
+            end_of_file = !rdr.read_record(&mut raw_record)?;
+            if end_of_file { break }
+
             let document: Document = match raw_record.deserialize(Some(&headers)) {
                 Ok(document) => document,
                 Err(e) => {
@@ -56,15 +76,41 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
                 }
             };
 
-        update.update_document(&document, &tokenizer_builder)?;
+            update.update_document(&document, &tokenizer_builder, &stop_words)?;
 
             print!("\rindexing document {}", i);
             i += 1;
 
+            if let Some(group_size) = update_group_size {
+                if i % group_size == 0 { break }
+            }
         }
 
         println!();
 
         println!("building update...");
         let update = update.build()?;
         println!("ingesting update...");
         database.ingest_update_file(update)?;
     }
 
     Ok(database)
 }
 
+fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
+    let f = File::open(path)?;
+    let reader = BufReader::new(f);
+    let mut words = HashSet::new();
+
+    for line in reader.lines() {
+        let line = line?;
+        let word = line.trim().to_string();
+        words.insert(word);
+    }
+
+    Ok(words)
+}
+
 fn main() -> Result<(), Box<Error>> {
     let _ = env_logger::init();
     let opt = Opt::from_args();
@@ -74,8 +120,13 @@ fn main() -> Result<(), Box<Error>> {
         Schema::from_toml(file)?
     };
 
+    let stop_words = match opt.stop_words_path {
+        Some(ref path) => retrieve_stop_words(path)?,
+        None => HashSet::new(),
+    };
+
     let (elapsed, result) = elapsed::measure_time(|| {
-        index(schema, &opt.database_path, &opt.csv_data_path)
+        index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
     });
 
     if let Err(e) = result {
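Note: the reworked index function reads the CSV in groups, building and ingesting one update file per group, and loads the stop words once up front. Below is a minimal standalone sketch of that batching pattern, with std's HashSet and plain strings standing in for the example's real csv::Reader, UpdateBuilder, and Database types (all names in the sketch are illustrative, not the crate's API):

use std::collections::HashSet;

// One "group" is collected per outer iteration; the real code builds an
// update file from the group and ingests it into the database.
fn index_in_groups(records: Vec<String>, group_size: Option<usize>, stop_words: &HashSet<String>) {
    let mut records = records.into_iter();
    let mut i = 0;
    let mut end_of_file = false;

    while !end_of_file {
        let mut group = Vec::new(); // stands in for one UpdateBuilder

        loop {
            match records.next() {
                None => { end_of_file = true; break }
                Some(record) => {
                    // the real code tokenizes here, consulting stop_words
                    group.push(record);
                    i += 1;
                    if let Some(size) = group_size {
                        if i % size == 0 { break }
                    }
                }
            }
        }

        let _ = stop_words; // only documented, not used, in this sketch
        println!("ingesting a group of {} documents", group.len());
    }
}

fn main() {
    let stop_words = HashSet::new();
    let records = (0..10).map(|n| format!("doc {}", n)).collect();
    index_in_groups(records, Some(4), &stop_words); // prints groups of 4, 4, 2
}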
@@ -95,7 +95,8 @@ or
 other
 ought
 our
-ours ourselves
+ours
+ourselves
 out
 over
 own
misc/fr.stopwords.txt (new file, 163 lines)
@@ -0,0 +1,163 @@
+au
+aux
+avec
+ce
+ces
+dans
+de
+des
+du
+elle
+en
+et
+eux
+il
+je
+la
+le
+leur
+lui
+ma
+mais
+me
+même
+mes
+moi
+mon
+ne
+nos
+notre
+nous
+on
+ou
+par
+pas
+pour
+qu
+que
+qui
+sa
+se
+ses
+son
+sur
+ta
+te
+tes
+toi
+ton
+tu
+un
+une
+vos
+votre
+vous
+c
+d
+j
+l
+à
+m
+n
+s
+t
+y
+été
+étée
+étées
+étés
+étant
+suis
+es
+est
+sommes
+êtes
+sont
+serai
+seras
+sera
+serons
+serez
+seront
+serais
+serait
+serions
+seriez
+seraient
+étais
+était
+étions
+étiez
+étaient
+fus
+fut
+fûmes
+fûtes
+furent
+sois
+soit
+soyons
+soyez
+soient
+fusse
+fusses
+fût
+fussions
+fussiez
+fussent
+ayant
+eu
+eue
+eues
+eus
+ai
+as
+avons
+avez
+ont
+aurai
+auras
+aura
+aurons
+aurez
+auront
+aurais
+aurait
+aurions
+auriez
+auraient
+avais
+avait
+avions
+aviez
+avaient
+eut
+eûmes
+eûtes
+eurent
+aie
+aies
+ait
+ayons
+ayez
+aient
+eusse
+eusses
+eût
+eussions
+eussiez
+eussent
+ceci
+celà
+cet
+cette
+ici
+ils
+les
+leurs
+quel
+quels
+quelle
+quelles
+sans
+soi
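Note: a list like this one is consumed through the new --stop-words flag, one word per line. A small self-contained sketch of how it loads, reusing the retrieve_stop_words logic from this diff (with std's HashSet instead of hashbrown's, and assuming the program runs from the repository root so the relative path resolves):

use std::collections::HashSet;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::path::Path;

// Same shape as retrieve_stop_words above: trim each line, collect into a set.
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
    let f = File::open(path)?;
    let reader = BufReader::new(f);
    let mut words = HashSet::new();

    for line in reader.lines() {
        let word = line?.trim().to_string();
        words.insert(word);
    }

    Ok(words)
}

fn main() -> io::Result<()> {
    let stop_words = retrieve_stop_words(Path::new("misc/fr.stopwords.txt"))?;
    assert!(stop_words.contains("même"));    // entries are stored verbatim, lowercase
    assert!(!stop_words.contains("maison")); // ordinary words are not in the set
    Ok(())
}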
@@ -185,6 +185,7 @@ mod tests {
    use std::error::Error;
 
    use serde_derive::{Serialize, Deserialize};
+    use hashbrown::HashSet;
    use tempfile::tempdir;
 
    use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
@@ -194,6 +195,7 @@ mod tests {
    #[test]
    fn ingest_one_update_file() -> Result<(), Box<Error>> {
        let dir = tempdir()?;
+        let stop_words = HashSet::new();
 
        let rocksdb_path = dir.path().join("rocksdb.rdb");
 
@@ -237,8 +239,8 @@ mod tests {
            let tokenizer_builder = DefaultBuilder::new();
            let mut builder = UpdateBuilder::new(update_path, schema);
 
-            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
-            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
 
            builder.build()?
        };
@@ -258,6 +260,7 @@ mod tests {
    #[test]
    fn ingest_two_update_files() -> Result<(), Box<Error>> {
        let dir = tempdir()?;
+        let stop_words = HashSet::new();
 
        let rocksdb_path = dir.path().join("rocksdb.rdb");
 
@@ -312,8 +315,8 @@ mod tests {
            let update_path = dir.path().join("update-000.sst");
            let mut builder = UpdateBuilder::new(update_path, schema.clone());
 
-            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
-            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
 
            builder.build()?
        };
@@ -325,8 +328,8 @@ mod tests {
            let update_path = dir.path().join("update-001.sst");
            let mut builder = UpdateBuilder::new(update_path, schema);
 
-            docid2 = builder.update_document(&doc2, &tokenizer_builder)?;
-            docid3 = builder.update_document(&doc3, &tokenizer_builder)?;
+            docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
+            docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
 
            builder.build()?
        };
@@ -364,8 +367,9 @@ mod bench {
    use rand::distributions::Alphanumeric;
    use rand_xorshift::XorShiftRng;
    use rand::{Rng, SeedableRng};
-    use rand::seq::SliceRandom;
    use serde_derive::Serialize;
+    use rand::seq::SliceRandom;
+    use hashbrown::HashSet;
 
    use crate::tokenizer::DefaultBuilder;
    use crate::database::update::UpdateBuilder;
@@ -394,6 +398,7 @@ mod bench {
    #[bench]
    fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("title", STORED | INDEXED);
@@ -421,7 +426,7 @@ mod bench {
                title: random_sentences(rng.gen_range(1, 8), &mut rng),
                description: random_sentences(rng.gen_range(20, 200), &mut rng),
            };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }
 
        let update = builder.build()?;
@@ -440,6 +445,7 @@ mod bench {
    #[bench]
    fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("title", STORED | INDEXED);
@@ -467,7 +473,7 @@ mod bench {
                title: random_sentences(rng.gen_range(1, 8), &mut rng),
                description: random_sentences(rng.gen_range(20, 200), &mut rng),
            };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }
 
        let update = builder.build()?;
@@ -487,6 +493,7 @@ mod bench {
    #[ignore]
    fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("title", STORED | INDEXED);
@@ -514,7 +521,7 @@ mod bench {
                title: random_sentences(rng.gen_range(1, 8), &mut rng),
                description: random_sentences(rng.gen_range(20, 200), &mut rng),
            };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }
 
        let update = builder.build()?;
@@ -533,6 +540,7 @@ mod bench {
    #[bench]
    fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("title", STORED | INDEXED);
@@ -560,7 +568,7 @@ mod bench {
                title: random_sentences(rng.gen_range(1, 8), &mut rng),
                description: random_sentences(rng.gen_range(20, 200), &mut rng),
            };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }
 
        let update = builder.build()?;
@@ -579,6 +587,7 @@ mod bench {
    #[bench]
    fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("title", STORED | INDEXED);
@@ -606,7 +615,7 @@ mod bench {
                title: random_sentences(rng.gen_range(1, 8), &mut rng),
                description: random_sentences(rng.gen_range(20, 200), &mut rng),
            };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }
 
        let update = builder.build()?;
@@ -626,6 +635,7 @@ mod bench {
    #[ignore]
    fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("title", STORED | INDEXED);
@@ -653,7 +663,7 @@ mod bench {
                title: random_sentences(rng.gen_range(1, 8), &mut rng),
                description: random_sentences(rng.gen_range(20, 200), &mut rng),
            };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }
 
        let update = builder.build()?;
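Note: every test and bench above constructs HashSet::new() and threads it through update_document, so they exercise the new signature without filtering anything; an empty set can never match a token:

use std::collections::HashSet;

// With an empty set the serializer's contains check never fires, so the
// tests index every token exactly as they did before this change.
fn main() {
    let stop_words: HashSet<String> = HashSet::new();
    assert!(!stop_words.contains("the")); // nothing is ever filtered
}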
@@ -5,6 +5,7 @@ use crate::tokenizer::TokenizerBuilder;
 use crate::tokenizer::Token;
 use crate::{DocumentId, DocIndex, Attribute, WordArea};
 
+use hashbrown::HashSet;
 use serde::Serialize;
 use serde::ser;
 
@@ -13,6 +14,7 @@ pub struct IndexerSerializer<'a, B> {
    pub update: &'a mut DocumentUpdate,
    pub document_id: DocumentId,
    pub attribute: SchemaAttr,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
@@ -48,6 +50,7 @@ where B: TokenizerBuilder
 
    fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
        for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
+
            // FIXME must u32::try_from instead
            let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
                Ok(attribute) => attribute,
@@ -69,6 +72,8 @@ where B: TokenizerBuilder
            // insert the exact representation
            let word_lower = word.to_lowercase();
 
+            if self.stop_words.contains(&word_lower) { continue }
+
            // and the unidecoded lowercased version
            let word_unidecoded = unidecode::unidecode(word).to_lowercase();
            if word_lower != word_unidecoded {
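Note: the filter runs on the lowercased token, before either the exact lowercase form or the unidecoded form is inserted, so a stop word is dropped in all of its representations. A self-contained sketch of that control flow, where split_whitespace and ascii_fold are toy stand-ins for the real tokenizer and unidecode::unidecode:

use std::collections::HashSet;

// Toy stand-in for unidecode: fold a few accented characters to ASCII.
fn ascii_fold(s: &str) -> String {
    s.chars()
        .map(|c| match c {
            'é' | 'è' | 'ê' => 'e',
            'à' | 'â' => 'a',
            'ô' => 'o',
            other => other,
        })
        .collect()
}

// Mirrors serialize_str: lowercase, skip stop words, then index both forms.
fn index_text(text: &str, stop_words: &HashSet<String>, indexed: &mut Vec<String>) {
    for word in text.split_whitespace() {
        let word_lower = word.to_lowercase();
        if stop_words.contains(&word_lower) { continue } // the new check

        indexed.push(word_lower.clone());            // exact lowercase form
        let word_unidecoded = ascii_fold(&word_lower); // ASCII-folded form
        if word_lower != word_unidecoded {
            indexed.push(word_unidecoded);
        }
    }
}

fn main() {
    let stop_words: HashSet<String> = ["le".to_string()].into_iter().collect();
    let mut indexed = Vec::new();
    index_text("Le chat rêve", &stop_words, &mut indexed);
    assert_eq!(indexed, vec!["chat", "rêve", "reve"]); // "le" never reaches the index
}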
@@ -1,3 +1,4 @@
+use hashbrown::HashSet;
 use serde::Serialize;
 use serde::ser;
 
@@ -14,6 +15,7 @@ pub struct Serializer<'a, B> {
    pub update: &'a mut DocumentUpdate,
    pub document_id: DocumentId,
    pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::Serializer for Serializer<'a, B>
@@ -139,6 +141,7 @@ where B: TokenizerBuilder
            document_id: self.document_id,
            update: self.update,
            tokenizer_builder: self.tokenizer_builder,
+            stop_words: self.stop_words,
            current_key_name: None,
        })
    }
@@ -154,6 +157,7 @@ where B: TokenizerBuilder
            update: self.update,
            document_id: self.document_id,
            tokenizer_builder: self.tokenizer_builder,
+            stop_words: self.stop_words,
        })
    }
 
@@ -174,6 +178,7 @@ pub struct MapSerializer<'a, B> {
    pub document_id: DocumentId,
    pub update: &'a mut DocumentUpdate,
    pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
    pub current_key_name: Option<String>,
 }
 
@@ -219,6 +224,7 @@ where B: TokenizerBuilder
                tokenizer_builder: self.tokenizer_builder,
                document_id: self.document_id,
                attribute: attr,
+                stop_words: self.stop_words,
            };
            value.serialize(serializer)?;
        }
@@ -237,6 +243,7 @@ pub struct StructSerializer<'a, B> {
    pub document_id: DocumentId,
    pub update: &'a mut DocumentUpdate,
    pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
@@ -264,6 +271,7 @@ where B: TokenizerBuilder
                tokenizer_builder: self.tokenizer_builder,
                document_id: self.document_id,
                attribute: attr,
+                stop_words: self.stop_words,
            };
            value.serialize(serializer)?;
        }
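Note: the set travels through Serializer, MapSerializer, StructSerializer, and IndexerSerializer as the same &'a HashSet<String> borrow; copying a shared reference is only a pointer copy, so the plumbing costs nothing at runtime. A minimal illustration (Outer and Inner are illustrative stand-ins for the serializer structs):

use std::collections::HashSet;

// Shared borrows are Copy: every sub-serializer can hold the same
// &HashSet without cloning the underlying set.
struct Outer<'a> { stop_words: &'a HashSet<String> }
struct Inner<'a> { stop_words: &'a HashSet<String> }

fn main() {
    let words: HashSet<String> = HashSet::new();
    let outer = Outer { stop_words: &words };
    let inner = Inner { stop_words: outer.stop_words }; // pointer copy only
    assert!(std::ptr::eq(outer.stop_words, inner.stop_words));
}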
@@ -1,6 +1,7 @@
 use std::path::PathBuf;
 use std::error::Error;
 
+use hashbrown::HashSet;
 use serde::Serialize;
 
 use crate::database::serde::serializer::Serializer;
@@ -28,6 +29,7 @@ impl UpdateBuilder {
        &mut self,
        document: T,
        tokenizer_builder: &B,
+        stop_words: &HashSet<String>,
    ) -> Result<DocumentId, SerializerError>
    where T: Serialize,
          B: TokenizerBuilder,
@@ -40,6 +42,7 @@ impl UpdateBuilder {
            document_id: document_id,
            tokenizer_builder: tokenizer_builder,
            update: update,
+            stop_words: stop_words,
        };
 
        document.serialize(serializer)?;
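Note: this is the signature every call site above adapts to: update_document now takes the stop-word set as a third argument. A self-contained mock of the call shape (the mock UpdateBuilder here is illustrative, not the crate's type; only the extra &HashSet<String> parameter mirrors the real change):

use std::collections::HashSet;

struct UpdateBuilder { docs: Vec<String> }

impl UpdateBuilder {
    fn new() -> Self { UpdateBuilder { docs: Vec::new() } }

    // Mirrors: update_document(&document, &tokenizer_builder, &stop_words)
    fn update_document(&mut self, document: &str, stop_words: &HashSet<String>) -> usize {
        let kept: Vec<&str> = document
            .split_whitespace()
            .filter(|w| !stop_words.contains(&w.to_lowercase()))
            .collect();
        self.docs.push(kept.join(" "));
        self.docs.len() - 1 // stand-in for the returned DocumentId
    }
}

fn main() {
    let stop_words: HashSet<String> = ["the".to_string(), "a".to_string()].into_iter().collect();
    let mut builder = UpdateBuilder::new();
    let docid = builder.update_document("the quick brown fox", &stop_words);
    assert_eq!(docid, 0);
    assert_eq!(builder.docs[0], "quick brown fox"); // "the" was filtered out
}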