mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
feat: Reintroduce stopwords for the serializer
This commit is contained in:
parent
a2f5e8aa25
commit
32f8908d71
@ -1,12 +1,13 @@
|
|||||||
#[global_allocator]
|
#[global_allocator]
|
||||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||||
|
|
||||||
|
use std::io::{self, BufRead, BufReader};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
|
||||||
use hashbrown::HashMap;
|
use hashbrown::{HashMap, HashSet};
|
||||||
use serde_derive::{Serialize, Deserialize};
|
use serde_derive::{Serialize, Deserialize};
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
@ -26,6 +27,13 @@ pub struct Opt {
|
|||||||
/// The path to the schema.
|
/// The path to the schema.
|
||||||
#[structopt(long = "schema", parse(from_os_str))]
|
#[structopt(long = "schema", parse(from_os_str))]
|
||||||
pub schema_path: PathBuf,
|
pub schema_path: PathBuf,
|
||||||
|
|
||||||
|
/// The path to the list of stop words (one by line).
|
||||||
|
#[structopt(long = "stop-words", parse(from_os_str))]
|
||||||
|
pub stop_words_path: Option<PathBuf>,
|
||||||
|
|
||||||
|
#[structopt(long = "update-group-size")]
|
||||||
|
pub update_group_size: Option<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
@ -34,20 +42,32 @@ struct Document<'a> (
|
|||||||
HashMap<Cow<'a, str>, Cow<'a, str>>
|
HashMap<Cow<'a, str>, Cow<'a, str>>
|
||||||
);
|
);
|
||||||
|
|
||||||
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
|
fn index(
|
||||||
|
schema: Schema,
|
||||||
|
database_path: &Path,
|
||||||
|
csv_data_path: &Path,
|
||||||
|
update_group_size: Option<usize>,
|
||||||
|
stop_words: &HashSet<String>,
|
||||||
|
) -> Result<Database, Box<Error>>
|
||||||
|
{
|
||||||
let database = Database::create(database_path, &schema)?;
|
let database = Database::create(database_path, &schema)?;
|
||||||
|
|
||||||
println!("start indexing...");
|
|
||||||
|
|
||||||
let tokenizer_builder = DefaultBuilder::new();
|
|
||||||
let update_path = tempfile::NamedTempFile::new()?;
|
|
||||||
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);
|
|
||||||
|
|
||||||
let mut rdr = csv::Reader::from_path(csv_data_path)?;
|
let mut rdr = csv::Reader::from_path(csv_data_path)?;
|
||||||
let mut raw_record = csv::StringRecord::new();
|
let mut raw_record = csv::StringRecord::new();
|
||||||
let headers = rdr.headers()?.clone();
|
let headers = rdr.headers()?.clone();
|
||||||
|
|
||||||
while rdr.read_record(&mut raw_record)? {
|
let mut i = 0;
|
||||||
|
let mut end_of_file = false;
|
||||||
|
|
||||||
|
while !end_of_file {
|
||||||
|
let tokenizer_builder = DefaultBuilder::new();
|
||||||
|
let update_path = tempfile::NamedTempFile::new()?;
|
||||||
|
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
|
||||||
|
|
||||||
|
loop {
|
||||||
|
end_of_file = !rdr.read_record(&mut raw_record)?;
|
||||||
|
if end_of_file { break }
|
||||||
|
|
||||||
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
||||||
Ok(document) => document,
|
Ok(document) => document,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@ -56,15 +76,41 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<D
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
update.update_document(&document, &tokenizer_builder)?;
|
update.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
|
|
||||||
|
print!("\rindexing document {}", i);
|
||||||
|
i += 1;
|
||||||
|
|
||||||
|
if let Some(group_size) = update_group_size {
|
||||||
|
if i % group_size == 0 { break }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
println!();
|
||||||
|
|
||||||
|
println!("building update...");
|
||||||
let update = update.build()?;
|
let update = update.build()?;
|
||||||
|
println!("ingesting update...");
|
||||||
database.ingest_update_file(update)?;
|
database.ingest_update_file(update)?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(database)
|
Ok(database)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Loads the stop words stored in the file at `path`, one word per line.
///
/// Each line is trimmed and lowercased before being stored: the indexing
/// serializer checks the lowercased form of every token against this set,
/// so an entry kept in any other case would never match. Lines that are
/// empty once trimmed are skipped so the empty string never ends up in
/// the set.
///
/// # Errors
///
/// Returns the underlying `io::Error` if the file cannot be opened or if
/// a line cannot be read (for example because it is not valid UTF-8).
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
    let reader = BufReader::new(File::open(path)?);
    let mut words = HashSet::new();

    for line in reader.lines() {
        let line = line?;
        let word = line.trim();

        // A blank line carries no stop word; do not store "".
        if word.is_empty() {
            continue;
        }

        // Normalize to the same form the serializer uses for its lookup.
        words.insert(word.to_lowercase());
    }

    Ok(words)
}
|
||||||
|
|
||||||
fn main() -> Result<(), Box<Error>> {
|
fn main() -> Result<(), Box<Error>> {
|
||||||
let _ = env_logger::init();
|
let _ = env_logger::init();
|
||||||
let opt = Opt::from_args();
|
let opt = Opt::from_args();
|
||||||
@ -74,8 +120,13 @@ fn main() -> Result<(), Box<Error>> {
|
|||||||
Schema::from_toml(file)?
|
Schema::from_toml(file)?
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let stop_words = match opt.stop_words_path {
|
||||||
|
Some(ref path) => retrieve_stop_words(path)?,
|
||||||
|
None => HashSet::new(),
|
||||||
|
};
|
||||||
|
|
||||||
let (elapsed, result) = elapsed::measure_time(|| {
|
let (elapsed, result) = elapsed::measure_time(|| {
|
||||||
index(schema, &opt.database_path, &opt.csv_data_path)
|
index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
|
||||||
});
|
});
|
||||||
|
|
||||||
if let Err(e) = result {
|
if let Err(e) = result {
|
||||||
|
@ -95,7 +95,8 @@ or
|
|||||||
other
|
other
|
||||||
ought
|
ought
|
||||||
our
|
our
|
||||||
ours ourselves
|
ours
|
||||||
|
ourselves
|
||||||
out
|
out
|
||||||
over
|
over
|
||||||
own
|
own
|
||||||
|
163
misc/fr.stopwords.txt
Normal file
163
misc/fr.stopwords.txt
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
au
|
||||||
|
aux
|
||||||
|
avec
|
||||||
|
ce
|
||||||
|
ces
|
||||||
|
dans
|
||||||
|
de
|
||||||
|
des
|
||||||
|
du
|
||||||
|
elle
|
||||||
|
en
|
||||||
|
et
|
||||||
|
eux
|
||||||
|
il
|
||||||
|
je
|
||||||
|
la
|
||||||
|
le
|
||||||
|
leur
|
||||||
|
lui
|
||||||
|
ma
|
||||||
|
mais
|
||||||
|
me
|
||||||
|
même
|
||||||
|
mes
|
||||||
|
moi
|
||||||
|
mon
|
||||||
|
ne
|
||||||
|
nos
|
||||||
|
notre
|
||||||
|
nous
|
||||||
|
on
|
||||||
|
ou
|
||||||
|
par
|
||||||
|
pas
|
||||||
|
pour
|
||||||
|
qu
|
||||||
|
que
|
||||||
|
qui
|
||||||
|
sa
|
||||||
|
se
|
||||||
|
ses
|
||||||
|
son
|
||||||
|
sur
|
||||||
|
ta
|
||||||
|
te
|
||||||
|
tes
|
||||||
|
toi
|
||||||
|
ton
|
||||||
|
tu
|
||||||
|
un
|
||||||
|
une
|
||||||
|
vos
|
||||||
|
votre
|
||||||
|
vous
|
||||||
|
c
|
||||||
|
d
|
||||||
|
j
|
||||||
|
l
|
||||||
|
à
|
||||||
|
m
|
||||||
|
n
|
||||||
|
s
|
||||||
|
t
|
||||||
|
y
|
||||||
|
été
|
||||||
|
étée
|
||||||
|
étées
|
||||||
|
étés
|
||||||
|
étant
|
||||||
|
suis
|
||||||
|
es
|
||||||
|
est
|
||||||
|
sommes
|
||||||
|
êtes
|
||||||
|
sont
|
||||||
|
serai
|
||||||
|
seras
|
||||||
|
sera
|
||||||
|
serons
|
||||||
|
serez
|
||||||
|
seront
|
||||||
|
serais
|
||||||
|
serait
|
||||||
|
serions
|
||||||
|
seriez
|
||||||
|
seraient
|
||||||
|
étais
|
||||||
|
était
|
||||||
|
étions
|
||||||
|
étiez
|
||||||
|
étaient
|
||||||
|
fus
|
||||||
|
fut
|
||||||
|
fûmes
|
||||||
|
fûtes
|
||||||
|
furent
|
||||||
|
sois
|
||||||
|
soit
|
||||||
|
soyons
|
||||||
|
soyez
|
||||||
|
soient
|
||||||
|
fusse
|
||||||
|
fusses
|
||||||
|
fût
|
||||||
|
fussions
|
||||||
|
fussiez
|
||||||
|
fussent
|
||||||
|
ayant
|
||||||
|
eu
|
||||||
|
eue
|
||||||
|
eues
|
||||||
|
eus
|
||||||
|
ai
|
||||||
|
as
|
||||||
|
avons
|
||||||
|
avez
|
||||||
|
ont
|
||||||
|
aurai
|
||||||
|
auras
|
||||||
|
aura
|
||||||
|
aurons
|
||||||
|
aurez
|
||||||
|
auront
|
||||||
|
aurais
|
||||||
|
aurait
|
||||||
|
aurions
|
||||||
|
auriez
|
||||||
|
auraient
|
||||||
|
avais
|
||||||
|
avait
|
||||||
|
avions
|
||||||
|
aviez
|
||||||
|
avaient
|
||||||
|
eut
|
||||||
|
eûmes
|
||||||
|
eûtes
|
||||||
|
eurent
|
||||||
|
aie
|
||||||
|
aies
|
||||||
|
ait
|
||||||
|
ayons
|
||||||
|
ayez
|
||||||
|
aient
|
||||||
|
eusse
|
||||||
|
eusses
|
||||||
|
eût
|
||||||
|
eussions
|
||||||
|
eussiez
|
||||||
|
eussent
|
||||||
|
ceci
|
||||||
|
celà
|
||||||
|
cet
|
||||||
|
cette
|
||||||
|
ici
|
||||||
|
ils
|
||||||
|
les
|
||||||
|
leurs
|
||||||
|
quel
|
||||||
|
quels
|
||||||
|
quelle
|
||||||
|
quelles
|
||||||
|
sans
|
||||||
|
soi
|
@ -185,6 +185,7 @@ mod tests {
|
|||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
|
||||||
use serde_derive::{Serialize, Deserialize};
|
use serde_derive::{Serialize, Deserialize};
|
||||||
|
use hashbrown::HashSet;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||||
@ -194,6 +195,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn ingest_one_update_file() -> Result<(), Box<Error>> {
|
fn ingest_one_update_file() -> Result<(), Box<Error>> {
|
||||||
let dir = tempdir()?;
|
let dir = tempdir()?;
|
||||||
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
||||||
|
|
||||||
@ -237,8 +239,8 @@ mod tests {
|
|||||||
let tokenizer_builder = DefaultBuilder::new();
|
let tokenizer_builder = DefaultBuilder::new();
|
||||||
let mut builder = UpdateBuilder::new(update_path, schema);
|
let mut builder = UpdateBuilder::new(update_path, schema);
|
||||||
|
|
||||||
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
|
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||||
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
|
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||||
|
|
||||||
builder.build()?
|
builder.build()?
|
||||||
};
|
};
|
||||||
@ -258,6 +260,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn ingest_two_update_files() -> Result<(), Box<Error>> {
|
fn ingest_two_update_files() -> Result<(), Box<Error>> {
|
||||||
let dir = tempdir()?;
|
let dir = tempdir()?;
|
||||||
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
||||||
|
|
||||||
@ -312,8 +315,8 @@ mod tests {
|
|||||||
let update_path = dir.path().join("update-000.sst");
|
let update_path = dir.path().join("update-000.sst");
|
||||||
let mut builder = UpdateBuilder::new(update_path, schema.clone());
|
let mut builder = UpdateBuilder::new(update_path, schema.clone());
|
||||||
|
|
||||||
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
|
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||||
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
|
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||||
|
|
||||||
builder.build()?
|
builder.build()?
|
||||||
};
|
};
|
||||||
@ -325,8 +328,8 @@ mod tests {
|
|||||||
let update_path = dir.path().join("update-001.sst");
|
let update_path = dir.path().join("update-001.sst");
|
||||||
let mut builder = UpdateBuilder::new(update_path, schema);
|
let mut builder = UpdateBuilder::new(update_path, schema);
|
||||||
|
|
||||||
docid2 = builder.update_document(&doc2, &tokenizer_builder)?;
|
docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
|
||||||
docid3 = builder.update_document(&doc3, &tokenizer_builder)?;
|
docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
|
||||||
|
|
||||||
builder.build()?
|
builder.build()?
|
||||||
};
|
};
|
||||||
@ -364,8 +367,9 @@ mod bench {
|
|||||||
use rand::distributions::Alphanumeric;
|
use rand::distributions::Alphanumeric;
|
||||||
use rand_xorshift::XorShiftRng;
|
use rand_xorshift::XorShiftRng;
|
||||||
use rand::{Rng, SeedableRng};
|
use rand::{Rng, SeedableRng};
|
||||||
use rand::seq::SliceRandom;
|
|
||||||
use serde_derive::Serialize;
|
use serde_derive::Serialize;
|
||||||
|
use rand::seq::SliceRandom;
|
||||||
|
use hashbrown::HashSet;
|
||||||
|
|
||||||
use crate::tokenizer::DefaultBuilder;
|
use crate::tokenizer::DefaultBuilder;
|
||||||
use crate::database::update::UpdateBuilder;
|
use crate::database::update::UpdateBuilder;
|
||||||
@ -394,6 +398,7 @@ mod bench {
|
|||||||
#[bench]
|
#[bench]
|
||||||
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||||
let dir = tempfile::tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let mut builder = SchemaBuilder::with_identifier("id");
|
let mut builder = SchemaBuilder::with_identifier("id");
|
||||||
builder.new_attribute("title", STORED | INDEXED);
|
builder.new_attribute("title", STORED | INDEXED);
|
||||||
@ -421,7 +426,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let update = builder.build()?;
|
||||||
@ -440,6 +445,7 @@ mod bench {
|
|||||||
#[bench]
|
#[bench]
|
||||||
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||||
let dir = tempfile::tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let mut builder = SchemaBuilder::with_identifier("id");
|
let mut builder = SchemaBuilder::with_identifier("id");
|
||||||
builder.new_attribute("title", STORED | INDEXED);
|
builder.new_attribute("title", STORED | INDEXED);
|
||||||
@ -467,7 +473,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let update = builder.build()?;
|
||||||
@ -487,6 +493,7 @@ mod bench {
|
|||||||
#[ignore]
|
#[ignore]
|
||||||
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||||
let dir = tempfile::tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let mut builder = SchemaBuilder::with_identifier("id");
|
let mut builder = SchemaBuilder::with_identifier("id");
|
||||||
builder.new_attribute("title", STORED | INDEXED);
|
builder.new_attribute("title", STORED | INDEXED);
|
||||||
@ -514,7 +521,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let update = builder.build()?;
|
||||||
@ -533,6 +540,7 @@ mod bench {
|
|||||||
#[bench]
|
#[bench]
|
||||||
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||||
let dir = tempfile::tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let mut builder = SchemaBuilder::with_identifier("id");
|
let mut builder = SchemaBuilder::with_identifier("id");
|
||||||
builder.new_attribute("title", STORED | INDEXED);
|
builder.new_attribute("title", STORED | INDEXED);
|
||||||
@ -560,7 +568,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let update = builder.build()?;
|
||||||
@ -579,6 +587,7 @@ mod bench {
|
|||||||
#[bench]
|
#[bench]
|
||||||
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||||
let dir = tempfile::tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let mut builder = SchemaBuilder::with_identifier("id");
|
let mut builder = SchemaBuilder::with_identifier("id");
|
||||||
builder.new_attribute("title", STORED | INDEXED);
|
builder.new_attribute("title", STORED | INDEXED);
|
||||||
@ -606,7 +615,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let update = builder.build()?;
|
||||||
@ -626,6 +635,7 @@ mod bench {
|
|||||||
#[ignore]
|
#[ignore]
|
||||||
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||||
let dir = tempfile::tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let mut builder = SchemaBuilder::with_identifier("id");
|
let mut builder = SchemaBuilder::with_identifier("id");
|
||||||
builder.new_attribute("title", STORED | INDEXED);
|
builder.new_attribute("title", STORED | INDEXED);
|
||||||
@ -653,7 +663,7 @@ mod bench {
|
|||||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||||
};
|
};
|
||||||
builder.update_document(&document, &tokenizer_builder)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let update = builder.build()?;
|
||||||
|
@ -5,6 +5,7 @@ use crate::tokenizer::TokenizerBuilder;
|
|||||||
use crate::tokenizer::Token;
|
use crate::tokenizer::Token;
|
||||||
use crate::{DocumentId, DocIndex, Attribute, WordArea};
|
use crate::{DocumentId, DocIndex, Attribute, WordArea};
|
||||||
|
|
||||||
|
use hashbrown::HashSet;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
|
|
||||||
@ -13,6 +14,7 @@ pub struct IndexerSerializer<'a, B> {
|
|||||||
pub update: &'a mut DocumentUpdate,
|
pub update: &'a mut DocumentUpdate,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub attribute: SchemaAttr,
|
pub attribute: SchemaAttr,
|
||||||
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
||||||
@ -48,6 +50,7 @@ where B: TokenizerBuilder
|
|||||||
|
|
||||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||||
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
|
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
|
||||||
|
|
||||||
// FIXME must u32::try_from instead
|
// FIXME must u32::try_from instead
|
||||||
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
|
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
|
||||||
Ok(attribute) => attribute,
|
Ok(attribute) => attribute,
|
||||||
@ -69,6 +72,8 @@ where B: TokenizerBuilder
|
|||||||
// insert the exact representation
|
// insert the exact representation
|
||||||
let word_lower = word.to_lowercase();
|
let word_lower = word.to_lowercase();
|
||||||
|
|
||||||
|
if self.stop_words.contains(&word_lower) { continue }
|
||||||
|
|
||||||
// and the unidecoded lowercased version
|
// and the unidecoded lowercased version
|
||||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||||
if word_lower != word_unidecoded {
|
if word_lower != word_unidecoded {
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
use hashbrown::HashSet;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
|
|
||||||
@ -14,6 +15,7 @@ pub struct Serializer<'a, B> {
|
|||||||
pub update: &'a mut DocumentUpdate,
|
pub update: &'a mut DocumentUpdate,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||||
@ -139,6 +141,7 @@ where B: TokenizerBuilder
|
|||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
update: self.update,
|
update: self.update,
|
||||||
tokenizer_builder: self.tokenizer_builder,
|
tokenizer_builder: self.tokenizer_builder,
|
||||||
|
stop_words: self.stop_words,
|
||||||
current_key_name: None,
|
current_key_name: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -154,6 +157,7 @@ where B: TokenizerBuilder
|
|||||||
update: self.update,
|
update: self.update,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
tokenizer_builder: self.tokenizer_builder,
|
tokenizer_builder: self.tokenizer_builder,
|
||||||
|
stop_words: self.stop_words,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -174,6 +178,7 @@ pub struct MapSerializer<'a, B> {
|
|||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub update: &'a mut DocumentUpdate,
|
pub update: &'a mut DocumentUpdate,
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
|
pub stop_words: &'a HashSet<String>,
|
||||||
pub current_key_name: Option<String>,
|
pub current_key_name: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -219,6 +224,7 @@ where B: TokenizerBuilder
|
|||||||
tokenizer_builder: self.tokenizer_builder,
|
tokenizer_builder: self.tokenizer_builder,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
attribute: attr,
|
attribute: attr,
|
||||||
|
stop_words: self.stop_words,
|
||||||
};
|
};
|
||||||
value.serialize(serializer)?;
|
value.serialize(serializer)?;
|
||||||
}
|
}
|
||||||
@ -237,6 +243,7 @@ pub struct StructSerializer<'a, B> {
|
|||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub update: &'a mut DocumentUpdate,
|
pub update: &'a mut DocumentUpdate,
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||||
@ -264,6 +271,7 @@ where B: TokenizerBuilder
|
|||||||
tokenizer_builder: self.tokenizer_builder,
|
tokenizer_builder: self.tokenizer_builder,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
attribute: attr,
|
attribute: attr,
|
||||||
|
stop_words: self.stop_words,
|
||||||
};
|
};
|
||||||
value.serialize(serializer)?;
|
value.serialize(serializer)?;
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
|
||||||
|
use hashbrown::HashSet;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
|
||||||
use crate::database::serde::serializer::Serializer;
|
use crate::database::serde::serializer::Serializer;
|
||||||
@ -28,6 +29,7 @@ impl UpdateBuilder {
|
|||||||
&mut self,
|
&mut self,
|
||||||
document: T,
|
document: T,
|
||||||
tokenizer_builder: &B,
|
tokenizer_builder: &B,
|
||||||
|
stop_words: &HashSet<String>,
|
||||||
) -> Result<DocumentId, SerializerError>
|
) -> Result<DocumentId, SerializerError>
|
||||||
where T: Serialize,
|
where T: Serialize,
|
||||||
B: TokenizerBuilder,
|
B: TokenizerBuilder,
|
||||||
@ -40,6 +42,7 @@ impl UpdateBuilder {
|
|||||||
document_id: document_id,
|
document_id: document_id,
|
||||||
tokenizer_builder: tokenizer_builder,
|
tokenizer_builder: tokenizer_builder,
|
||||||
update: update,
|
update: update,
|
||||||
|
stop_words: stop_words,
|
||||||
};
|
};
|
||||||
|
|
||||||
document.serialize(serializer)?;
|
document.serialize(serializer)?;
|
||||||
|
Loading…
Reference in New Issue
Block a user