Merge pull request #67 from Kerollmops/reintroduce-stop-words

Reintroduce stop words
Clément Renault 2019-01-07 13:29:23 +01:00 committed by GitHub
commit 8c781a4d05
8 changed files with 280 additions and 36 deletions
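Judging from the new command-line options added below, the example indexer can now be pointed at a stop-word file and asked to ingest documents in groups. A hypothetical invocation (the binary name and the remaining arguments are placeholders; only --schema, --stop-words and --update-group-size come from this diff) might look like:

    ./create-index --schema schema.toml --stop-words misc/fr.stopwords.txt --update-group-size 50000 ...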

View File

@@ -1,12 +1,13 @@
 #[global_allocator]
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

+use std::io::{self, BufRead, BufReader};
 use std::path::{Path, PathBuf};
 use std::error::Error;
 use std::borrow::Cow;
 use std::fs::File;

-use hashbrown::HashMap;
+use hashbrown::{HashMap, HashSet};
 use serde_derive::{Serialize, Deserialize};
 use structopt::StructOpt;
@@ -26,6 +27,13 @@ pub struct Opt {
     /// The path to the schema.
     #[structopt(long = "schema", parse(from_os_str))]
     pub schema_path: PathBuf,
+
+    /// The path to the list of stop words (one by line).
+    #[structopt(long = "stop-words", parse(from_os_str))]
+    pub stop_words_path: Option<PathBuf>,
+
+    #[structopt(long = "update-group-size")]
+    pub update_group_size: Option<usize>,
 }

 #[derive(Serialize, Deserialize)]
@@ -34,37 +42,75 @@ struct Document<'a> (
     HashMap<Cow<'a, str>, Cow<'a, str>>
 );

-fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
+fn index(
+    schema: Schema,
+    database_path: &Path,
+    csv_data_path: &Path,
+    update_group_size: Option<usize>,
+    stop_words: &HashSet<String>,
+) -> Result<Database, Box<Error>>
+{
     let database = Database::create(database_path, &schema)?;

-    println!("start indexing...");
-
-    let tokenizer_builder = DefaultBuilder::new();
-    let update_path = tempfile::NamedTempFile::new()?;
-    let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);
-
     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();
     let headers = rdr.headers()?.clone();

-    while rdr.read_record(&mut raw_record)? {
-        let document: Document = match raw_record.deserialize(Some(&headers)) {
-            Ok(document) => document,
-            Err(e) => {
-                eprintln!("{:?}", e);
-                continue;
-            }
-        };
-
-        update.update_document(&document, &tokenizer_builder)?;
+    let mut i = 0;
+    let mut end_of_file = false;
+
+    while !end_of_file {
+        let tokenizer_builder = DefaultBuilder::new();
+        let update_path = tempfile::NamedTempFile::new()?;
+        let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
+
+        loop {
+            end_of_file = !rdr.read_record(&mut raw_record)?;
+            if end_of_file { break }
+
+            let document: Document = match raw_record.deserialize(Some(&headers)) {
+                Ok(document) => document,
+                Err(e) => {
+                    eprintln!("{:?}", e);
+                    continue;
+                }
+            };
+
+            update.update_document(&document, &tokenizer_builder, &stop_words)?;
+
+            print!("\rindexing document {}", i);
+            i += 1;
+
+            if let Some(group_size) = update_group_size {
+                if i % group_size == 0 { break }
+            }
+        }
+
+        println!();
+
+        println!("building update...");
+        let update = update.build()?;
+
+        println!("ingesting update...");
+        database.ingest_update_file(update)?;
     }

-    let update = update.build()?;
-    database.ingest_update_file(update)?;
-
     Ok(database)
 }

+fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
+    let f = File::open(path)?;
+    let reader = BufReader::new(f);
+    let mut words = HashSet::new();
+
+    for line in reader.lines() {
+        let line = line?;
+        let word = line.trim().to_string();
+        words.insert(word);
+    }
+
+    Ok(words)
+}
+
 fn main() -> Result<(), Box<Error>> {
     let _ = env_logger::init();
     let opt = Opt::from_args();
@@ -74,8 +120,13 @@ fn main() -> Result<(), Box<Error>> {
         Schema::from_toml(file)?
     };

+    let stop_words = match opt.stop_words_path {
+        Some(ref path) => retrieve_stop_words(path)?,
+        None => HashSet::new(),
+    };
+
     let (elapsed, result) = elapsed::measure_time(|| {
-        index(schema, &opt.database_path, &opt.csv_data_path)
+        index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
     });

     if let Err(e) = result {
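Taken together, the changes above alter the indexing flow: stop words are loaded once (one word per line) and the set is threaded through every update_document call so the serializer can skip them. A minimal sketch of that flow, assuming the same items as the example file above (Schema, Database, Document, UpdateBuilder, DefaultBuilder, retrieve_stop_words) and a single, already-deserialized document:

fn index_one(
    schema: Schema,
    database_path: &Path,
    stop_words_path: &Path,
    document: &Document,
) -> Result<Database, Box<Error>> {
    // one stop word per line, as retrieve_stop_words does above
    let stop_words = retrieve_stop_words(stop_words_path)?;

    let database = Database::create(database_path, &schema)?;

    let tokenizer_builder = DefaultBuilder::new();
    let update_path = tempfile::NamedTempFile::new()?;
    let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);

    // the stop-word set now travels with every document update; the indexer
    // serializer skips any word whose lowercased form is in the set
    update.update_document(document, &tokenizer_builder, &stop_words)?;

    database.ingest_update_file(update.build()?)?;
    Ok(database)
}

Grouped ingestion (--update-group-size) simply repeats this build/ingest cycle every N documents, as the rewritten index function does.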

View File

@@ -95,7 +95,8 @@ or
 other
 ought
 our
-ours ourselves
+ours
+ourselves
 out
 over
 own

misc/fr.stopwords.txt (new file, 163 lines)
View File

@@ -0,0 +1,163 @@
au
aux
avec
ce
ces
dans
de
des
du
elle
en
et
eux
il
je
la
le
leur
lui
ma
mais
me
même
mes
moi
mon
ne
nos
notre
nous
on
ou
par
pas
pour
qu
que
qui
sa
se
ses
son
sur
ta
te
tes
toi
ton
tu
un
une
vos
votre
vous
c
d
j
l
à
m
n
s
t
y
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
ceci
celà
cet
cette
ici
ils
les
leurs
quel
quels
quelle
quelles
sans
soi

View File

@@ -7,6 +7,7 @@ use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamil
 use rocksdb::rocksdb::{Writable, Snapshot};
 use rocksdb::{DB, DBVector, MergeOperands};
 use crossbeam::atomic::ArcCell;
+use log::debug;

 pub use self::document_key::{DocumentKey, DocumentKeyAttr};
 pub use self::view::{DatabaseView, DocumentIter};
@@ -147,9 +148,11 @@ impl Database {
         let options = IngestExternalFileOptions::new();
         // options.move_files(move_update);

+        debug!("ingest update file");
         let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
         db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;

+        debug!("compacting index range");
         // Compacting to trigger the merge operator only one time
         // while ingesting the update and not each time searching
         db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
@@ -185,6 +188,7 @@ mod tests {
     use std::error::Error;

     use serde_derive::{Serialize, Deserialize};
+    use hashbrown::HashSet;
     use tempfile::tempdir;

     use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
@@ -194,6 +198,7 @@ mod tests {
     #[test]
     fn ingest_one_update_file() -> Result<(), Box<Error>> {
         let dir = tempdir()?;
+        let stop_words = HashSet::new();

         let rocksdb_path = dir.path().join("rocksdb.rdb");
@@ -237,8 +242,8 @@ mod tests {
             let tokenizer_builder = DefaultBuilder::new();
             let mut builder = UpdateBuilder::new(update_path, schema);

-            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
-            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;

             builder.build()?
         };
@@ -258,6 +263,7 @@ mod tests {
     #[test]
     fn ingest_two_update_files() -> Result<(), Box<Error>> {
         let dir = tempdir()?;
+        let stop_words = HashSet::new();

         let rocksdb_path = dir.path().join("rocksdb.rdb");
@@ -312,8 +318,8 @@ mod tests {
             let update_path = dir.path().join("update-000.sst");
             let mut builder = UpdateBuilder::new(update_path, schema.clone());

-            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
-            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;

             builder.build()?
         };
@@ -325,8 +331,8 @@ mod tests {
            let update_path = dir.path().join("update-001.sst");
            let mut builder = UpdateBuilder::new(update_path, schema);

-            docid2 = builder.update_document(&doc2, &tokenizer_builder)?;
-            docid3 = builder.update_document(&doc3, &tokenizer_builder)?;
+            docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
+            docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;

             builder.build()?
         };
@@ -364,8 +370,9 @@ mod bench {
     use rand::distributions::Alphanumeric;
     use rand_xorshift::XorShiftRng;
     use rand::{Rng, SeedableRng};
-    use rand::seq::SliceRandom;
     use serde_derive::Serialize;
+    use rand::seq::SliceRandom;
+    use hashbrown::HashSet;

     use crate::tokenizer::DefaultBuilder;
     use crate::database::update::UpdateBuilder;
@@ -394,6 +401,7 @@ mod bench {
     #[bench]
     fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();

         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -421,7 +429,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }

         let update = builder.build()?;
@@ -440,6 +448,7 @@ mod bench {
     #[bench]
     fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();

         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -467,7 +476,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }

         let update = builder.build()?;
@@ -487,6 +496,7 @@ mod bench {
     #[ignore]
     fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();

         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -514,7 +524,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }

         let update = builder.build()?;
@@ -533,6 +543,7 @@ mod bench {
     #[bench]
     fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();

         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -560,7 +571,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }

         let update = builder.build()?;
@@ -579,6 +590,7 @@ mod bench {
     #[bench]
     fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();

         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -606,7 +618,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }

         let update = builder.build()?;
@@ -626,6 +638,7 @@ mod bench {
     #[ignore]
     fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();

         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -653,7 +666,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }

         let update = builder.build()?;

View File

@@ -5,6 +5,7 @@ use crate::tokenizer::TokenizerBuilder;
 use crate::tokenizer::Token;
 use crate::{DocumentId, DocIndex, Attribute, WordArea};

+use hashbrown::HashSet;
 use serde::Serialize;
 use serde::ser;
@@ -13,6 +14,7 @@ pub struct IndexerSerializer<'a, B> {
     pub update: &'a mut DocumentUpdate,
     pub document_id: DocumentId,
     pub attribute: SchemaAttr,
+    pub stop_words: &'a HashSet<String>,
 }

 impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
@@ -48,6 +50,7 @@ where B: TokenizerBuilder
     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
         for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
             // FIXME must u32::try_from instead
             let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
                 Ok(attribute) => attribute,
@@ -69,6 +72,8 @@ where B: TokenizerBuilder
             // insert the exact representation
             let word_lower = word.to_lowercase();

+            if self.stop_words.contains(&word_lower) { continue }
+
             // and the unidecoded lowercased version
             let word_unidecoded = unidecode::unidecode(word).to_lowercase();
             if word_lower != word_unidecoded {

View File

@@ -1,3 +1,4 @@
+use hashbrown::HashSet;
 use serde::Serialize;
 use serde::ser;
@@ -14,6 +15,7 @@ pub struct Serializer<'a, B> {
     pub update: &'a mut DocumentUpdate,
     pub document_id: DocumentId,
     pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
 }

 impl<'a, B> ser::Serializer for Serializer<'a, B>
@@ -139,6 +141,7 @@ where B: TokenizerBuilder
             document_id: self.document_id,
             update: self.update,
             tokenizer_builder: self.tokenizer_builder,
+            stop_words: self.stop_words,
             current_key_name: None,
         })
     }
@@ -154,6 +157,7 @@ where B: TokenizerBuilder
             update: self.update,
             document_id: self.document_id,
             tokenizer_builder: self.tokenizer_builder,
+            stop_words: self.stop_words,
         })
     }
@@ -174,6 +178,7 @@ pub struct MapSerializer<'a, B> {
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate,
    pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
     pub current_key_name: Option<String>,
 }
@@ -219,6 +224,7 @@ where B: TokenizerBuilder
             tokenizer_builder: self.tokenizer_builder,
             document_id: self.document_id,
             attribute: attr,
+            stop_words: self.stop_words,
         };
         value.serialize(serializer)?;
     }
@@ -237,6 +243,7 @@ pub struct StructSerializer<'a, B> {
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate,
     pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
 }

 impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
@@ -264,6 +271,7 @@ where B: TokenizerBuilder
             tokenizer_builder: self.tokenizer_builder,
             document_id: self.document_id,
             attribute: attr,
+            stop_words: self.stop_words,
         };
         value.serialize(serializer)?;
     }

View File

@@ -1,6 +1,7 @@
 use std::path::PathBuf;
 use std::error::Error;

+use hashbrown::HashSet;
 use serde::Serialize;

 use crate::database::serde::serializer::Serializer;
@@ -28,6 +29,7 @@ impl UpdateBuilder {
         &mut self,
         document: T,
         tokenizer_builder: &B,
+        stop_words: &HashSet<String>,
     ) -> Result<DocumentId, SerializerError>
     where T: Serialize,
           B: TokenizerBuilder,
@@ -40,6 +42,7 @@ impl UpdateBuilder {
             document_id: document_id,
             tokenizer_builder: tokenizer_builder,
             update: update,
+            stop_words: stop_words,
         };

         document.serialize(serializer)?;

View File

@@ -18,7 +18,7 @@ fn sum_matches_typos(matches: &[Match]) -> isize {
     // note that GroupBy will never return an empty group
     // so we can do this assumption safely
     for group in GroupBy::new(matches, match_query_index) {
-        sum_typos += unsafe { group.get_unchecked(0).distance } as isize;
+        sum_typos += unsafe { group.get_unchecked(0).distance as isize };
         number_words += 1;
     }