Mirror of https://github.com/meilisearch/MeiliSearch
feat: Reintroduce stopwords for the serializer
commit 32f8908d71 (parent a2f5e8aa25)
7 changed files with 276 additions and 35 deletions
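
The change threads a stop-word set through the whole indexing path: `UpdateBuilder::update_document` gains a `stop_words: &HashSet<String>` parameter, the document `Serializer` and its `MapSerializer` and `StructSerializer` helpers store and forward the borrowed set, and the `IndexerSerializer` consults it in `serialize_str` to skip stop words during tokenization. The tests and benches are updated to pass an empty set.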

@@ -185,6 +185,7 @@ mod tests {
     use std::error::Error;
 
     use serde_derive::{Serialize, Deserialize};
+    use hashbrown::HashSet;
     use tempfile::tempdir;
 
     use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
@@ -194,6 +195,7 @@ mod tests {
     #[test]
     fn ingest_one_update_file() -> Result<(), Box<Error>> {
         let dir = tempdir()?;
+        let stop_words = HashSet::new();
 
         let rocksdb_path = dir.path().join("rocksdb.rdb");
 
@@ -237,8 +239,8 @@ mod tests {
             let tokenizer_builder = DefaultBuilder::new();
             let mut builder = UpdateBuilder::new(update_path, schema);
 
-            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
-            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
 
             builder.build()?
         };
@@ -258,6 +260,7 @@ mod tests {
     #[test]
     fn ingest_two_update_files() -> Result<(), Box<Error>> {
         let dir = tempdir()?;
+        let stop_words = HashSet::new();
 
         let rocksdb_path = dir.path().join("rocksdb.rdb");
 
@@ -312,8 +315,8 @@ mod tests {
             let update_path = dir.path().join("update-000.sst");
             let mut builder = UpdateBuilder::new(update_path, schema.clone());
 
-            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
-            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
 
             builder.build()?
         };
@@ -325,8 +328,8 @@ mod tests {
             let update_path = dir.path().join("update-001.sst");
             let mut builder = UpdateBuilder::new(update_path, schema);
 
-            docid2 = builder.update_document(&doc2, &tokenizer_builder)?;
-            docid3 = builder.update_document(&doc3, &tokenizer_builder)?;
+            docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
+            docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
 
             builder.build()?
         };
@@ -364,8 +367,9 @@ mod bench {
     use rand::distributions::Alphanumeric;
     use rand_xorshift::XorShiftRng;
     use rand::{Rng, SeedableRng};
-    use rand::seq::SliceRandom;
     use serde_derive::Serialize;
+    use rand::seq::SliceRandom;
+    use hashbrown::HashSet;
 
     use crate::tokenizer::DefaultBuilder;
     use crate::database::update::UpdateBuilder;
@@ -394,6 +398,7 @@ mod bench {
     #[bench]
     fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -421,7 +426,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -440,6 +445,7 @@ mod bench {
     #[bench]
     fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -467,7 +473,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -487,6 +493,7 @@ mod bench {
     #[ignore]
     fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -514,7 +521,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -533,6 +540,7 @@ mod bench {
     #[bench]
     fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -560,7 +568,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -579,6 +587,7 @@ mod bench {
     #[bench]
     fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -606,7 +615,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -626,6 +635,7 @@ mod bench {
     #[ignore]
     fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -653,7 +663,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }
 
         let update = builder.build()?;
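
All of the tests and benches above pass an empty set (`HashSet::new()`), so their results and timings should be essentially unchanged; they only exercise the new signature. The filtering itself lives in the `IndexerSerializer`, whose hunks follow.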
@@ -5,6 +5,7 @@ use crate::tokenizer::TokenizerBuilder;
 use crate::tokenizer::Token;
 use crate::{DocumentId, DocIndex, Attribute, WordArea};
 
+use hashbrown::HashSet;
 use serde::Serialize;
 use serde::ser;
 
@@ -13,6 +14,7 @@ pub struct IndexerSerializer<'a, B> {
     pub update: &'a mut DocumentUpdate,
     pub document_id: DocumentId,
     pub attribute: SchemaAttr,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
@@ -48,6 +50,7 @@ where B: TokenizerBuilder
 
     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
         for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
+
            // FIXME must u32::try_from instead
             let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
                 Ok(attribute) => attribute,
@@ -69,6 +72,8 @@ where B: TokenizerBuilder
             // insert the exact representation
             let word_lower = word.to_lowercase();
 
+            if self.stop_words.contains(&word_lower) { continue }
+
             // and the unidecoded lowercased version
             let word_unidecoded = unidecode::unidecode(word).to_lowercase();
             if word_lower != word_unidecoded {
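
The two added lines above are the core of the feature: once a token is lowercased, a hit in the stop-word set aborts the loop iteration before any exact or unidecoded index entry is written. A minimal, self-contained sketch of the same filtering idea (using `std::collections::HashSet` instead of the crate's `hashbrown` set; the function and names are illustrative, not the crate's API):

use std::collections::HashSet;

// Illustrative only: drop every token whose lowercased form is a stop
// word, mirroring the `continue` added to `serialize_str` above.
fn filter_stop_words(words: &[&str], stop_words: &HashSet<String>) -> Vec<String> {
    let mut kept = Vec::new();
    for word in words {
        let word_lower = word.to_lowercase();
        // a stop word produces no index entry at all
        if stop_words.contains(&word_lower) { continue }
        kept.push(word_lower);
    }
    kept
}

fn main() {
    let stop_words: HashSet<String> =
        ["the", "of"].iter().map(|s| s.to_string()).collect();
    let kept = filter_stop_words(&["The", "quick", "fox"], &stop_words);
    assert_eq!(kept, vec!["quick", "fox"]);
}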
@@ -1,3 +1,4 @@
+use hashbrown::HashSet;
 use serde::Serialize;
 use serde::ser;
 
@@ -14,6 +15,7 @@ pub struct Serializer<'a, B> {
     pub update: &'a mut DocumentUpdate,
     pub document_id: DocumentId,
     pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::Serializer for Serializer<'a, B>
@@ -139,6 +141,7 @@ where B: TokenizerBuilder
             document_id: self.document_id,
             update: self.update,
             tokenizer_builder: self.tokenizer_builder,
+            stop_words: self.stop_words,
             current_key_name: None,
         })
     }
@@ -154,6 +157,7 @@ where B: TokenizerBuilder
             update: self.update,
             document_id: self.document_id,
             tokenizer_builder: self.tokenizer_builder,
+            stop_words: self.stop_words,
         })
     }
 
@@ -174,6 +178,7 @@ pub struct MapSerializer<'a, B> {
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate,
     pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
     pub current_key_name: Option<String>,
 }
 
@@ -219,6 +224,7 @@ where B: TokenizerBuilder
                 tokenizer_builder: self.tokenizer_builder,
                 document_id: self.document_id,
                 attribute: attr,
+                stop_words: self.stop_words,
             };
             value.serialize(serializer)?;
         }
@@ -237,6 +243,7 @@ pub struct StructSerializer<'a, B> {
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate,
     pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
@@ -264,6 +271,7 @@ where B: TokenizerBuilder
                 tokenizer_builder: self.tokenizer_builder,
                 document_id: self.document_id,
                 attribute: attr,
+                stop_words: self.stop_words,
             };
             value.serialize(serializer)?;
         }
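
Every hunk in this file repeats one pattern: each serializer level holds `stop_words: &'a HashSet<String>` and hands the same reference to whatever sub-serializer it constructs, so a single set is shared across the whole document traversal without cloning. A reduced sketch of that borrowing pattern (types hypothetical, not the crate's):

use std::collections::HashSet;

// Hypothetical reduction of the pattern above: an outer serializer
// lends its stop-word set to every inner serializer it creates.
struct Outer<'a> {
    stop_words: &'a HashSet<String>,
}

struct Inner<'a> {
    stop_words: &'a HashSet<String>,
}

impl<'a> Outer<'a> {
    fn serialize_field(&self) -> Inner<'a> {
        // copying the reference is free; the set itself is never cloned
        Inner { stop_words: self.stop_words }
    }
}

fn main() {
    let stop_words: HashSet<String> = ["a".to_string()].into_iter().collect();
    let outer = Outer { stop_words: &stop_words };
    let inner = outer.serialize_field();
    assert!(inner.stop_words.contains("a"));
}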
@@ -1,6 +1,7 @@
 use std::path::PathBuf;
 use std::error::Error;
 
+use hashbrown::HashSet;
 use serde::Serialize;
 
 use crate::database::serde::serializer::Serializer;
@@ -28,6 +29,7 @@ impl UpdateBuilder {
         &mut self,
         document: T,
         tokenizer_builder: &B,
+        stop_words: &HashSet<String>,
     ) -> Result<DocumentId, SerializerError>
     where T: Serialize,
           B: TokenizerBuilder,
@@ -40,6 +42,7 @@ impl UpdateBuilder {
             document_id: document_id,
             tokenizer_builder: tokenizer_builder,
             update: update,
+            stop_words: stop_words,
         };
 
         document.serialize(serializer)?;
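
Note that `UpdateBuilder` never takes ownership of the set: `update_document` borrows it per call, so one `HashSet<String>` built by the caller can serve any number of document updates, as the updated tests and benches demonstrate.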