Merge pull request #244 from meilisearch/reintroduce-stop-words
Reintroduce stop words
This commit is contained in: commit 41065305aa
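
The change threads a stop-words fst::Set through RawIndexer and adds two queued update types, StopWordsAddition and StopWordsDeletion. As orientation before the diff, a minimal sketch of the addition flow (hedged: it assumes an `index: store::Index` and a `writer: &mut heed::RwTxn`; the names come from the diff below):

    // Queue a stop-words addition; the update loop applies it asynchronously.
    let mut addition = index.stop_words_addition();
    addition.add_stop_word("de"); // normalized via normalize_str before insertion
    addition.add_stop_word("la");
    let update_id = addition.finalize(writer)?;
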
@@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
 pub struct RawIndexer {
     word_limit: usize, // the maximum number of indexed words
+    stop_words: fst::Set,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
 }
@@ -21,13 +22,14 @@ pub struct Indexed {
 }
 
 impl RawIndexer {
-    pub fn new() -> RawIndexer {
-        RawIndexer::with_word_limit(1000)
+    pub fn new(stop_words: fst::Set) -> RawIndexer {
+        RawIndexer::with_word_limit(stop_words, 1000)
     }
 
-    pub fn with_word_limit(limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
         RawIndexer {
             word_limit: limit,
+            stop_words,
             words_doc_indexes: BTreeMap::new(),
             docs_words: HashMap::new(),
         }
@@ -56,6 +58,7 @@ impl RawIndexer {
             id,
             attr,
             self.word_limit,
+            &self.stop_words,
             &mut self.words_doc_indexes,
             &mut self.docs_words,
         );
@@ -87,6 +90,7 @@ impl RawIndexer {
             id,
             attr,
             self.word_limit,
+            &self.stop_words,
             &mut self.words_doc_indexes,
             &mut self.docs_words,
         );
@@ -118,6 +122,7 @@ impl RawIndexer {
             id,
             attr,
             self.word_limit,
+            &self.stop_words,
             &mut self.words_doc_indexes,
             &mut self.docs_words,
         );
@@ -152,17 +157,12 @@ impl RawIndexer {
     }
 }
 
-impl Default for RawIndexer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 fn index_token(
     token: Token,
     id: DocumentId,
     attr: SchemaAttr,
     word_limit: usize,
+    stop_words: &fst::Set,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
@@ -170,6 +170,7 @@ fn index_token(
         return false;
     }
 
+    if !stop_words.contains(&token.word) {
     match token_to_docindex(id, attr, token) {
         Some(docindex) => {
             let word = Vec::from(token.word);
@@ -181,6 +182,7 @@ fn index_token(
         }
         None => return false,
     }
+    }
 
     true
 }
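
The new guard is a plain membership test against the stop-words set: tokens found in it are never turned into doc indexes. As a standalone illustration (a hedged sketch using the fst 0.3-era API this code base already depends on, with toy data; from_iter requires lexicographically sorted input):

    // Build a small set and test membership, mirroring the index_token guard.
    let stop_words = fst::Set::from_iter(vec!["ai", "de", "j", "l"]).unwrap();
    assert!(stop_words.contains("de"));         // skipped by the indexer
    assert!(!stop_words.contains("aspirateur")); // indexed as usual
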
@@ -207,7 +209,7 @@ mod tests {
 
     #[test]
     fn strange_apostrophe() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());
 
         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
@@ -231,7 +233,7 @@ mod tests {
 
     #[test]
     fn strange_apostrophe_in_sequence() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());
 
         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
@@ -252,4 +254,33 @@ mod tests {
             .get(&"l’éteindre".to_owned().into_bytes())
             .is_some());
     }
+
+    #[test]
+    fn basic_stop_words() {
+        let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
+        let stop_words = fst::Set::from_iter(stop_words).unwrap();
+
+        let mut indexer = RawIndexer::new(stop_words);
+
+        let docid = DocumentId(0);
+        let attr = SchemaAttr(0);
+        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
+        indexer.index_text(docid, attr, text);
+
+        let Indexed {
+            words_doc_indexes, ..
+        } = indexer.build();
+
+        assert!(words_doc_indexes.get(&b"l"[..]).is_none());
+        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
+        assert!(words_doc_indexes.get(&b"j"[..]).is_none());
+        assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
+        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
+        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
+
+        // with the ugly apostrophe...
+        assert!(words_doc_indexes
+            .get(&"l’éteindre".to_owned().into_bytes())
+            .is_some());
+    }
 }
@@ -9,6 +9,7 @@ const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
 const RANKED_MAP_KEY: &str = "ranked-map";
 const SCHEMA_KEY: &str = "schema";
 const SYNONYMS_KEY: &str = "synonyms";
+const STOP_WORDS_KEY: &str = "stop-words";
 const WORDS_KEY: &str = "words";
 
 #[derive(Copy, Clone)]
@@ -71,6 +72,24 @@ impl Main {
         }
     }
 
+    pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
+        let bytes = fst.as_fst().as_bytes();
+        self.main
+            .put::<Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
+    }
+
+    pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
+        match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
+            Some(bytes) => {
+                let len = bytes.len();
+                let bytes = Arc::from(bytes);
+                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
+                Ok(Some(fst::Set::from(fst)))
+            }
+            None => Ok(None),
+        }
+    }
+
     pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
     where
         F: Fn(u64) -> u64,
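
The FST is persisted as its raw byte representation and rebuilt from shared bytes on read. A round-trip sketch using the two helpers above (hedged: it assumes `main_store: store::Main` and a `writer: &mut heed::RwTxn`, mirroring how the update code below calls them):

    // Store a tiny set under the "stop-words" key...
    let set = fst::Set::from_iter(vec!["de", "la"]).unwrap();
    main_store.put_stop_words_fst(writer, &set)?;

    // ...and read it back; None means nothing was ever stored.
    let stored = main_store.stop_words_fst(writer)?.unwrap_or_default();
    assert!(stored.contains("de"));
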
@@ -187,6 +187,22 @@ impl Index {
         )
     }
 
+    pub fn stop_words_addition(&self) -> update::StopWordsAddition {
+        update::StopWordsAddition::new(
+            self.updates,
+            self.updates_results,
+            self.updates_notifier.clone(),
+        )
+    }
+
+    pub fn stop_words_deletion(&self) -> update::StopWordsDeletion {
+        update::StopWordsDeletion::new(
+            self.updates,
+            self.updates_results,
+            self.updates_notifier.clone(),
+        )
+    }
+
     pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
         match self.updates.last_update_id(reader)? {
             Some((id, _)) => Ok(Some(id)),
@@ -87,7 +87,6 @@ pub fn apply_documents_addition(
     addition: Vec<serde_json::Value>,
 ) -> MResult<()> {
     let mut documents_additions = HashMap::new();
-    let mut indexer = RawIndexer::new();
 
     let schema = match main_store.schema(writer)? {
         Some(schema) => schema,
@@ -124,7 +123,14 @@ pub fn apply_documents_addition(
         None => RankedMap::default(),
     };
 
+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. index the documents fields in the stores
+    let mut indexer = RawIndexer::new(stop_words);
+
     for (document_id, document) in documents_additions {
         let serializer = Serializer {
             txn: writer,
@@ -180,8 +186,13 @@ pub fn reindex_all_documents(
     postings_lists_store.clear(writer)?;
     docs_words_store.clear(writer)?;
 
+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
-    let mut indexer = RawIndexer::new();
+    let mut indexer = RawIndexer::new(stop_words);
     let mut ram_store = HashMap::new();
 
     for document_id in documents_ids_to_reindex {
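
Both indexing paths now load the persisted stop words before constructing the RawIndexer; the match is just the explicit spelling of a default fallback. An equivalent, more compact form (a sketch only; stop_words_addition.rs below uses the same unwrap_or_default idiom):

    // fst::Set::default() is the empty set, used when none has been stored yet.
    let stop_words = main_store.stop_words_fst(writer)?.unwrap_or_default();
    let mut indexer = RawIndexer::new(stop_words);
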
@@ -3,6 +3,8 @@ mod customs_update;
 mod documents_addition;
 mod documents_deletion;
 mod schema_update;
+mod stop_words_addition;
+mod stop_words_deletion;
 mod synonyms_addition;
 mod synonyms_deletion;
 
@@ -11,11 +13,13 @@ pub use self::customs_update::{apply_customs_update, push_customs_update};
 pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
 pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
 pub use self::schema_update::{apply_schema_update, push_schema_update};
+pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
+pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion};
 pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
 pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
 
 use std::cmp;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet};
 use std::time::{Duration, Instant};
 
 use heed::Result as ZResult;
@@ -34,6 +38,8 @@ pub enum Update {
     DocumentsDeletion(Vec<DocumentId>),
     SynonymsAddition(BTreeMap<String, Vec<String>>),
     SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
+    StopWordsAddition(BTreeSet<String>),
+    StopWordsDeletion(BTreeSet<String>),
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -45,6 +51,8 @@ pub enum UpdateType {
     DocumentsDeletion { number: usize },
     SynonymsAddition { number: usize },
     SynonymsDeletion { number: usize },
+    StopWordsAddition { number: usize },
+    StopWordsDeletion { number: usize },
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -210,6 +218,37 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
 
             let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
 
+            (update_type, result, start.elapsed())
+        }
+        Update::StopWordsAddition(stop_words) => {
+            let start = Instant::now();
+
+            let update_type = UpdateType::StopWordsAddition {
+                number: stop_words.len(),
+            };
+
+            let result =
+                apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
+
+            (update_type, result, start.elapsed())
+        }
+        Update::StopWordsDeletion(stop_words) => {
+            let start = Instant::now();
+
+            let update_type = UpdateType::StopWordsDeletion {
+                number: stop_words.len(),
+            };
+
+            let result = apply_stop_words_deletion(
+                writer,
+                index.main,
+                index.documents_fields,
+                index.documents_fields_counts,
+                index.postings_lists,
+                index.docs_words,
+                stop_words,
+            );
+
             (update_type, result, start.elapsed())
         }
     };
meilidb-core/src/update/stop_words_addition.rs (new file, 116 lines)
@@ -0,0 +1,116 @@
+use std::collections::BTreeSet;
+
+use fst::{set::OpBuilder, SetBuilder};
+
+use crate::automaton::normalize_str;
+use crate::update::{next_update_id, Update};
+use crate::{store, MResult};
+
+pub struct StopWordsAddition {
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    updates_notifier: crossbeam_channel::Sender<()>,
+    stop_words: BTreeSet<String>,
+}
+
+impl StopWordsAddition {
+    pub fn new(
+        updates_store: store::Updates,
+        updates_results_store: store::UpdatesResults,
+        updates_notifier: crossbeam_channel::Sender<()>,
+    ) -> StopWordsAddition {
+        StopWordsAddition {
+            updates_store,
+            updates_results_store,
+            updates_notifier,
+            stop_words: BTreeSet::new(),
+        }
+    }
+
+    pub fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
+        let stop_word = normalize_str(stop_word.as_ref());
+        self.stop_words.insert(stop_word);
+    }
+
+    pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
+        let _ = self.updates_notifier.send(());
+        let update_id = push_stop_words_addition(
+            writer,
+            self.updates_store,
+            self.updates_results_store,
+            self.stop_words,
+        )?;
+        Ok(update_id)
+    }
+}
+
+pub fn push_stop_words_addition(
+    writer: &mut heed::RwTxn,
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    addition: BTreeSet<String>,
+) -> MResult<u64> {
+    let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
+
+    let update = Update::StopWordsAddition(addition);
+    updates_store.put_update(writer, last_update_id, &update)?;
+
+    Ok(last_update_id)
+}
+
+pub fn apply_stop_words_addition(
+    writer: &mut heed::RwTxn,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    addition: BTreeSet<String>,
+) -> MResult<()> {
+    let mut stop_words_builder = SetBuilder::memory();
+
+    for word in addition {
+        stop_words_builder.insert(&word).unwrap();
+        // we remove every posting list associated with a new stop word
+        postings_lists_store.del_postings_list(writer, word.as_bytes())?;
+    }
+
+    // create the new delta stop words fst
+    let delta_stop_words = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    // we also need to remove all the stop words from the main fst
+    if let Some(word_fst) = main_store.words_fst(writer)? {
+        let op = OpBuilder::new()
+            .add(&word_fst)
+            .add(&delta_stop_words)
+            .difference();
+
+        let mut word_fst_builder = SetBuilder::memory();
+        word_fst_builder.extend_stream(op).unwrap();
+        let word_fst = word_fst_builder
+            .into_inner()
+            .and_then(fst::Set::from_bytes)
+            .unwrap();
+
+        main_store.put_words_fst(writer, &word_fst)?;
+    }
+
+    // now we add all of these stop words to the main store
+    let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
+
+    let op = OpBuilder::new()
+        .add(&stop_words_fst)
+        .add(&delta_stop_words)
+        .r#union();
+
+    let mut stop_words_builder = SetBuilder::memory();
+    stop_words_builder.extend_stream(op).unwrap();
+    let stop_words_fst = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    main_store.put_stop_words_fst(writer, &stop_words_fst)?;
+
+    Ok(())
+}
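
apply_stop_words_addition leans on fst's streaming set algebra: the delta set is subtracted from the words FST (difference) and merged into the persisted stop-words FST (union). A self-contained sketch of that pattern (hedged; toy sets rather than the store's data, same fst 0.3-era API as above):

    use fst::{set::OpBuilder, Set, SetBuilder};

    // Inputs to from_iter must be in lexicographic order.
    let words = Set::from_iter(vec!["aspirateur", "de", "la", "zut"]).unwrap();
    let delta = Set::from_iter(vec!["de", "la"]).unwrap();

    // Stream `words - delta` into a fresh in-memory set.
    let op = OpBuilder::new().add(&words).add(&delta).difference();
    let mut builder = SetBuilder::memory();
    builder.extend_stream(op).unwrap();
    let remaining = builder.into_inner().and_then(Set::from_bytes).unwrap();

    assert!(remaining.contains("aspirateur"));
    assert!(!remaining.contains("de"));
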
meilidb-core/src/update/stop_words_deletion.rs (new file, 112 lines)
@@ -0,0 +1,112 @@
+use std::collections::BTreeSet;
+
+use fst::{set::OpBuilder, SetBuilder};
+
+use crate::automaton::normalize_str;
+use crate::update::documents_addition::reindex_all_documents;
+use crate::update::{next_update_id, Update};
+use crate::{store, MResult};
+
+pub struct StopWordsDeletion {
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    updates_notifier: crossbeam_channel::Sender<()>,
+    stop_words: BTreeSet<String>,
+}
+
+impl StopWordsDeletion {
+    pub fn new(
+        updates_store: store::Updates,
+        updates_results_store: store::UpdatesResults,
+        updates_notifier: crossbeam_channel::Sender<()>,
+    ) -> StopWordsDeletion {
+        StopWordsDeletion {
+            updates_store,
+            updates_results_store,
+            updates_notifier,
+            stop_words: BTreeSet::new(),
+        }
+    }
+
+    pub fn delete_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
+        let stop_word = normalize_str(stop_word.as_ref());
+        self.stop_words.insert(stop_word);
+    }
+
+    pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
+        let _ = self.updates_notifier.send(());
+        let update_id = push_stop_words_deletion(
+            writer,
+            self.updates_store,
+            self.updates_results_store,
+            self.stop_words,
+        )?;
+        Ok(update_id)
+    }
+}
+
+pub fn push_stop_words_deletion(
+    writer: &mut heed::RwTxn,
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    deletion: BTreeSet<String>,
+) -> MResult<u64> {
+    let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;

+    let update = Update::StopWordsDeletion(deletion);
+    updates_store.put_update(writer, last_update_id, &update)?;
+
+    Ok(last_update_id)
+}
+
+pub fn apply_stop_words_deletion(
+    writer: &mut heed::RwTxn,
+    main_store: store::Main,
+    documents_fields_store: store::DocumentsFields,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    postings_lists_store: store::PostingsLists,
+    docs_words_store: store::DocsWords,
+    deletion: BTreeSet<String>,
+) -> MResult<()> {
+    let mut stop_words_builder = SetBuilder::memory();
+
+    for word in deletion {
+        stop_words_builder.insert(&word).unwrap();
+    }
+
+    // create the new delta stop words fst
+    let delta_stop_words = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    // now we delete all of these stop words from the main store
+    let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
+
+    let op = OpBuilder::new()
+        .add(&stop_words_fst)
+        .add(&delta_stop_words)
+        .difference();
+
+    let mut stop_words_builder = SetBuilder::memory();
+    stop_words_builder.extend_stream(op).unwrap();
+    let stop_words_fst = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    main_store.put_stop_words_fst(writer, &stop_words_fst)?;
+
+    // now that we have set up the stop words
+    // let's reindex everything...
+    reindex_all_documents(
+        writer,
+        main_store,
+        documents_fields_store,
+        documents_fields_counts_store,
+        postings_lists_store,
+        docs_words_store,
+    )?;
+
+    Ok(())
+}
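
Removing a stop word is the asymmetric case: the posting lists for that word were dropped when it became a stop word, so its occurrences can only be recovered from the documents themselves. That is why apply_stop_words_deletion ends by calling reindex_all_documents rather than patching the FSTs alone. A minimal sketch of the deletion flow (hedged: it assumes an `index: store::Index` and a `writer: &mut heed::RwTxn`):

    // Queue the deletion; update_task later routes it to apply_stop_words_deletion,
    // which rewrites the stop-words FST and then reindexes every document.
    let mut deletion = index.stop_words_deletion();
    deletion.delete_stop_word("de");
    let update_id = deletion.finalize(writer)?;
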