mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 05:30:16 +01:00
Introduce the stop words addition update type
This commit is contained in:
parent
32d2cc3aea
commit
776673ebae
@ -9,6 +9,7 @@ const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
|
|||||||
const RANKED_MAP_KEY: &str = "ranked-map";
|
const RANKED_MAP_KEY: &str = "ranked-map";
|
||||||
const SCHEMA_KEY: &str = "schema";
|
const SCHEMA_KEY: &str = "schema";
|
||||||
const SYNONYMS_KEY: &str = "synonyms";
|
const SYNONYMS_KEY: &str = "synonyms";
|
||||||
|
const STOP_WORDS_KEY: &str = "stop-words";
|
||||||
const WORDS_KEY: &str = "words";
|
const WORDS_KEY: &str = "words";
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
@ -71,6 +72,24 @@ impl Main {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
|
||||||
|
let bytes = fst.as_fst().as_bytes();
|
||||||
|
self.main
|
||||||
|
.put::<Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
|
||||||
|
match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
|
||||||
|
Some(bytes) => {
|
||||||
|
let len = bytes.len();
|
||||||
|
let bytes = Arc::from(bytes);
|
||||||
|
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||||
|
Ok(Some(fst::Set::from(fst)))
|
||||||
|
}
|
||||||
|
None => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
|
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
|
||||||
where
|
where
|
||||||
F: Fn(u64) -> u64,
|
F: Fn(u64) -> u64,
|
||||||
|
@ -187,6 +187,14 @@ impl Index {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Creates a builder used to register new stop words on this index.
///
/// The builder receives this index's updates store, its updates results
/// store and a clone of its updates notifier.
pub fn stop_words_addition(&self) -> update::StopWordsAddition {
    let notifier = self.updates_notifier.clone();
    update::StopWordsAddition::new(self.updates, self.updates_results, notifier)
}
|
||||||
|
|
||||||
pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
|
pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
|
||||||
match self.updates.last_update_id(reader)? {
|
match self.updates.last_update_id(reader)? {
|
||||||
Some((id, _)) => Ok(Some(id)),
|
Some((id, _)) => Ok(Some(id)),
|
||||||
|
@ -3,6 +3,7 @@ mod customs_update;
|
|||||||
mod documents_addition;
|
mod documents_addition;
|
||||||
mod documents_deletion;
|
mod documents_deletion;
|
||||||
mod schema_update;
|
mod schema_update;
|
||||||
|
mod stop_words_addition;
|
||||||
mod synonyms_addition;
|
mod synonyms_addition;
|
||||||
mod synonyms_deletion;
|
mod synonyms_deletion;
|
||||||
|
|
||||||
@ -11,11 +12,12 @@ pub use self::customs_update::{apply_customs_update, push_customs_update};
|
|||||||
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
|
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
|
||||||
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
||||||
pub use self::schema_update::{apply_schema_update, push_schema_update};
|
pub use self::schema_update::{apply_schema_update, push_schema_update};
|
||||||
|
pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
|
||||||
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
|
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
|
||||||
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
|
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
|
||||||
|
|
||||||
use std::cmp;
|
use std::cmp;
|
||||||
use std::collections::BTreeMap;
|
use std::collections::{BTreeMap, BTreeSet};
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use heed::Result as ZResult;
|
use heed::Result as ZResult;
|
||||||
@ -34,6 +36,7 @@ pub enum Update {
|
|||||||
DocumentsDeletion(Vec<DocumentId>),
|
DocumentsDeletion(Vec<DocumentId>),
|
||||||
SynonymsAddition(BTreeMap<String, Vec<String>>),
|
SynonymsAddition(BTreeMap<String, Vec<String>>),
|
||||||
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
|
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
|
||||||
|
StopWordsAddition(BTreeSet<String>),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@ -45,6 +48,7 @@ pub enum UpdateType {
|
|||||||
DocumentsDeletion { number: usize },
|
DocumentsDeletion { number: usize },
|
||||||
SynonymsAddition { number: usize },
|
SynonymsAddition { number: usize },
|
||||||
SynonymsDeletion { number: usize },
|
SynonymsDeletion { number: usize },
|
||||||
|
StopWordsAddition { number: usize },
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@ -210,6 +214,18 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
|
|||||||
|
|
||||||
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
|
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
|
||||||
|
|
||||||
|
(update_type, result, start.elapsed())
|
||||||
|
}
|
||||||
|
Update::StopWordsAddition(stop_words) => {
|
||||||
|
let start = Instant::now();
|
||||||
|
|
||||||
|
let update_type = UpdateType::StopWordsAddition {
|
||||||
|
number: stop_words.len(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let result =
|
||||||
|
apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
116
meilidb-core/src/update/stop_words_addition.rs
Normal file
116
meilidb-core/src/update/stop_words_addition.rs
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
|
use fst::{set::OpBuilder, SetBuilder};
|
||||||
|
|
||||||
|
use crate::automaton::normalize_str;
|
||||||
|
use crate::update::{next_update_id, Update};
|
||||||
|
use crate::{store, MResult};
|
||||||
|
|
||||||
|
pub struct StopWordsAddition {
|
||||||
|
updates_store: store::Updates,
|
||||||
|
updates_results_store: store::UpdatesResults,
|
||||||
|
updates_notifier: crossbeam_channel::Sender<()>,
|
||||||
|
stop_words: BTreeSet<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StopWordsAddition {
|
||||||
|
pub fn new(
|
||||||
|
updates_store: store::Updates,
|
||||||
|
updates_results_store: store::UpdatesResults,
|
||||||
|
updates_notifier: crossbeam_channel::Sender<()>,
|
||||||
|
) -> StopWordsAddition {
|
||||||
|
StopWordsAddition {
|
||||||
|
updates_store,
|
||||||
|
updates_results_store,
|
||||||
|
updates_notifier,
|
||||||
|
stop_words: BTreeSet::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
|
||||||
|
let stop_word = normalize_str(stop_word.as_ref());
|
||||||
|
self.stop_words.insert(stop_word);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||||
|
let _ = self.updates_notifier.send(());
|
||||||
|
let update_id = push_stop_words_addition(
|
||||||
|
writer,
|
||||||
|
self.updates_store,
|
||||||
|
self.updates_results_store,
|
||||||
|
self.stop_words,
|
||||||
|
)?;
|
||||||
|
Ok(update_id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn push_stop_words_addition(
|
||||||
|
writer: &mut heed::RwTxn,
|
||||||
|
updates_store: store::Updates,
|
||||||
|
updates_results_store: store::UpdatesResults,
|
||||||
|
addition: BTreeSet<String>,
|
||||||
|
) -> MResult<u64> {
|
||||||
|
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||||
|
|
||||||
|
let update = Update::StopWordsAddition(addition);
|
||||||
|
updates_store.put_update(writer, last_update_id, &update)?;
|
||||||
|
|
||||||
|
Ok(last_update_id)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn apply_stop_words_addition(
|
||||||
|
writer: &mut heed::RwTxn,
|
||||||
|
main_store: store::Main,
|
||||||
|
postings_lists_store: store::PostingsLists,
|
||||||
|
addition: BTreeSet<String>,
|
||||||
|
) -> MResult<()> {
|
||||||
|
let mut stop_words_builder = SetBuilder::memory();
|
||||||
|
|
||||||
|
for word in addition {
|
||||||
|
stop_words_builder.insert(&word).unwrap();
|
||||||
|
// we remove every posting list associated to a new stop word
|
||||||
|
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create the new delta stop words fst
|
||||||
|
let delta_stop_words = stop_words_builder
|
||||||
|
.into_inner()
|
||||||
|
.and_then(fst::Set::from_bytes)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// we also need to remove all the stop words from the main fst
|
||||||
|
if let Some(word_fst) = main_store.words_fst(writer)? {
|
||||||
|
let op = OpBuilder::new()
|
||||||
|
.add(&word_fst)
|
||||||
|
.add(&delta_stop_words)
|
||||||
|
.difference();
|
||||||
|
|
||||||
|
let mut word_fst_builder = SetBuilder::memory();
|
||||||
|
word_fst_builder.extend_stream(op).unwrap();
|
||||||
|
let word_fst = word_fst_builder
|
||||||
|
.into_inner()
|
||||||
|
.and_then(fst::Set::from_bytes)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
main_store.put_words_fst(writer, &word_fst)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// now we add all of these stop words to the main store
|
||||||
|
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
|
||||||
|
|
||||||
|
let op = OpBuilder::new()
|
||||||
|
.add(&stop_words_fst)
|
||||||
|
.add(&delta_stop_words)
|
||||||
|
.r#union();
|
||||||
|
|
||||||
|
let mut stop_words_builder = SetBuilder::memory();
|
||||||
|
stop_words_builder.extend_stream(op).unwrap();
|
||||||
|
let stop_words_fst = stop_words_builder
|
||||||
|
.into_inner()
|
||||||
|
.and_then(fst::Set::from_bytes)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user