Rewrite the stop-words endpoint; fix #417

qdequele 2020-01-02 16:30:34 +01:00
parent f0590d3301
commit 91c6539baf
6 changed files with 135 additions and 184 deletions

View File

@@ -287,16 +287,8 @@ impl Index {
         )
     }
 
-    pub fn stop_words_addition(&self) -> update::StopWordsAddition {
-        update::StopWordsAddition::new(
-            self.updates,
-            self.updates_results,
-            self.updates_notifier.clone(),
-        )
-    }
-
-    pub fn stop_words_deletion(&self) -> update::StopWordsDeletion {
-        update::StopWordsDeletion::new(
+    pub fn stop_words_update(&self) -> update::StopWordsUpdate {
+        update::StopWordsUpdate::new(
             self.updates,
             self.updates_results,
             self.updates_notifier.clone(),

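With the two builders merged, `Index::stop_words_update()` is the only entry point left, and the set handed to `finalize` is treated as the complete stop-word list rather than a delta. A minimal sketch of that builder shape, compiling on its own; the LMDB store handles, the update notifier, and `normalize_str` are replaced with hypothetical stand-ins:

use std::collections::BTreeSet;

// Hypothetical stand-in for the real builder: the actual type also carries the
// updates store, the update-results store, and an update notifier (omitted here).
struct StopWordsUpdate {
    stop_words: BTreeSet<String>,
}

impl StopWordsUpdate {
    fn new() -> StopWordsUpdate {
        StopWordsUpdate { stop_words: BTreeSet::new() }
    }

    // Every word pushed here becomes part of the complete desired stop-word
    // list; lowercasing stands in for the crate's normalize_str.
    fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
        self.stop_words.insert(stop_word.as_ref().to_lowercase());
    }

    // The real finalize enqueues the set as one StopWordsUpdate update and
    // returns an update id; this sketch just hands the set back.
    fn finalize(self) -> BTreeSet<String> {
        self.stop_words
    }
}

fn main() {
    let mut update = StopWordsUpdate::new();
    for word in ["The", "of", "a"] {
        update.add_stop_word(word);
    }
    assert_eq!(update.finalize().len(), 3);
}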
View File

@@ -3,8 +3,7 @@ mod customs_update;
 mod documents_addition;
 mod documents_deletion;
 mod schema_update;
-mod stop_words_addition;
-mod stop_words_deletion;
+mod stop_words_update;
 mod synonyms_update;
 
 pub use self::clear_all::{apply_clear_all, push_clear_all};
@@ -14,8 +13,7 @@ pub use self::documents_addition::{
 };
 pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
 pub use self::schema_update::{apply_schema_update, push_schema_update};
-pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
-pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion};
+pub use self::stop_words_update::{apply_stop_words_update, StopWordsUpdate};
 pub use self::synonyms_update::{apply_synonyms_update, SynonymsUpdate};
 
 use std::cmp;
@@ -89,16 +87,9 @@ impl Update {
         }
     }
 
-    fn stop_words_addition(data: BTreeSet<String>) -> Update {
-        Update {
-            data: UpdateData::StopWordsAddition(data),
-            enqueued_at: Utc::now(),
-        }
-    }
-
-    fn stop_words_deletion(data: BTreeSet<String>) -> Update {
-        Update {
-            data: UpdateData::StopWordsDeletion(data),
+    fn stop_words_update(data: BTreeSet<String>) -> Update {
+        Update {
+            data: UpdateData::StopWordsUpdate(data),
             enqueued_at: Utc::now(),
         }
     }
@@ -113,8 +104,7 @@ pub enum UpdateData {
     DocumentsPartial(Vec<HashMap<String, serde_json::Value>>),
     DocumentsDeletion(Vec<DocumentId>),
     SynonymsUpdate(BTreeMap<String, Vec<String>>),
-    StopWordsAddition(BTreeSet<String>),
-    StopWordsDeletion(BTreeSet<String>),
+    StopWordsUpdate(BTreeSet<String>),
 }
 
 impl UpdateData {
@@ -135,11 +125,8 @@ impl UpdateData {
             UpdateData::SynonymsUpdate(addition) => UpdateType::SynonymsUpdate {
                 number: addition.len(),
             },
-            UpdateData::StopWordsAddition(addition) => UpdateType::StopWordsAddition {
-                number: addition.len(),
-            },
-            UpdateData::StopWordsDeletion(deletion) => UpdateType::StopWordsDeletion {
-                number: deletion.len(),
+            UpdateData::StopWordsUpdate(update) => UpdateType::StopWordsUpdate {
+                number: update.len(),
             },
         }
     }
@@ -155,8 +142,7 @@ pub enum UpdateType {
     DocumentsPartial { number: usize },
     DocumentsDeletion { number: usize },
     SynonymsUpdate { number: usize },
-    StopWordsAddition { number: usize },
-    StopWordsDeletion { number: usize },
+    StopWordsUpdate { number: usize },
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -321,22 +307,10 @@ pub fn update_task<'a, 'b>(
             (update_type, result, start.elapsed())
         }
-        UpdateData::StopWordsAddition(stop_words) => {
-            let start = Instant::now();
-
-            let update_type = UpdateType::StopWordsAddition {
-                number: stop_words.len(),
-            };
-
-            let result =
-                apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
-
-            (update_type, result, start.elapsed())
-        }
-        UpdateData::StopWordsDeletion(stop_words) => {
-            let start = Instant::now();
-
-            let update_type = UpdateType::StopWordsDeletion {
+        UpdateData::StopWordsUpdate(stop_words) => {
+            let start = Instant::now();
+
+            let update_type = UpdateType::StopWordsUpdate {
                 number: stop_words.len(),
             };

View File

@@ -1,107 +0,0 @@
-use std::collections::BTreeSet;
-
-use fst::{set::OpBuilder, SetBuilder};
-
-use crate::database::{MainT, UpdateT};
-use crate::automaton::normalize_str;
-use crate::database::{UpdateEvent, UpdateEventsEmitter};
-use crate::update::documents_addition::reindex_all_documents;
-use crate::update::{next_update_id, Update};
-use crate::{store, MResult};
-
-pub struct StopWordsDeletion {
-    updates_store: store::Updates,
-    updates_results_store: store::UpdatesResults,
-    updates_notifier: UpdateEventsEmitter,
-    stop_words: BTreeSet<String>,
-}
-
-impl StopWordsDeletion {
-    pub fn new(
-        updates_store: store::Updates,
-        updates_results_store: store::UpdatesResults,
-        updates_notifier: UpdateEventsEmitter,
-    ) -> StopWordsDeletion {
-        StopWordsDeletion {
-            updates_store,
-            updates_results_store,
-            updates_notifier,
-            stop_words: BTreeSet::new(),
-        }
-    }
-
-    pub fn delete_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
-        let stop_word = normalize_str(stop_word.as_ref());
-        self.stop_words.insert(stop_word);
-    }
-
-    pub fn finalize(self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64> {
-        let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
-        let update_id = push_stop_words_deletion(
-            writer,
-            self.updates_store,
-            self.updates_results_store,
-            self.stop_words,
-        )?;
-        Ok(update_id)
-    }
-}
-
-pub fn push_stop_words_deletion(
-    writer: &mut heed::RwTxn<UpdateT>,
-    updates_store: store::Updates,
-    updates_results_store: store::UpdatesResults,
-    deletion: BTreeSet<String>,
-) -> MResult<u64> {
-    let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
-    let update = Update::stop_words_deletion(deletion);
-    updates_store.put_update(writer, last_update_id, &update)?;
-    Ok(last_update_id)
-}
-
-pub fn apply_stop_words_deletion(
-    writer: &mut heed::RwTxn<MainT>,
-    index: &store::Index,
-    deletion: BTreeSet<String>,
-) -> MResult<()> {
-    let mut stop_words_builder = SetBuilder::memory();
-
-    for word in deletion {
-        stop_words_builder.insert(&word).unwrap();
-    }
-
-    // create the new delta stop words fst
-    let delta_stop_words = stop_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)
-        .unwrap();
-
-    // now we delete all of these stop words from the main store
-    let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default();
-
-    let op = OpBuilder::new()
-        .add(&stop_words_fst)
-        .add(&delta_stop_words)
-        .difference();
-
-    let mut stop_words_builder = SetBuilder::memory();
-    stop_words_builder.extend_stream(op).unwrap();
-
-    let stop_words_fst = stop_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)
-        .unwrap();
-
-    index.main.put_stop_words_fst(writer, &stop_words_fst)?;
-
-    // now that we have setup the stop words
-    // lets reindex everything...
-    if let Ok(number) = index.main.number_of_documents(writer) {
-        if number > 0 {
-            reindex_all_documents(writer, index)?;
-        }
-    }
-
-    Ok(())
-}

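The fst arithmetic from this deleted module survives as a private helper in the renamed file below. As a standalone illustration of that pattern, here is a sketch that diffs two in-memory stop-word sets, assuming the same fst 0.3-era API the code above relies on (`Set::from_bytes`, `extend_stream`, `into_strs`):

use fst::{set::OpBuilder, SetBuilder};

// Build an in-memory fst::Set from words given in lexicographic order.
fn build_set<'a, I: IntoIterator<Item = &'a str>>(words: I) -> fst::Set {
    let mut builder = SetBuilder::memory();
    for word in words {
        builder.insert(word).unwrap();
    }
    builder.into_inner().and_then(fst::Set::from_bytes).unwrap()
}

fn main() {
    // Stored stop words and the delta we want removed from them.
    let stop_words_fst = build_set(["a", "of", "the"]);
    let delta_stop_words = build_set(["of"]);

    // Stream the set difference into a fresh builder to get the new fst,
    // the same pattern apply_stop_words_deletion uses against the main store.
    let op = OpBuilder::new()
        .add(&stop_words_fst)
        .add(&delta_stop_words)
        .difference();

    let mut builder = SetBuilder::memory();
    builder.extend_stream(op).unwrap();
    let new_fst = builder.into_inner().and_then(fst::Set::from_bytes).unwrap();

    let remaining = new_fst.stream().into_strs().unwrap();
    assert_eq!(remaining, vec!["a".to_string(), "the".to_string()]);
}

Deletion is the heavier path: because documents were indexed with those words suppressed, the code triggers reindex_all_documents once a stored stop word is removed.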
View File

@@ -2,26 +2,27 @@ use std::collections::BTreeSet;
 
 use fst::{set::OpBuilder, SetBuilder};
 
-use crate::database::{MainT, UpdateT};
 use crate::automaton::normalize_str;
+use crate::database::{MainT, UpdateT};
 use crate::database::{UpdateEvent, UpdateEventsEmitter};
+use crate::update::documents_addition::reindex_all_documents;
 use crate::update::{next_update_id, Update};
 use crate::{store, MResult};
 
-pub struct StopWordsAddition {
+pub struct StopWordsUpdate {
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     updates_notifier: UpdateEventsEmitter,
     stop_words: BTreeSet<String>,
 }
 
-impl StopWordsAddition {
+impl StopWordsUpdate {
     pub fn new(
         updates_store: store::Updates,
         updates_results_store: store::UpdatesResults,
         updates_notifier: UpdateEventsEmitter,
-    ) -> StopWordsAddition {
-        StopWordsAddition {
+    ) -> StopWordsUpdate {
+        StopWordsUpdate {
             updates_store,
             updates_results_store,
             updates_notifier,
@@ -36,7 +37,7 @@ impl StopWordsAddition {
 
     pub fn finalize(self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64> {
         let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
-        let update_id = push_stop_words_addition(
+        let update_id = push_stop_words_update(
             writer,
             self.updates_store,
             self.updates_results_store,
@@ -46,21 +47,64 @@ impl StopWordsAddition {
     }
 }
 
-pub fn push_stop_words_addition(
+pub fn push_stop_words_update(
     writer: &mut heed::RwTxn<UpdateT>,
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
-    addition: BTreeSet<String>,
+    update: BTreeSet<String>,
 ) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
-    let update = Update::stop_words_addition(addition);
+    let update = Update::stop_words_update(update);
     updates_store.put_update(writer, last_update_id, &update)?;
     Ok(last_update_id)
 }
 
-pub fn apply_stop_words_addition(
+pub fn apply_stop_words_update(
+    writer: &mut heed::RwTxn<MainT>,
+    main_store: store::Main,
+    documents_fields_store: store::DocumentsFields,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    postings_lists_store: store::PostingsLists,
+    docs_words_store: store::DocsWords,
+    stop_words: BTreeSet<String>,
+) -> MResult<()> {
+    let old_stop_words: BTreeSet<String> = main_store
+        .stop_words_fst(writer)?
+        .unwrap_or_default()
+        .stream()
+        .into_strs().unwrap().into_iter().collect();
+
+    let deletion: BTreeSet<String> = old_stop_words.clone().difference(&stop_words).cloned().collect();
+    let addition: BTreeSet<String> = stop_words.clone().difference(&old_stop_words).cloned().collect();
+
+    if !addition.is_empty() {
+        apply_stop_words_addition(
+            writer,
+            main_store,
+            postings_lists_store,
+            addition
+        )?;
+    }
+
+    if !deletion.is_empty() {
+        apply_stop_words_deletion(
+            writer,
+            main_store,
+            documents_fields_store,
+            documents_fields_counts_store,
+            postings_lists_store,
+            docs_words_store,
+            deletion
+        )?;
+    }
+
+    Ok(())
+}
+
+fn apply_stop_words_addition(
     writer: &mut heed::RwTxn<MainT>,
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
@@ -116,3 +160,59 @@ pub fn apply_stop_words_addition(
 
     Ok(())
 }
+
+fn apply_stop_words_deletion(
+    writer: &mut heed::RwTxn<MainT>,
+    main_store: store::Main,
+    documents_fields_store: store::DocumentsFields,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    postings_lists_store: store::PostingsLists,
+    docs_words_store: store::DocsWords,
+    deletion: BTreeSet<String>,
+) -> MResult<()> {
+    let mut stop_words_builder = SetBuilder::memory();
+
+    for word in deletion {
+        stop_words_builder.insert(&word).unwrap();
+    }
+
+    // create the new delta stop words fst
+    let delta_stop_words = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    // now we delete all of these stop words from the main store
+    let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
+
+    let op = OpBuilder::new()
+        .add(&stop_words_fst)
+        .add(&delta_stop_words)
+        .difference();
+
+    let mut stop_words_builder = SetBuilder::memory();
+    stop_words_builder.extend_stream(op).unwrap();
+
+    let stop_words_fst = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    main_store.put_stop_words_fst(writer, &stop_words_fst)?;
+
+    // now that we have setup the stop words
+    // lets reindex everything...
+    if let Ok(number) = main_store.number_of_documents(writer) {
+        if number > 0 {
+            reindex_all_documents(
+                writer,
+                main_store,
+                documents_fields_store,
+                documents_fields_counts_store,
+                postings_lists_store,
+                docs_words_store,
+            )?;
+        }
+    }
+
+    Ok(())
+}

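The new apply_stop_words_update above treats the incoming set as the full desired list, reads the previously stored list out of the fst, and derives what to add and what to delete from the two. The set arithmetic in isolation, as a small std-only sketch:

use std::collections::BTreeSet;

fn main() {
    // What the index currently stores vs. what the update asks for.
    let old_stop_words: BTreeSet<String> =
        ["a", "of", "the"].iter().map(|w| w.to_string()).collect();
    let stop_words: BTreeSet<String> =
        ["and", "the"].iter().map(|w| w.to_string()).collect();

    // Words only in the old set must be deleted; words only in the new set
    // must be added. This mirrors the two difference() calls above.
    let deletion: BTreeSet<String> =
        old_stop_words.difference(&stop_words).cloned().collect();
    let addition: BTreeSet<String> =
        stop_words.difference(&old_stop_words).cloned().collect();

    let expected_deletion: BTreeSet<String> =
        ["a", "of"].iter().map(|w| w.to_string()).collect();
    let expected_addition: BTreeSet<String> =
        ["and"].iter().map(|w| w.to_string()).collect();

    assert_eq!(deletion, expected_deletion);
    assert_eq!(addition, expected_addition);
}

Only non-empty sets trigger the corresponding helper, and of the two only the deletion helper shown above ends in a full reindex.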
View File

@@ -80,16 +80,12 @@ pub fn load_routes(app: &mut tide::App<Data>) {
             .get(synonym::get)
             .post(synonym::update);
 
-        router.at("/stop-words").nest(|router| {
-            router
-                .at("/")
-                .get(stop_words::list)
-                .patch(stop_words::add)
-                .post(stop_words::delete);
-        });
-
-        router
-            .at("/settings")
+        router.at("/settings").nest(|router| {
+            router.at("/stop-words")
+                .get(stop_words::get)
+                .post(stop_words::update)
+                .delete(stop_words::delete);
+        })
             .get(setting::get)
             .post(setting::update);
     });

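The route change is the user-visible part of the fix: the stop-words resource moves under /settings and switches to conventional verbs (GET to read, POST to replace, DELETE to clear). The summary below is illustrative only; it lists the relative paths and handlers visible in this hunk, and the outer index prefix is not part of the diff:

fn main() {
    // (method, relative path, handler) as defined in the old and new routers.
    let old_routes = [
        ("GET",   "/stop-words", "stop_words::list"),
        ("PATCH", "/stop-words", "stop_words::add"),
        ("POST",  "/stop-words", "stop_words::delete"),
    ];
    let new_routes = [
        ("GET",    "/settings/stop-words", "stop_words::get"),
        ("POST",   "/settings/stop-words", "stop_words::update"),
        ("DELETE", "/settings/stop-words", "stop_words::delete"),
    ];

    println!("before:");
    for (method, path, handler) in &old_routes {
        println!("  {:6} {:22} -> {}", method, path, handler);
    }
    println!("after:");
    for (method, path, handler) in &new_routes {
        println!("  {:6} {:22} -> {}", method, path, handler);
    }
}

Given the handler changes below, POST now submits the full replacement list, and DELETE finalizes an empty update, which clears the stored list through the diffing logic above.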
View File

@@ -8,7 +8,7 @@ use crate::models::token::ACL::*;
 use crate::routes::document::IndexUpdateResponse;
 use crate::Data;
 
-pub async fn list(ctx: Context<Data>) -> SResult<Response> {
+pub async fn get(ctx: Context<Data>) -> SResult<Response> {
     ctx.is_allowed(SettingsRead)?;
     let index = ctx.index()?;
 
@@ -29,7 +29,7 @@ pub async fn list(ctx: Context<Data>) -> SResult<Response> {
 
     Ok(tide::response::json(stop_words))
 }
 
-pub async fn add(mut ctx: Context<Data>) -> SResult<Response> {
+pub async fn update(mut ctx: Context<Data>) -> SResult<Response> {
     ctx.is_allowed(SettingsRead)?;
     let index = ctx.index()?;
@@ -38,12 +38,12 @@ pub async fn add(mut ctx: Context<Data>) -> SResult<Response> {
     let db = &ctx.state().db;
     let mut writer = db.update_write_txn().map_err(ResponseError::internal)?;
 
-    let mut stop_words_addition = index.stop_words_addition();
+    let mut stop_words_update = index.stop_words_update();
     for stop_word in data {
-        stop_words_addition.add_stop_word(stop_word);
+        stop_words_update.add_stop_word(stop_word);
    }
 
-    let update_id = stop_words_addition
+    let update_id = stop_words_update
         .finalize(&mut writer)
         .map_err(ResponseError::internal)?;
@@ -55,19 +55,14 @@ pub async fn add(mut ctx: Context<Data>) -> SResult<Response> {
         .into_response())
 }
 
-pub async fn delete(mut ctx: Context<Data>) -> SResult<Response> {
+pub async fn delete(ctx: Context<Data>) -> SResult<Response> {
     ctx.is_allowed(SettingsRead)?;
     let index = ctx.index()?;
-    let data: Vec<String> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
 
     let db = &ctx.state().db;
     let mut writer = db.update_write_txn().map_err(ResponseError::internal)?;
 
-    let mut stop_words_deletion = index.stop_words_deletion();
-    for stop_word in data {
-        stop_words_deletion.delete_stop_word(stop_word);
-    }
+    let stop_words_deletion = index.stop_words_update();
 
     let update_id = stop_words_deletion
         .finalize(&mut writer)