mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-12 07:58:54 +01:00
ensure the synonyms are updated when the tokenizer settings are changed
This commit is contained in:
parent
d57026cd96
commit
b0c1a9504a
@ -232,7 +232,7 @@ async fn advanced_synergies() {
|
|||||||
|
|
||||||
let (_response, _code) = index
|
let (_response, _code) = index
|
||||||
.update_settings(json!({
|
.update_settings(json!({
|
||||||
"dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."],
|
"dictionary": ["J.R.R.", "J. R. R."],
|
||||||
"synonyms": {
|
"synonyms": {
|
||||||
"J.R.R.": ["jrr", "J. R. R."],
|
"J.R.R.": ["jrr", "J. R. R."],
|
||||||
"J. R. R.": ["jrr", "J.R.R."],
|
"J. R. R.": ["jrr", "J.R.R."],
|
||||||
@ -347,6 +347,14 @@ async fn advanced_synergies() {
|
|||||||
})
|
})
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
|
// Only update dictionary, the synonyms should be recomputed.
|
||||||
|
let (_response, _code) = index
|
||||||
|
.update_settings(json!({
|
||||||
|
"dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."],
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
index.wait_task(2).await;
|
||||||
|
|
||||||
index
|
index
|
||||||
.search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| {
|
.search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::mem::size_of;
|
use std::mem::size_of;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
@ -65,6 +65,7 @@ pub mod main_key {
|
|||||||
pub const DICTIONARY_KEY: &str = "dictionary";
|
pub const DICTIONARY_KEY: &str = "dictionary";
|
||||||
pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
|
pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
|
||||||
pub const SYNONYMS_KEY: &str = "synonyms";
|
pub const SYNONYMS_KEY: &str = "synonyms";
|
||||||
|
pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
|
||||||
pub const WORDS_FST_KEY: &str = "words-fst";
|
pub const WORDS_FST_KEY: &str = "words-fst";
|
||||||
pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
|
pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
|
||||||
pub const CREATED_AT_KEY: &str = "created-at";
|
pub const CREATED_AT_KEY: &str = "created-at";
|
||||||
@ -1138,12 +1139,29 @@ impl Index {
|
|||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
|
synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
|
||||||
|
user_defined_synonyms: &BTreeMap<String, Vec<String>>,
|
||||||
) -> heed::Result<()> {
|
) -> heed::Result<()> {
|
||||||
self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)
|
self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)?;
|
||||||
|
self.main.put::<_, Str, SerdeBincode<_>>(
|
||||||
|
wtxn,
|
||||||
|
main_key::USER_DEFINED_SYNONYMS_KEY,
|
||||||
|
user_defined_synonyms,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
|
pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
|
||||||
self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)
|
self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)?;
|
||||||
|
self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SYNONYMS_KEY)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn user_defined_synonyms(
|
||||||
|
&self,
|
||||||
|
rtxn: &RoTxn,
|
||||||
|
) -> heed::Result<BTreeMap<String, Vec<String>>> {
|
||||||
|
Ok(self
|
||||||
|
.main
|
||||||
|
.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::USER_DEFINED_SYNONYMS_KEY)?
|
||||||
|
.unwrap_or_default())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
|
pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
|
||||||
|
@ -2,7 +2,7 @@ use std::io::Cursor;
|
|||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use maplit::{hashmap, hashset};
|
use maplit::{btreemap, hashset};
|
||||||
|
|
||||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||||
@ -33,7 +33,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
S("tag"),
|
S("tag"),
|
||||||
S("asc_desc_rank"),
|
S("asc_desc_rank"),
|
||||||
});
|
});
|
||||||
builder.set_synonyms(hashmap! {
|
builder.set_synonyms(btreemap! {
|
||||||
S("hello") => vec![S("good morning")],
|
S("hello") => vec![S("good morning")],
|
||||||
S("world") => vec![S("earth")],
|
S("world") => vec![S("earth")],
|
||||||
S("america") => vec![S("the united states")],
|
S("america") => vec![S("the united states")],
|
||||||
|
@ -15,7 +15,7 @@ they store fewer proximities than the regular word proximity DB.
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::search::new::tests::collect_field_values;
|
use crate::search::new::tests::collect_field_values;
|
||||||
@ -336,7 +336,7 @@ fn test_proximity_split_word() {
|
|||||||
|
|
||||||
index
|
index
|
||||||
.update_settings(|s| {
|
.update_settings(|s| {
|
||||||
let mut syns = HashMap::new();
|
let mut syns = BTreeMap::new();
|
||||||
syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
|
syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
|
||||||
s.set_synonyms(syns);
|
s.set_synonyms(syns);
|
||||||
})
|
})
|
||||||
|
@ -18,7 +18,7 @@ if `words` doesn't exist before it.
|
|||||||
14. Synonyms cost nothing according to the typo ranking rule
|
14. Synonyms cost nothing according to the typo ranking rule
|
||||||
*/
|
*/
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::search::new::tests::collect_field_values;
|
use crate::search::new::tests::collect_field_values;
|
||||||
@ -591,7 +591,7 @@ fn test_typo_synonyms() {
|
|||||||
.update_settings(|s| {
|
.update_settings(|s| {
|
||||||
s.set_criteria(vec![Criterion::Typo]);
|
s.set_criteria(vec![Criterion::Typo]);
|
||||||
|
|
||||||
let mut synonyms = HashMap::new();
|
let mut synonyms = BTreeMap::new();
|
||||||
synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
|
synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
|
||||||
synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);
|
synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||||
use std::result::Result as StdResult;
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
|
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
|
||||||
@ -116,7 +116,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
|
|||||||
separator_tokens: Setting<BTreeSet<String>>,
|
separator_tokens: Setting<BTreeSet<String>>,
|
||||||
dictionary: Setting<BTreeSet<String>>,
|
dictionary: Setting<BTreeSet<String>>,
|
||||||
distinct_field: Setting<String>,
|
distinct_field: Setting<String>,
|
||||||
synonyms: Setting<HashMap<String, Vec<String>>>,
|
synonyms: Setting<BTreeMap<String, Vec<String>>>,
|
||||||
primary_key: Setting<String>,
|
primary_key: Setting<String>,
|
||||||
authorize_typos: Setting<bool>,
|
authorize_typos: Setting<bool>,
|
||||||
min_word_len_two_typos: Setting<u8>,
|
min_word_len_two_typos: Setting<u8>,
|
||||||
@ -256,7 +256,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
self.synonyms = Setting::Reset;
|
self.synonyms = Setting::Reset;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
|
pub fn set_synonyms(&mut self, synonyms: BTreeMap<String, Vec<String>>) {
|
||||||
self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
|
self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -508,8 +508,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// the synonyms must be updated if non separator tokens have been updated.
|
// the synonyms must be updated if non separator tokens have been updated.
|
||||||
if changes {
|
if changes && self.synonyms == Setting::NotSet {
|
||||||
self.update_synonyms()?;
|
self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(changes)
|
Ok(changes)
|
||||||
@ -533,8 +533,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// the synonyms must be updated if separator tokens have been updated.
|
// the synonyms must be updated if separator tokens have been updated.
|
||||||
if changes {
|
if changes && self.synonyms == Setting::NotSet {
|
||||||
self.update_synonyms()?;
|
self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(changes)
|
Ok(changes)
|
||||||
@ -558,8 +558,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// the synonyms must be updated if dictionary has been updated.
|
// the synonyms must be updated if dictionary has been updated.
|
||||||
if changes {
|
if changes && self.synonyms == Setting::NotSet {
|
||||||
self.update_synonyms()?;
|
self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(changes)
|
Ok(changes)
|
||||||
@ -567,7 +567,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
|
|
||||||
fn update_synonyms(&mut self) -> Result<bool> {
|
fn update_synonyms(&mut self) -> Result<bool> {
|
||||||
match self.synonyms {
|
match self.synonyms {
|
||||||
Setting::Set(ref synonyms) => {
|
Setting::Set(ref user_synonyms) => {
|
||||||
fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
|
fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
|
||||||
tokenizer
|
tokenizer
|
||||||
.tokenize(text)
|
.tokenize(text)
|
||||||
@ -604,7 +604,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
let tokenizer = builder.build();
|
let tokenizer = builder.build();
|
||||||
|
|
||||||
let mut new_synonyms = HashMap::new();
|
let mut new_synonyms = HashMap::new();
|
||||||
for (word, synonyms) in synonyms {
|
for (word, synonyms) in user_synonyms {
|
||||||
// Normalize both the word and associated synonyms.
|
// Normalize both the word and associated synonyms.
|
||||||
let normalized_word = normalize(&tokenizer, word);
|
let normalized_word = normalize(&tokenizer, word);
|
||||||
let normalized_synonyms =
|
let normalized_synonyms =
|
||||||
@ -625,7 +625,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
let old_synonyms = self.index.synonyms(self.wtxn)?;
|
let old_synonyms = self.index.synonyms(self.wtxn)?;
|
||||||
|
|
||||||
if new_synonyms != old_synonyms {
|
if new_synonyms != old_synonyms {
|
||||||
self.index.put_synonyms(self.wtxn, &new_synonyms)?;
|
self.index.put_synonyms(self.wtxn, &new_synonyms, &user_synonyms)?;
|
||||||
Ok(true)
|
Ok(true)
|
||||||
} else {
|
} else {
|
||||||
Ok(false)
|
Ok(false)
|
||||||
@ -912,7 +912,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use maplit::{btreeset, hashmap, hashset};
|
use maplit::{btreemap, btreeset, hashset};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::error::Error;
|
use crate::error::Error;
|
||||||
@ -1378,7 +1378,7 @@ mod tests {
|
|||||||
// In the same transaction provide some synonyms
|
// In the same transaction provide some synonyms
|
||||||
index
|
index
|
||||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||||
settings.set_synonyms(hashmap! {
|
settings.set_synonyms(btreemap! {
|
||||||
"blini".to_string() => vec!["crepes".to_string()],
|
"blini".to_string() => vec!["crepes".to_string()],
|
||||||
"super like".to_string() => vec!["love".to_string()],
|
"super like".to_string() => vec!["love".to_string()],
|
||||||
"puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
|
"puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
|
||||||
|
@ -5,7 +5,7 @@ use std::io::Cursor;
|
|||||||
use big_s::S;
|
use big_s::S;
|
||||||
use either::{Either, Left, Right};
|
use either::{Either, Left, Right};
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use maplit::{hashmap, hashset};
|
use maplit::{btreemap, hashset};
|
||||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||||
use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
|
use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
|
||||||
@ -51,7 +51,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
S("tag"),
|
S("tag"),
|
||||||
S("asc_desc_rank"),
|
S("asc_desc_rank"),
|
||||||
});
|
});
|
||||||
builder.set_synonyms(hashmap! {
|
builder.set_synonyms(btreemap! {
|
||||||
S("hello") => vec![S("good morning")],
|
S("hello") => vec![S("good morning")],
|
||||||
S("world") => vec![S("earth")],
|
S("world") => vec![S("earth")],
|
||||||
S("america") => vec![S("the united states")],
|
S("america") => vec![S("the united states")],
|
||||||
|
Loading…
Reference in New Issue
Block a user