feat(search, update): synonyms

This commit is contained in:
Alexey Shekhirin 2021-04-07 11:53:57 +03:00
parent 995d1a07d4
commit e39aabbfe6
No known key found for this signature in database
GPG Key ID: AF9A26AA133B5B98
4 changed files with 132 additions and 26 deletions

View File

@ -260,6 +260,9 @@ struct Settings {
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
stop_words: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
synonyms: Setting<HashMap<String, Vec<String>>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -431,6 +434,13 @@ async fn main() -> anyhow::Result<()> {
Setting::NotSet => ()
}
// We transpose the settings JSON struct into a real setting update.
match settings.synonyms {
Setting::Set(synonyms) => builder.set_synonyms(synonyms),
Setting::Reset => builder.reset_synonyms(),
Setting::NotSet => ()
}
let result = builder.execute(|indexing_step, update_id| {
let (current, total) = match indexing_step {
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
@ -1011,6 +1021,7 @@ mod tests {
faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }),
criteria: Setting::Set(vec!["asc(age)".to_string()]),
stop_words: Setting::Set(btreeset! { "and".to_string() }),
synonyms: Setting::NotSet
};
assert_tokens(&settings, &[
@ -1053,6 +1064,7 @@ mod tests {
faceted_attributes: Setting::Reset,
criteria: Setting::Reset,
stop_words: Setting::Reset,
synonyms: Setting::NotSet
};
assert_tokens(&settings, &[

View File

@ -3,19 +3,19 @@ use std::collections::HashMap;
use std::path::Path;
use anyhow::Context;
use chrono::{DateTime, Utc};
use heed::{Database, PolyDatabase, RoTxn, RwTxn};
use heed::types::*;
use heed::{PolyDatabase, Database, RwTxn, RoTxn};
use roaring::RoaringBitmap;
use chrono::{Utc, DateTime};
use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search};
use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
use crate::{
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec,
};
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
use crate::{default_criteria, Criterion, Search, FacetDistribution, FieldsDistribution};
use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
use crate::{
RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
};
pub const CRITERIA_KEY: &str = "criteria";
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
@ -31,6 +31,7 @@ pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
pub const WORDS_FST_KEY: &str = "words-fst";
pub const STOP_WORDS_KEY: &str = "stop-words";
pub const SYNONYMS_KEY: &str = "synonyms";
pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
const CREATED_AT_KEY: &str = "created-at";
const UPDATED_AT_KEY: &str = "updated-at";
@ -376,12 +377,12 @@ impl Index {
/* words fst */
/// Writes the FST which is the words dictionnary of the engine.
/// Writes the FST which is the words dictionary of the engine.
pub fn put_words_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
// The set is persisted as the raw bytes of its underlying FST, keyed by `WORDS_FST_KEY`.
self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes())
}
/// Returns the FST which is the words dictionnary of the engine.
/// Returns the FST which is the words dictionary of the engine.
pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<fst::Set<Cow<'t, [u8]>>> {
match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? {
Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
@ -398,6 +399,7 @@ impl Index {
/// Deletes the entry stored under `STOP_WORDS_KEY` in the main database.
///
/// Returns the boolean reported by the underlying `delete` call
/// (presumably `true` when an entry was actually removed; confirm against heed).
pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY)
}
pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? {
Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
@ -405,6 +407,34 @@ impl Index {
}
}
/* synonyms */
/// Writes the synonyms map into the main database under `SYNONYMS_KEY`,
/// serialized with `SerdeBincode`.
///
/// Both the keys and every synonym are sequences of words (`Vec<String>`).
pub fn put_synonyms(&self, wtxn: &mut RwTxn, synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<_>>(wtxn, SYNONYMS_KEY, synonyms)
}
/// Deletes the entry stored under `SYNONYMS_KEY` in the main database.
///
/// Returns the boolean reported by the underlying `delete` call
/// (presumably `true` when an entry was actually removed; confirm against heed).
pub fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY)
}
/// Reads the synonyms map stored under `SYNONYMS_KEY` in the main database.
///
/// Returns `Ok(None)` when nothing is stored under that key. The value is
/// decoded through `SerdeBincode` into an owned map on every call.
pub fn synonyms(&self, rtxn: &RoTxn) -> anyhow::Result<Option<HashMap<Vec<String>, Vec<Vec<String>>>>> {
    let stored = self
        .main
        .get::<_, Str, SerdeBincode<HashMap<Vec<String>, Vec<Vec<String>>>>>(rtxn, SYNONYMS_KEY)?;
    Ok(stored)
}
/// Returns the synonyms registered for the exact word sequence `words`.
///
/// * `Ok(None)` when no synonyms map is stored in the index at all.
/// * `Ok(Some(vec![]))` when a map is stored but has no entry for `words`.
/// * `Ok(Some(synonyms))` otherwise, each synonym being a sequence of words.
pub fn words_synonyms<S: AsRef<str>>(&self, rtxn: &RoTxn, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
    // The stored map is keyed by owned word sequences, so an owned key is
    // built for the lookup.
    let key: Vec<String> = words.iter().map(|word| word.as_ref().to_owned()).collect();
    let stored = self.synonyms(rtxn)?;
    Ok(stored.map(|synonyms| synonyms.get(&key).cloned().unwrap_or_default()))
}
/* words prefixes fst */
/// Writes the FST which is the words prefixes dictionary of the engine.
@ -536,7 +566,7 @@ pub(crate) mod tests {
let rtxn = index.read_txn().unwrap();
let fields_distribution = index.fields_distribution(&rtxn).unwrap();
assert_eq!(fields_distribution, hashmap!{
assert_eq!(fields_distribution, hashmap! {
"name".to_string() => 2,
"age".to_string() => 1,
});

View File

@ -155,7 +155,7 @@ impl fmt::Debug for Query {
trait Context {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>>;
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
match self.word_docids(word)? {
Some(rb) => Ok(Some(rb.len())),
@ -177,12 +177,12 @@ impl<'a> Context for QueryTreeBuilder<'a> {
self.index.word_docids.get(self.rtxn, word)
}
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
self.index.word_documents_count(self.rtxn, word)
/// Looks up the synonyms of `words` by delegating to the index's
/// `words_synonyms`, using this builder's read transaction.
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
self.index.words_synonyms(self.rtxn, words)
}
fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
Ok(None)
/// Overrides the trait's default (which fetches the docids bitmap and takes
/// its length) by delegating to the index's own `word_documents_count`.
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
self.index.word_documents_count(self.rtxn, word)
}
}
@ -270,10 +270,10 @@ fn typos(word: String, authorize_typos: bool) -> QueryKind {
}
}
/// Fetch synonyms from the `Context` for the provided word
/// Fetch synonyms from the `Context` for the provided words
/// and create the list of operations for the query tree
fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operation>>> {
let synonyms = ctx.synonyms(word)?;
fn synonyms(ctx: &impl Context, words: &[&str]) -> anyhow::Result<Option<Vec<Operation>>> {
let synonyms = ctx.synonyms(words)?;
Ok(synonyms.map(|synonyms| {
synonyms.into_iter().map(|synonym| {
@ -581,14 +581,13 @@ mod test {
Ok(self.postings.get(word).cloned())
}
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
/// Test implementation: looks `words` up in the context's in-memory synonyms map.
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
    // Build an owned key out of the borrowed words before the lookup.
    let key: Vec<String> = words.iter().map(|word| String::from(word.as_ref())).collect();
    let found = self.synonyms.get(&key).cloned();
    Ok(found)
}
}
impl Default for TestContext {
fn default() -> TestContext {
let mut rng = StdRng::seed_from_u64(102);
let rng = &mut rng;

View File

@ -13,6 +13,7 @@ use crate::criterion::Criterion;
use crate::facet::FacetType;
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::update::index_documents::{IndexDocumentsMethod, Transform};
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
#[derive(Debug, Clone, PartialEq)]
pub enum Setting<T> {
@ -71,6 +72,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
criteria: Setting<Vec<String>>,
stop_words: Setting<BTreeSet<String>>,
distinct_attribute: Setting<String>,
synonyms: Setting<HashMap<String, Vec<String>>>,
}
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -96,6 +98,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
criteria: Setting::NotSet,
stop_words: Setting::NotSet,
distinct_attribute: Setting::NotSet,
synonyms: Setting::NotSet,
update_id,
}
}
@ -144,12 +147,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
}
/// Marks the distinct attribute as `Setting::Reset` on this builder.
pub fn reset_distinct_attribute(&mut self) {
self.distinct_attribute = Setting::Reset;
}
/// Records `distinct_attribute` as the new value (`Setting::Set`) on this builder.
pub fn set_distinct_attribute(&mut self, distinct_attribute: String) {
self.distinct_attribute = Setting::Set(distinct_attribute);
}
pub fn reset_distinct_attribute(&mut self) {
self.distinct_attribute = Setting::Reset;
/// Marks the synonyms as `Setting::Reset` on this builder.
pub fn reset_synonyms(&mut self) {
self.synonyms = Setting::Reset;
}
/// Records the synonyms to apply on this builder.
///
/// An empty map is recorded as `Setting::Reset` rather than `Setting::Set`.
pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
    if synonyms.is_empty() {
        self.synonyms = Setting::Reset;
        return;
    }
    self.synonyms = Setting::Set(synonyms);
}
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
@ -294,7 +309,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let current = self.index.stop_words(self.wtxn)?;
// since we can't compare a BTreeSet with an FST we are going to convert the
// BTreeSet to an FST and then compare bytes per bytes the two FSTs.
let fst = fst::Set::from_iter(&*stop_words)?;
let fst = fst::Set::from_iter(stop_words)?;
// Does the new FST differ from the previous one?
if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
@ -310,6 +325,55 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
}
fn update_synonyms(&mut self) -> anyhow::Result<bool> {
match self.synonyms {
Setting::Set(ref synonyms) => {
let old_synonyms = self.index.synonyms(self.wtxn)?.unwrap_or_default();
let mut config = AnalyzerConfig::default();
let stop_words = self.index.stop_words(self.wtxn)?;
if let Some(stop_words) = &stop_words {
config.stop_words(stop_words);
}
let analyzer = Analyzer::new(config);
let normalize = |text: &String| {
analyzer
.analyze(text)
.tokens()
.filter_map(|token|
if token.is_word() { Some(token.text().to_string()) } else { None }
)
.collect::<Vec<_>>()
};
let new_synonyms = synonyms
.iter()
.map(|(word, synonyms)| {
let normalized_word = normalize(word);
let normalized_synonyms = synonyms.iter()
.map(normalize)
.unique()
.collect::<Vec<_>>();
(normalized_word, normalized_synonyms)
})
.collect();
if new_synonyms != old_synonyms {
self.index.put_synonyms(self.wtxn, &new_synonyms)?;
Ok(true)
} else {
Ok(false)
}
}
Setting::Reset => Ok(self.index.delete_synonyms(self.wtxn)?),
Setting::NotSet => Ok(false),
}
}
fn update_facets(&mut self) -> anyhow::Result<bool> {
match self.faceted_fields {
Setting::Set(ref fields) => {
@ -359,9 +423,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
// update_criteria MUST be called after update_facets, since criterion fields must be set
// as facets.
self.update_criteria()?;
let synonyms_updated = self.update_synonyms()?;
let searchable_updated = self.update_searchable()?;
if facets_updated || searchable_updated || stop_words_updated {
if stop_words_updated || facets_updated || synonyms_updated || searchable_updated {
self.reindex(&progress_callback, old_fields_ids_map)?;
}
Ok(())