mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-27 07:14:26 +01:00
Merge pull request #160 from meilisearch/synonyms
Support all types of synonyms
This commit is contained in:
commit
1b0fd2e0ba
@ -6,6 +6,7 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.3.1"
|
||||
deunicode = "1.0.0"
|
||||
hashbrown = "0.2.2"
|
||||
lazy_static = "1.2.0"
|
||||
log = "0.4.6"
|
||||
@ -25,6 +26,9 @@ git = "https://github.com/Kerollmops/levenshtein-automata.git"
|
||||
branch = "arc-byte-slice"
|
||||
features = ["fst_automaton"]
|
||||
|
||||
[dev-dependencies]
|
||||
assert_matches = "1.3"
|
||||
|
||||
[features]
|
||||
i128 = ["byteorder/i128"]
|
||||
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
|
||||
|
@ -113,7 +113,7 @@ impl<'a> Default for Criteria<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> AsRef<[Box<Criterion + 'a>]> for Criteria<'a> {
|
||||
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
|
||||
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
|
||||
&self.inner
|
||||
}
|
||||
|
@ -1,3 +1,6 @@
|
||||
#[cfg(test)]
|
||||
#[macro_use] extern crate assert_matches;
|
||||
|
||||
mod automaton;
|
||||
mod distinct_map;
|
||||
mod query_builder;
|
||||
@ -7,12 +10,12 @@ pub mod criterion;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
use sdset::SetBuf;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use slice_group_by::GroupBy;
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
|
||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
|
||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
|
||||
pub use self::store::Store;
|
||||
|
||||
/// Represents an internally generated, unique document identifier.
|
||||
@ -226,12 +229,10 @@ impl fmt::Debug for RawDocument {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
|
||||
let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
|
||||
pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, Match)>) -> Vec<RawDocument> {
|
||||
let mut docs_ranges = Vec::<(_, Range)>::new();
|
||||
let mut matches2 = Matches::with_capacity(matches.len());
|
||||
|
||||
matches.par_sort_unstable();
|
||||
|
||||
for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
|
||||
let id = group[0].0;
|
||||
let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
|
||||
|
@ -4,35 +4,111 @@ use std::rc::Rc;
|
||||
use std::time::Instant;
|
||||
use std::{cmp, mem};
|
||||
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
use slice_group_by::GroupByMut;
|
||||
use meilidb_tokenizer::{is_cjk, split_query_string};
|
||||
use fst::{Streamer, IntoStreamer};
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
use fst::Streamer;
|
||||
use log::info;
|
||||
use meilidb_tokenizer::{is_cjk, split_query_string};
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::GroupByMut;
|
||||
|
||||
use crate::automaton::{self, DfaExt, AutomatonExt};
|
||||
use crate::automaton::{DfaExt, AutomatonExt, build_dfa, build_prefix_dfa};
|
||||
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
|
||||
use crate::criterion::Criteria;
|
||||
use crate::raw_documents_from_matches;
|
||||
use crate::{Match, DocumentId, Store, RawDocument, Document};
|
||||
|
||||
fn generate_automatons(query: &str) -> Vec<DfaExt> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let mut groups = split_query_string(query).map(str::to_lowercase).peekable();
|
||||
let mut automatons = Vec::new();
|
||||
const NGRAMS: usize = 3;
|
||||
|
||||
while let Some(word) = groups.next() {
|
||||
let has_following_word = groups.peek().is_some();
|
||||
let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
|
||||
automaton::build_dfa(&word)
|
||||
} else {
|
||||
automaton::build_prefix_dfa(&word)
|
||||
};
|
||||
automatons.push(lev);
|
||||
struct Automaton {
|
||||
index: usize,
|
||||
is_synonym: bool,
|
||||
number_words: usize,
|
||||
dfa: DfaExt,
|
||||
}
|
||||
|
||||
impl Automaton {
|
||||
fn synonym(index: usize, number_words: usize, dfa: DfaExt) -> Automaton {
|
||||
Automaton { index, is_synonym: true, number_words, dfa }
|
||||
}
|
||||
|
||||
automatons
|
||||
fn original(index: usize, number_words: usize, dfa: DfaExt) -> Automaton {
|
||||
Automaton { index, is_synonym: false, number_words, dfa }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn normalize_str(string: &str) -> String {
|
||||
let mut string = string.to_lowercase();
|
||||
|
||||
if !string.contains(is_cjk) {
|
||||
string = deunicode::deunicode_with_tofu(&string, "");
|
||||
}
|
||||
|
||||
string
|
||||
}
|
||||
|
||||
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
let mut automatons = Vec::new();
|
||||
|
||||
let synonyms = store.synonyms()?;
|
||||
|
||||
for n in 1..=NGRAMS {
|
||||
let mut index = 0;
|
||||
let mut ngrams = query_words.windows(n).peekable();
|
||||
|
||||
while let Some(ngram) = ngrams.next() {
|
||||
let ngram_nb_words = ngram.len();
|
||||
let ngram = ngram.join(" ");
|
||||
|
||||
let has_following_word = ngrams.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
|
||||
let lev = {
|
||||
let normalized = normalize_str(&ngram);
|
||||
if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }
|
||||
};
|
||||
let mut stream = synonyms.search(&lev).into_stream();
|
||||
while let Some(base) = stream.next() {
|
||||
|
||||
// only trigger alternatives when the last word has been typed
|
||||
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
||||
let base = std::str::from_utf8(base).unwrap();
|
||||
let base_nb_words = split_query_string(base).count();
|
||||
if ngram_nb_words != base_nb_words { continue }
|
||||
|
||||
if let Some(synonyms) = store.alternatives_to(base.as_bytes())? {
|
||||
|
||||
let mut stream = synonyms.into_stream();
|
||||
while let Some(synonyms) = stream.next() {
|
||||
|
||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
||||
let nb_synonym_words = split_query_string(synonyms).count();
|
||||
for synonym in split_query_string(synonyms) {
|
||||
let lev = build_dfa(synonym);
|
||||
let automaton = Automaton::synonym(index, nb_synonym_words, lev);
|
||||
automatons.push((automaton, synonym.to_owned()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if n == 1 {
|
||||
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
|
||||
let automaton = Automaton::original(index, ngram_nb_words, lev);
|
||||
automatons.push((automaton, ngram));
|
||||
}
|
||||
|
||||
index += 1;
|
||||
}
|
||||
}
|
||||
|
||||
automatons.sort_unstable_by(|a, b| (a.0.index, &a.1).cmp(&(b.0.index, &b.1)));
|
||||
automatons.dedup_by(|a, b| (a.0.index, &a.1) == (b.0.index, &b.1));
|
||||
let automatons = automatons.into_iter().map(|(a, _)| a).collect();
|
||||
|
||||
Ok(automatons)
|
||||
}
|
||||
|
||||
pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> {
|
||||
@ -61,7 +137,7 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
|
||||
store: self.store,
|
||||
criteria: self.criteria,
|
||||
searchable_attrs: self.searchable_attrs,
|
||||
filter: Some(function)
|
||||
filter: Some(function),
|
||||
}
|
||||
}
|
||||
|
||||
@ -82,13 +158,13 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
|
||||
where S: Store,
|
||||
{
|
||||
fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
|
||||
let automatons = generate_automatons(query);
|
||||
let automatons = generate_automatons(query, &self.store)?;
|
||||
let words = self.store.words()?.as_fst();
|
||||
|
||||
let mut stream = {
|
||||
let mut op_builder = fst::raw::OpBuilder::new();
|
||||
for automaton in &automatons {
|
||||
let stream = words.search(automaton);
|
||||
for Automaton { dfa, .. } in &automatons {
|
||||
let stream = words.search(dfa);
|
||||
op_builder.push(stream);
|
||||
}
|
||||
op_builder.r#union()
|
||||
@ -98,9 +174,9 @@ where S: Store,
|
||||
|
||||
while let Some((input, indexed_values)) = stream.next() {
|
||||
for iv in indexed_values {
|
||||
let automaton = &automatons[iv.index];
|
||||
let distance = automaton.eval(input).to_u8();
|
||||
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
||||
let Automaton { index, is_synonym, number_words, ref dfa } = automatons[iv.index];
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = (is_synonym && number_words == 1) || (!is_synonym && distance == 0 && input.len() == dfa.query_len());
|
||||
|
||||
let doc_indexes = self.store.word_indexes(input)?;
|
||||
let doc_indexes = match doc_indexes {
|
||||
@ -111,8 +187,8 @@ where S: Store,
|
||||
for di in doc_indexes.as_slice() {
|
||||
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
|
||||
let match_ = Match {
|
||||
query_index: iv.index as u32,
|
||||
distance,
|
||||
query_index: index as u32,
|
||||
distance: distance,
|
||||
attribute: di.attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact,
|
||||
@ -125,8 +201,22 @@ where S: Store,
|
||||
}
|
||||
}
|
||||
|
||||
matches.par_sort_unstable();
|
||||
|
||||
for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) {
|
||||
let mut offset = 0;
|
||||
for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) {
|
||||
let word_index = query_indexes[0].1.word_index - offset as u16;
|
||||
for (_, match_) in query_indexes.iter_mut() {
|
||||
match_.word_index = word_index;
|
||||
}
|
||||
offset += query_indexes.len() - 1;
|
||||
}
|
||||
}
|
||||
|
||||
let total_matches = matches.len();
|
||||
let raw_documents = raw_documents_from_matches(matches);
|
||||
let padded_matches = SetBuf::from_dirty(matches);
|
||||
let raw_documents = raw_documents_from_matches(padded_matches);
|
||||
|
||||
info!("{} total documents to classify", raw_documents.len());
|
||||
info!("{} total matches to classify", total_matches);
|
||||
@ -321,3 +411,621 @@ where S: Store,
|
||||
Ok(out_documents)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use sdset::SetBuf;
|
||||
use fst::{Set, IntoStreamer};
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::store::Store;
|
||||
|
||||
#[derive(Default)]
|
||||
struct InMemorySetStore {
|
||||
set: Set,
|
||||
synonyms: Set,
|
||||
indexes: HashMap<Vec<u8>, SetBuf<DocIndex>>,
|
||||
alternatives: HashMap<Vec<u8>, Set>,
|
||||
}
|
||||
|
||||
fn set_from_stream<'f, I, S>(stream: I) -> Set
|
||||
where
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=&'a [u8]>,
|
||||
S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>,
|
||||
{
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(stream).unwrap();
|
||||
builder.into_inner().and_then(Set::from_bytes).unwrap()
|
||||
}
|
||||
|
||||
fn insert_key(set: &Set, key: &[u8]) -> Set {
|
||||
let unique_key = {
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.insert(key).unwrap();
|
||||
builder.into_inner().and_then(Set::from_bytes).unwrap()
|
||||
};
|
||||
|
||||
let union_ = set.op().add(unique_key.into_stream()).r#union();
|
||||
|
||||
set_from_stream(union_)
|
||||
}
|
||||
|
||||
fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
|
||||
builder.extend_iter(set.into_iter()).unwrap();
|
||||
builder.into_inner().and_then(Set::from_bytes).unwrap()
|
||||
}
|
||||
|
||||
impl InMemorySetStore {
|
||||
pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) {
|
||||
let word = word.to_lowercase();
|
||||
let alternatives = self.alternatives.entry(word.as_bytes().to_vec()).or_default();
|
||||
let new = sdset_into_fstset(&new);
|
||||
*alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union());
|
||||
|
||||
self.synonyms = insert_key(&self.synonyms, word.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for InMemorySetStore {
|
||||
fn from_iter<I: IntoIterator<Item=(&'a str, &'a [DocIndex])>>(iter: I) -> Self {
|
||||
let mut tree = BTreeSet::new();
|
||||
let mut map = HashMap::new();
|
||||
|
||||
for (word, indexes) in iter {
|
||||
let word = word.to_lowercase().into_bytes();
|
||||
tree.insert(word.clone());
|
||||
map.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
|
||||
}
|
||||
|
||||
InMemorySetStore {
|
||||
set: Set::from_iter(tree).unwrap(),
|
||||
synonyms: Set::default(),
|
||||
indexes: map.into_iter().map(|(k, v)| (k, SetBuf::from_dirty(v))).collect(),
|
||||
alternatives: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Store for InMemorySetStore {
|
||||
type Error = std::io::Error;
|
||||
|
||||
fn words(&self) -> Result<&Set, Self::Error> {
|
||||
Ok(&self.set)
|
||||
}
|
||||
|
||||
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
|
||||
Ok(self.indexes.get(word).cloned())
|
||||
}
|
||||
|
||||
fn synonyms(&self) -> Result<&Set, Self::Error> {
|
||||
Ok(&self.synonyms)
|
||||
}
|
||||
|
||||
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error> {
|
||||
Ok(self.alternatives.get(word).map(|s| Set::from_bytes(s.as_fst().to_vec()).unwrap()))
|
||||
}
|
||||
}
|
||||
|
||||
const fn doc_index(document_id: u64, word_index: u16) -> DocIndex {
|
||||
DocIndex {
|
||||
document_id: DocumentId(document_id),
|
||||
attribute: 0,
|
||||
word_index,
|
||||
char_index: 0,
|
||||
char_length: 0,
|
||||
}
|
||||
}
|
||||
|
||||
const fn doc_char_index(document_id: u64, word_index: u16, char_index: u16) -> DocIndex {
|
||||
DocIndex {
|
||||
document_id: DocumentId(document_id),
|
||||
attribute: 0,
|
||||
word_index,
|
||||
char_index,
|
||||
char_length: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
/// A word and the synonym pointing to it both retrieve the same document
fn simple_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        ("hello", &[doc_index(0, 0)][..]),
    ]);

    store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));

    // the indexed word and its synonym must give the exact same answer
    for &query in &["hello", "bonjour"] {
        let builder = QueryBuilder::new(&store);
        let results = builder.query(query, 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            assert_eq!(matches.len(), 1);
            let match_ = matches[0];
            assert_eq!(match_.query_index, 0);
            assert_eq!(match_.word_index, 0);
        });
        assert_matches!(documents.next(), None);
    }
}
|
||||
|
||||
#[test]
/// The prefix of a synonym triggers it, but only when it is the last word typed
fn prefix_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        ("hello", &[doc_index(0, 0)][..]),
    ]);

    store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
    store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));

    // a prefix typed as the last word finds the document
    for &query in &["sal", "bonj"] {
        let builder = QueryBuilder::new(&store);
        let results = builder.query(query, 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            assert_eq!(matches.len(), 1);
            let match_ = matches[0];
            assert_eq!(match_.query_index, 0);
            assert_eq!(match_.word_index, 0);
        });
        assert_matches!(documents.next(), None);
    }

    // the same prefix followed by another word finds nothing
    for &query in &["sal blabla", "bonj blabla"] {
        let builder = QueryBuilder::new(&store);
        let results = builder.query(query, 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), None);
    }
}
|
||||
|
||||
#[test]
/// A synonym is still triggered when it is typed with a typo
fn levenshtein_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        ("hello", &[doc_index(0, 0)][..]),
    ]);

    store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));

    // one substituted letter, then one missing letter
    for &query in &["salutution", "saluttion"] {
        let builder = QueryBuilder::new(&store);
        let results = builder.query(query, 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            assert_eq!(matches.len(), 1);
            let match_ = matches[0];
            assert_eq!(match_.query_index, 0);
            assert_eq!(match_.word_index, 0);
        });
        assert_matches!(documents.next(), None);
    }
}
|
||||
|
||||
#[test]
/// Three words that are all synonyms of each other retrieve the same documents
fn harder_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        ("hello", &[doc_index(0, 0)][..]),
        ("bonjour", &[doc_index(1, 3)]),
        ("salut", &[doc_index(2, 5)]),
    ]);

    store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"]));
    store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello", "salut"]));
    store.add_synonym("salut", SetBuf::from_dirty(vec!["hello", "bonjour"]));

    // whichever of the three words is typed, the three documents are returned
    for &query in &["hello", "bonjour", "salut"] {
        let builder = QueryBuilder::new(&store);
        let results = builder.query(query, 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            assert_eq!(matches.len(), 1);
            let match_ = matches[0];
            assert_eq!(match_.query_index, 0);
            assert_eq!(match_.word_index, 0);
        });
        assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches }) => {
            assert_eq!(matches.len(), 1);
            let match_ = matches[0];
            assert_eq!(match_.query_index, 0);
            assert_eq!(match_.word_index, 3);
        });
        assert_matches!(documents.next(), Some(Document { id: DocumentId(2), matches }) => {
            assert_eq!(matches.len(), 1);
            let match_ = matches[0];
            assert_eq!(match_.query_index, 0);
            assert_eq!(match_.word_index, 5);
        });
        assert_matches!(documents.next(), None);
    }
}
|
||||
|
||||
#[test]
/// Unique word has multi-word synonyms
fn unique_to_multiword_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        ("new", &[doc_char_index(0, 0, 0)][..]),
        ("york", &[doc_char_index(0, 1, 1)][..]),
        ("city", &[doc_char_index(0, 2, 2)][..]),
        ("subway", &[doc_char_index(0, 3, 3)][..]),

        ("NY", &[doc_char_index(1, 0, 0)][..]),
        ("subway", &[doc_char_index(1, 1, 1)][..]),
    ]);

    store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
    store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));

    // both abbreviations must give the exact same answer
    for &query in &["NY subway", "NYC subway"] {
        let builder = QueryBuilder::new(&store);
        let results = builder.query(query, 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
            assert_matches!(matches.next(), None);
        });
        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
            assert_matches!(matches.next(), None);                // position rewritten ^
        });
        assert_matches!(documents.next(), None);
    }
}
|
||||
|
||||
#[test]
/// Unique word has multi-word synonyms, with other words around the matches
fn harder_unique_to_multiword_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        ("new", &[doc_char_index(0, 0, 0)][..]),
        ("york", &[doc_char_index(0, 1, 1)][..]),
        ("city", &[doc_char_index(0, 2, 2)][..]),
        ("yellow", &[doc_char_index(0, 3, 3)][..]),
        ("subway", &[doc_char_index(0, 4, 4)][..]),
        ("broken", &[doc_char_index(0, 5, 5)][..]),

        ("NY", &[doc_char_index(1, 0, 0)][..]),
        ("blue", &[doc_char_index(1, 1, 1)][..]),
        ("subway", &[doc_char_index(1, 2, 2)][..]),
    ]);

    store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
    store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));

    // both abbreviations must give the exact same answer
    for &query in &["NY subway", "NYC subway"] {
        let builder = QueryBuilder::new(&store);
        let results = builder.query(query, 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
            assert_matches!(matches.next(), None);
        });
        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
            assert_matches!(matches.next(), None);                // position rewritten ^
        });
        assert_matches!(documents.next(), None);
    }
}
|
||||
|
||||
#[test]
/// Several unique words of the same query have multi-word synonyms
fn even_harder_unique_to_multiword_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        ("new", &[doc_char_index(0, 0, 0)][..]),
        ("york", &[doc_char_index(0, 1, 1)][..]),
        ("city", &[doc_char_index(0, 2, 2)][..]),
        ("yellow", &[doc_char_index(0, 3, 3)][..]),
        ("underground", &[doc_char_index(0, 4, 4)][..]),
        ("train", &[doc_char_index(0, 5, 5)][..]),
        ("broken", &[doc_char_index(0, 6, 6)][..]),

        ("NY", &[doc_char_index(1, 0, 0)][..]),
        ("blue", &[doc_char_index(1, 1, 1)][..]),
        ("subway", &[doc_char_index(1, 2, 2)][..]),
    ]);

    store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
    store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
    store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));

    // three words typed: the document containing "broken" comes first
    {
        let builder = QueryBuilder::new(&store);
        let results = builder.query("NY subway broken", 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 2, .. })); // underground = subway
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway
            assert_matches!(matches.next(), Some(Match { query_index: 2, word_index: 3, .. })); // broken
            assert_matches!(matches.next(), None);                // position rewritten ^
        });
        assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
            assert_matches!(matches.next(), None);
        });
        assert_matches!(documents.next(), None);
    }

    // two words typed: the document containing the typed words comes first
    {
        let builder = QueryBuilder::new(&store);
        let results = builder.query("NYC subway", 0..20).unwrap();
        let mut documents = results.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
            assert_matches!(matches.next(), None);
        });
        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY
            assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 2, .. })); // underground = subway
            assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway
            assert_matches!(matches.next(), None);                // position rewritten ^
        });
        assert_matches!(documents.next(), None);
    }
}
|
||||
|
||||
#[test]
/// Multi-word queries whose synonyms are single words or other multi-words.
///
/// "new york" and "new york city" are registered as alternatives of each
/// other (together with "NYC" and "NY"), and "underground train" has
/// "subway" as its alternative.
fn multiword_to_multiword_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        // document 0
        ("NY",      &[doc_char_index(0, 0, 0)][..]),
        ("subway",  &[doc_char_index(0, 1, 1)][..]),

        // document 1
        ("NYC",     &[doc_char_index(1, 0, 0)][..]),
        ("blue",    &[doc_char_index(1, 1, 1)][..]),
        ("subway",  &[doc_char_index(1, 2, 2)][..]),
        ("broken",  &[doc_char_index(1, 3, 3)][..]),
    ]);

    store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]));
    store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC", "NY", "new york"]));
    store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"]));

    // First query: the two-word form "new york".
    let short_builder = QueryBuilder::new(&store);
    let short_results = short_builder.query("new york underground train broken", 0..20).unwrap();
    let mut documents = short_results.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york
        assert_matches!(hits.next(), Some(Match { query_index: 2, word_index: 2, .. })); // subway = underground train
        assert_matches!(hits.next(), Some(Match { query_index: 4, word_index: 3, .. })); // broken
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY = new york
        assert_matches!(hits.next(), Some(Match { query_index: 2, word_index: 1, .. })); // subway = underground train
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), None);

    // Second query: the three-word form "new york city"; the expected
    // query indexes of the following words move by one accordingly.
    let long_builder = QueryBuilder::new(&store);
    let long_results = long_builder.query("new york city underground train broken", 0..20).unwrap();
    let mut documents = long_results.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york city
        assert_matches!(hits.next(), Some(Match { query_index: 3, word_index: 2, .. })); // subway = underground train
        assert_matches!(hits.next(), Some(Match { query_index: 5, word_index: 3, .. })); // broken
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY = new york city
        assert_matches!(hits.next(), Some(Match { query_index: 3, word_index: 1, .. })); // subway = underground train
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), None);
}
|
||||
|
||||
#[test]
/// Synonyms declared with accents must be reachable from accented,
/// unaccented and partially accented spellings of the query.
fn deunicoded_synonyms() {
    let mut store = InMemorySetStore::from_iter(vec![
        ("iPhone",    &[doc_index(0, 0)][..]),
        // meilidb-data indexes both the deunicoded word and the original
        // word, using the same DocIndex for the two entries.
        ("telephone", &[doc_index(1, 0)][..]),
        ("téléphone", &[doc_index(1, 0)][..]),
    ]);

    store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));

    // Unaccented query.
    let plain_builder = QueryBuilder::new(&store);
    let plain_results = plain_builder.query("telephone", 0..20).unwrap();
    let mut documents = plain_results.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(hits.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), None);

    // Fully accented query: same expectations as the unaccented one.
    let accented_builder = QueryBuilder::new(&store);
    let accented_results = accented_builder.query("téléphone", 0..20).unwrap();
    let mut documents = accented_results.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(hits.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), None);

    // Partially accented query: document 1 is expected to yield a single
    // match here instead of two.
    let mixed_builder = QueryBuilder::new(&store);
    let mixed_results = mixed_builder.query("télephone", 0..20).unwrap();
    let mut documents = mixed_results.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches: hits }) => {
        let mut hits = hits.into_iter();
        assert_matches!(hits.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), None);
}
|
||||
}
|
||||
|
@ -8,6 +8,9 @@ pub trait Store {
|
||||
|
||||
fn words(&self) -> Result<&Set, Self::Error>;
|
||||
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error>;
|
||||
|
||||
fn synonyms(&self) -> Result<&Set, Self::Error>;
|
||||
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error>;
|
||||
}
|
||||
|
||||
impl<T> Store for &'_ T where T: Store {
|
||||
@ -20,4 +23,12 @@ impl<T> Store for &'_ T where T: Store {
|
||||
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
|
||||
(*self).word_indexes(word)
|
||||
}
|
||||
|
||||
fn synonyms(&self) -> Result<&Set, Self::Error> {
|
||||
(*self).synonyms()
|
||||
}
|
||||
|
||||
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error> {
|
||||
(*self).alternatives_to(word)
|
||||
}
|
||||
}
|
||||
|
@ -120,11 +120,12 @@ impl<'a> DocumentsAddition<'a> {
|
||||
|
||||
// update the "consistent" view of the Index
|
||||
let ranked_map = self.ranked_map;
|
||||
let synonyms = fst::Set::from_bytes(lease_inner.synonyms.as_fst().to_vec()).unwrap(); // clone()
|
||||
let schema = lease_inner.schema.clone();
|
||||
let raw = lease_inner.raw.clone();
|
||||
lease_inner.raw.compact();
|
||||
|
||||
let inner = InnerIndex { words, schema, ranked_map, raw };
|
||||
let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
|
||||
self.inner.0.store(Arc::new(inner));
|
||||
|
||||
Ok(())
|
||||
|
@ -119,11 +119,12 @@ impl<'a> DocumentsDeletion<'a> {
|
||||
|
||||
// update the "consistent" view of the Index
|
||||
let ranked_map = lease_inner.ranked_map.clone();
|
||||
let synonyms = fst::Set::from_bytes(lease_inner.synonyms.as_fst().to_vec()).unwrap(); // clone()
|
||||
let schema = lease_inner.schema.clone();
|
||||
let raw = lease_inner.raw.clone();
|
||||
lease_inner.raw.compact();
|
||||
|
||||
let inner = InnerIndex { words, schema, ranked_map, raw };
|
||||
let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
|
||||
self.inner.0.store(Arc::new(inner));
|
||||
|
||||
Ok(())
|
||||
|
@ -13,7 +13,11 @@ use crate::ranked_map::RankedMap;
|
||||
use crate::serde::Deserializer;
|
||||
|
||||
use super::{Error, CustomSettings};
|
||||
use super::{RawIndex, DocumentsAddition, DocumentsDeletion};
|
||||
use super::{
|
||||
RawIndex,
|
||||
DocumentsAddition, DocumentsDeletion,
|
||||
SynonymsAddition, SynonymsDeletion,
|
||||
};
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct IndexStats {
|
||||
@ -27,6 +31,7 @@ pub struct Index(pub ArcSwap<InnerIndex>);
|
||||
|
||||
pub struct InnerIndex {
|
||||
pub words: fst::Set,
|
||||
pub synonyms: fst::Set,
|
||||
pub schema: Schema,
|
||||
pub ranked_map: RankedMap,
|
||||
pub raw: RawIndex, // TODO this will be a snapshot in the future
|
||||
@ -39,6 +44,11 @@ impl Index {
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
let synonyms = match raw.main.synonyms_set()? {
|
||||
Some(synonyms) => synonyms,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
let schema = match raw.main.schema()? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
@ -49,7 +59,7 @@ impl Index {
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let inner = InnerIndex { words, schema, ranked_map, raw };
|
||||
let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
|
||||
let index = Index(ArcSwap::new(Arc::new(inner)));
|
||||
|
||||
Ok(index)
|
||||
@ -101,6 +111,14 @@ impl Index {
|
||||
DocumentsDeletion::new(self, ranked_map)
|
||||
}
|
||||
|
||||
pub fn synonyms_addition(&self) -> SynonymsAddition {
|
||||
SynonymsAddition::new(self)
|
||||
}
|
||||
|
||||
pub fn synonyms_deletion(&self) -> SynonymsDeletion {
|
||||
SynonymsDeletion::new(self)
|
||||
}
|
||||
|
||||
pub fn document<T>(
|
||||
&self,
|
||||
fields: Option<&HashSet<&str>>,
|
||||
@ -141,4 +159,12 @@ impl Store for IndexLease {
|
||||
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
|
||||
Ok(self.0.raw.words.doc_indexes(word)?)
|
||||
}
|
||||
|
||||
/// Returns a reference to the `synonyms` set reachable through `self.0`.
fn synonyms(&self) -> Result<&fst::Set, Self::Error> {
    let synonyms = &self.0.synonyms;
    Ok(synonyms)
}
|
||||
|
||||
/// Looks up the alternatives registered for `word` in the raw synonyms
/// index, returning `None` when that index has nothing for it.
fn alternatives_to(&self, word: &[u8]) -> Result<Option<fst::Set>, Self::Error> {
    let alternatives = self.0.raw.synonyms.alternatives_to(word)?;
    Ok(alternatives)
}
|
||||
}
|
||||
|
@ -44,6 +44,22 @@ impl MainIndex {
|
||||
self.0.set("words", value.as_fst().as_bytes()).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub fn synonyms_set(&self) -> Result<Option<fst::Set>, Error> {
|
||||
match self.0.get_pinned("synonyms")? {
|
||||
Some(bytes) => {
|
||||
let len = bytes.len();
|
||||
let value = Arc::from(bytes.as_ref());
|
||||
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Stores the raw bytes of `value` under the "synonyms" key.
pub fn set_synonyms_set(&self, value: &fst::Set) -> Result<(), Error> {
    let bytes = value.as_fst().as_bytes();
    self.0.set("synonyms", bytes).map_err(Into::into)
}
|
||||
|
||||
pub fn ranked_map(&self) -> Result<Option<RankedMap>, Error> {
|
||||
match self.0.get_pinned("ranked-map")? {
|
||||
Some(bytes) => {
|
||||
|
@ -13,6 +13,9 @@ mod error;
|
||||
mod index;
|
||||
mod main_index;
|
||||
mod raw_index;
|
||||
mod synonyms_addition;
|
||||
mod synonyms_deletion;
|
||||
mod synonyms_index;
|
||||
mod words_index;
|
||||
|
||||
pub use self::error::Error;
|
||||
@ -22,11 +25,14 @@ pub use self::custom_settings::CustomSettings;
|
||||
use self::docs_words_index::DocsWordsIndex;
|
||||
use self::documents_addition::DocumentsAddition;
|
||||
use self::documents_deletion::DocumentsDeletion;
|
||||
use self::synonyms_addition::SynonymsAddition;
|
||||
use self::synonyms_deletion::SynonymsDeletion;
|
||||
use self::documents_index::DocumentsIndex;
|
||||
use self::index::InnerIndex;
|
||||
use self::main_index::MainIndex;
|
||||
use self::raw_index::{RawIndex, InnerRawIndex};
|
||||
use self::words_index::WordsIndex;
|
||||
use self::synonyms_index::SynonymsIndex;
|
||||
|
||||
pub struct Database {
|
||||
cache: RwLock<HashMap<String, Arc<Index>>>,
|
||||
@ -99,6 +105,12 @@ impl Database {
|
||||
MainIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(name)))
|
||||
};
|
||||
|
||||
let synonyms = {
|
||||
let cf_name = format!("{}-synonyms", name);
|
||||
self.inner.cf_handle(&cf_name).expect("cf not found");
|
||||
SynonymsIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name)))
|
||||
};
|
||||
|
||||
let words = {
|
||||
let cf_name = format!("{}-words", name);
|
||||
self.inner.cf_handle(&cf_name).expect("cf not found");
|
||||
@ -123,7 +135,7 @@ impl Database {
|
||||
CustomSettings(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name)))
|
||||
};
|
||||
|
||||
let raw_index = RawIndex { main, words, docs_words, documents, custom };
|
||||
let raw_index = RawIndex { main, synonyms, words, docs_words, documents, custom };
|
||||
let index = Index::from_raw(raw_index)?;
|
||||
|
||||
vacant.insert(Arc::new(index)).clone()
|
||||
@ -154,6 +166,12 @@ impl Database {
|
||||
|
||||
main.set_schema(&schema)?;
|
||||
|
||||
let synonyms = {
|
||||
let cf_name = format!("{}-synonyms", name);
|
||||
self.inner.create_cf(&cf_name, &rocksdb::Options::default())?;
|
||||
SynonymsIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name)))
|
||||
};
|
||||
|
||||
let words = {
|
||||
let cf_name = format!("{}-words", name);
|
||||
self.inner.create_cf(&cf_name, &rocksdb::Options::default())?;
|
||||
@ -182,7 +200,7 @@ impl Database {
|
||||
indexes.insert(name.to_string());
|
||||
self.set_indexes(&indexes)?;
|
||||
|
||||
let raw_index = RawIndex { main, words, docs_words, documents, custom };
|
||||
let raw_index = RawIndex { main, synonyms, words, docs_words, documents, custom };
|
||||
let index = Index::from_raw(raw_index)?;
|
||||
|
||||
vacant.insert(Arc::new(index)).clone()
|
||||
|
@ -1,9 +1,10 @@
|
||||
use std::sync::Arc;
|
||||
use super::{MainIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings};
|
||||
use super::{MainIndex, SynonymsIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RawIndex {
|
||||
pub main: MainIndex,
|
||||
pub synonyms: SynonymsIndex,
|
||||
pub words: WordsIndex,
|
||||
pub docs_words: DocsWordsIndex,
|
||||
pub documents: DocumentsIndex,
|
||||
@ -13,6 +14,7 @@ pub struct RawIndex {
|
||||
impl RawIndex {
|
||||
pub(crate) fn compact(&self) {
|
||||
self.main.0.compact_range(None::<&[u8]>, None::<&[u8]>);
|
||||
self.synonyms.0.compact_range(None::<&[u8]>, None::<&[u8]>);
|
||||
self.words.0.compact_range(None::<&[u8]>, None::<&[u8]>);
|
||||
self.docs_words.0.compact_range(None::<&[u8]>, None::<&[u8]>);
|
||||
self.documents.0.compact_range(None::<&[u8]>, None::<&[u8]>);
|
||||
|
86
meilidb-data/src/database/synonyms_addition.rs
Normal file
86
meilidb-data/src/database/synonyms_addition.rs
Normal file
@ -0,0 +1,86 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use meilidb_core::normalize_str;
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::database::index::InnerIndex;
|
||||
use super::{Error, Index};
|
||||
|
||||
pub struct SynonymsAddition<'a> {
|
||||
inner: &'a Index,
|
||||
synonyms: BTreeMap<String, Vec<String>>,
|
||||
}
|
||||
|
||||
impl<'a> SynonymsAddition<'a> {
|
||||
pub fn new(inner: &'a Index) -> SynonymsAddition<'a> {
|
||||
SynonymsAddition { inner, synonyms: BTreeMap::new() }
|
||||
}
|
||||
|
||||
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: Iterator<Item=T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
|
||||
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
|
||||
}
|
||||
|
||||
pub fn finalize(self) -> Result<(), Error> {
|
||||
let lease_inner = self.inner.lease_inner();
|
||||
let synonyms = &lease_inner.raw.synonyms;
|
||||
let main = &lease_inner.raw.main;
|
||||
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
|
||||
for (synonym, alternatives) in self.synonyms {
|
||||
synonyms_builder.insert(&synonym).unwrap();
|
||||
|
||||
let alternatives = {
|
||||
let alternatives = SetBuf::from_dirty(alternatives);
|
||||
let mut alternatives_builder = SetBuilder::memory();
|
||||
alternatives_builder.extend_iter(alternatives).unwrap();
|
||||
alternatives_builder.into_inner().unwrap()
|
||||
};
|
||||
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
|
||||
}
|
||||
|
||||
let delta_synonyms = synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let synonyms = match main.synonyms_set()? {
|
||||
Some(synonyms) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(synonyms.stream())
|
||||
.add(delta_synonyms.stream())
|
||||
.r#union();
|
||||
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
synonyms_builder.extend_stream(op).unwrap();
|
||||
synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
None => delta_synonyms,
|
||||
};
|
||||
|
||||
main.set_synonyms_set(&synonyms)?;
|
||||
|
||||
// update the "consistent" view of the Index
|
||||
let words = main.words_set()?.unwrap_or_default();
|
||||
let ranked_map = lease_inner.ranked_map.clone();;
|
||||
let schema = lease_inner.schema.clone();
|
||||
let raw = lease_inner.raw.clone();
|
||||
lease_inner.raw.compact();
|
||||
|
||||
let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
|
||||
self.inner.0.store(Arc::new(inner));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
129
meilidb-data/src/database/synonyms_deletion.rs
Normal file
129
meilidb-data/src/database/synonyms_deletion.rs
Normal file
@ -0,0 +1,129 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::iter::FromIterator;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use meilidb_core::normalize_str;
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::database::index::InnerIndex;
|
||||
use super::{Error, Index};
|
||||
|
||||
pub struct SynonymsDeletion<'a> {
|
||||
inner: &'a Index,
|
||||
synonyms: BTreeMap<String, Option<Vec<String>>>,
|
||||
}
|
||||
|
||||
impl<'a> SynonymsDeletion<'a> {
|
||||
pub fn new(inner: &'a Index) -> SynonymsDeletion<'a> {
|
||||
SynonymsDeletion { inner, synonyms: BTreeMap::new() }
|
||||
}
|
||||
|
||||
pub fn delete_all_alternatives_of<S: AsRef<str>>(&mut self, synonym: S) {
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
self.synonyms.insert(synonym, None);
|
||||
}
|
||||
|
||||
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: Iterator<Item=T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let value = self.synonyms.entry(synonym).or_insert(None);
|
||||
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
|
||||
match value {
|
||||
Some(v) => v.extend(alternatives),
|
||||
None => *value = Some(Vec::from_iter(alternatives)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finalize(self) -> Result<(), Error> {
|
||||
let lease_inner = self.inner.lease_inner();
|
||||
let synonyms = &lease_inner.raw.synonyms;
|
||||
let main = &lease_inner.raw.main;
|
||||
|
||||
let mut delete_whole_synonym_builder = SetBuilder::memory();
|
||||
|
||||
for (synonym, alternatives) in self.synonyms {
|
||||
match alternatives {
|
||||
Some(alternatives) => {
|
||||
let prev_alternatives = synonyms.alternatives_to(synonym.as_bytes())?;
|
||||
let prev_alternatives = match prev_alternatives {
|
||||
Some(alternatives) => alternatives,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let delta_alternatives = {
|
||||
let alternatives = SetBuf::from_dirty(alternatives);
|
||||
let mut builder = SetBuilder::memory();
|
||||
builder.extend_iter(alternatives).unwrap();
|
||||
builder.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let op = OpBuilder::new()
|
||||
.add(prev_alternatives.stream())
|
||||
.add(delta_alternatives.stream())
|
||||
.difference();
|
||||
|
||||
let (alternatives, empty_alternatives) = {
|
||||
let mut builder = SetBuilder::memory();
|
||||
let len = builder.get_ref().len();
|
||||
builder.extend_stream(op).unwrap();
|
||||
let is_empty = len == builder.get_ref().len();
|
||||
let alternatives = builder.into_inner().unwrap();
|
||||
(alternatives, is_empty)
|
||||
};
|
||||
|
||||
if empty_alternatives {
|
||||
delete_whole_synonym_builder.insert(synonym.as_bytes())?;
|
||||
} else {
|
||||
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
delete_whole_synonym_builder.insert(&synonym).unwrap();
|
||||
synonyms.del_alternatives_of(synonym.as_bytes())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let delta_synonyms = delete_whole_synonym_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let synonyms = match main.synonyms_set()? {
|
||||
Some(synonyms) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(synonyms.stream())
|
||||
.add(delta_synonyms.stream())
|
||||
.difference();
|
||||
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
synonyms_builder.extend_stream(op).unwrap();
|
||||
synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
main.set_synonyms_set(&synonyms)?;
|
||||
|
||||
// update the "consistent" view of the Index
|
||||
let words = main.words_set()?.unwrap_or_default();
|
||||
let ranked_map = lease_inner.ranked_map.clone();
|
||||
let schema = lease_inner.schema.clone();
|
||||
let raw = lease_inner.raw.clone();
|
||||
lease_inner.raw.compact();
|
||||
|
||||
let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
|
||||
self.inner.0.store(Arc::new(inner));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
23
meilidb-data/src/database/synonyms_index.rs
Normal file
23
meilidb-data/src/database/synonyms_index.rs
Normal file
@ -0,0 +1,23 @@
|
||||
use crate::database::raw_index::InnerRawIndex;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SynonymsIndex(pub(crate) InnerRawIndex);
|
||||
|
||||
impl SynonymsIndex {
|
||||
pub fn alternatives_to(&self, word: &[u8]) -> Result<Option<fst::Set>, rocksdb::Error> {
|
||||
match self.0.get(word)? {
|
||||
Some(vector) => Ok(Some(fst::Set::from_bytes(vector.to_vec()).unwrap())),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_alternatives_to(&self, word: &[u8], value: Vec<u8>) -> Result<(), rocksdb::Error> {
|
||||
self.0.set(word, value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn del_alternatives_of(&self, word: &[u8]) -> Result<(), rocksdb::Error> {
|
||||
self.0.delete(word)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user