mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Merge branch 'moving-to-lmdb'
This commit is contained in:
commit
f56636e1e9
78 changed files with 3369 additions and 3351 deletions
|
@ -1,44 +0,0 @@
|
|||
use lazy_static::lazy_static;
|
||||
use levenshtein_automata::{
|
||||
LevenshteinAutomatonBuilder as LevBuilder,
|
||||
DFA,
|
||||
};
|
||||
|
||||
lazy_static! {
|
||||
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
|
||||
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
|
||||
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
enum PrefixSetting {
|
||||
Prefix,
|
||||
NoPrefix,
|
||||
}
|
||||
|
||||
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
|
||||
use self::PrefixSetting::{Prefix, NoPrefix};
|
||||
|
||||
match query.len() {
|
||||
0 ..= 4 => match setting {
|
||||
Prefix => LEVDIST0.build_prefix_dfa(query),
|
||||
NoPrefix => LEVDIST0.build_dfa(query),
|
||||
},
|
||||
5 ..= 8 => match setting {
|
||||
Prefix => LEVDIST1.build_prefix_dfa(query),
|
||||
NoPrefix => LEVDIST1.build_dfa(query),
|
||||
},
|
||||
_ => match setting {
|
||||
Prefix => LEVDIST2.build_prefix_dfa(query),
|
||||
NoPrefix => LEVDIST2.build_dfa(query),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build_prefix_dfa(query: &str) -> DFA {
|
||||
build_dfa_with_setting(query, PrefixSetting::Prefix)
|
||||
}
|
||||
|
||||
pub fn build_dfa(query: &str) -> DFA {
|
||||
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
|
||||
}
|
51
meilidb-core/src/automaton/dfa.rs
Normal file
51
meilidb-core/src/automaton/dfa.rs
Normal file
|
@ -0,0 +1,51 @@
|
|||
use once_cell::sync::OnceCell;
|
||||
use levenshtein_automata::{
|
||||
LevenshteinAutomatonBuilder as LevBuilder,
|
||||
DFA,
|
||||
};
|
||||
|
||||
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
|
||||
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
|
||||
static LEVDIST2: OnceCell<LevBuilder> = OnceCell::new();
|
||||
|
||||
/// Tells `build_dfa_with_setting` which builder method to call.
#[derive(Copy, Clone)]
enum PrefixSetting {
    /// Build the automaton with `LevBuilder::build_prefix_dfa`.
    Prefix,
    /// Build the automaton with `LevBuilder::build_dfa`.
    NoPrefix,
}
|
||||
|
||||
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
|
||||
use PrefixSetting::{Prefix, NoPrefix};
|
||||
|
||||
match query.len() {
|
||||
0 ..= 4 => {
|
||||
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, false));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
5 ..= 8 => {
|
||||
let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, false));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
_ => {
|
||||
let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, false));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build_prefix_dfa(query: &str) -> DFA {
|
||||
build_dfa_with_setting(query, PrefixSetting::Prefix)
|
||||
}
|
||||
|
||||
pub fn build_dfa(query: &str) -> DFA {
|
||||
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
|
||||
}
|
219
meilidb-core/src/automaton/mod.rs
Normal file
219
meilidb-core/src/automaton/mod.rs
Normal file
|
@ -0,0 +1,219 @@
|
|||
mod dfa;
|
||||
mod query_enhancer;
|
||||
|
||||
use std::cmp::Reverse;
|
||||
use std::vec;
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use levenshtein_automata::DFA;
|
||||
use meilidb_tokenizer::{split_query_string, is_cjk};
|
||||
|
||||
use crate::store;
|
||||
use crate::error::MResult;
|
||||
|
||||
use self::dfa::{build_dfa, build_prefix_dfa};
|
||||
use self::query_enhancer::QueryEnhancerBuilder;
|
||||
pub use self::query_enhancer::QueryEnhancer;
|
||||
|
||||
const NGRAMS: usize = 3;
|
||||
|
||||
pub struct AutomatonProducer {
|
||||
automatons: Vec<Vec<Automaton>>,
|
||||
}
|
||||
|
||||
impl AutomatonProducer {
|
||||
pub fn new(
|
||||
reader: &impl rkv::Readable,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
) -> MResult<(AutomatonProducer, QueryEnhancer)>
|
||||
{
|
||||
let (automatons, query_enhancer) = generate_automatons(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
|
||||
Ok((AutomatonProducer { automatons }, query_enhancer))
|
||||
}
|
||||
|
||||
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
|
||||
self.automatons.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Automaton {
|
||||
pub index: usize,
|
||||
pub ngram: usize,
|
||||
pub query_len: usize,
|
||||
pub is_exact: bool,
|
||||
pub is_prefix: bool,
|
||||
pub query: String,
|
||||
}
|
||||
|
||||
impl Automaton {
|
||||
pub fn dfa(&self) -> DFA {
|
||||
if self.is_prefix {
|
||||
build_prefix_dfa(&self.query)
|
||||
} else {
|
||||
build_dfa(&self.query)
|
||||
}
|
||||
}
|
||||
|
||||
fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
|
||||
Automaton {
|
||||
index,
|
||||
ngram,
|
||||
query_len: query.len(),
|
||||
is_exact: true,
|
||||
is_prefix: false,
|
||||
query: query.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
|
||||
Automaton {
|
||||
index,
|
||||
ngram,
|
||||
query_len: query.len(),
|
||||
is_exact: true,
|
||||
is_prefix: true,
|
||||
query: query.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
|
||||
Automaton {
|
||||
index,
|
||||
ngram,
|
||||
query_len: query.len(),
|
||||
is_exact: false,
|
||||
is_prefix: false,
|
||||
query: query.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn normalize_str(string: &str) -> String {
|
||||
let mut string = string.to_lowercase();
|
||||
|
||||
if !string.contains(is_cjk) {
|
||||
string = deunicode::deunicode_with_tofu(&string, "");
|
||||
}
|
||||
|
||||
string
|
||||
}
|
||||
|
||||
fn generate_automatons(
|
||||
reader: &impl rkv::Readable,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
synonym_store: store::Synonyms,
|
||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)>
|
||||
{
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||
Some(synonym) => synonym,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
let mut automaton_index = 0;
|
||||
let mut automatons = Vec::new();
|
||||
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
|
||||
|
||||
// We must not declare the original words to the query enhancer
|
||||
// *but* we need to push them in the automatons list first
|
||||
let mut original_automatons = Vec::new();
|
||||
let mut original_words = query_words.iter().peekable();
|
||||
while let Some(word) = original_words.next() {
|
||||
|
||||
let has_following_word = original_words.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
||||
|
||||
let automaton = if not_prefix_dfa {
|
||||
Automaton::exact(automaton_index, 1, word)
|
||||
} else {
|
||||
Automaton::prefix_exact(automaton_index, 1, word)
|
||||
};
|
||||
automaton_index += 1;
|
||||
original_automatons.push(automaton);
|
||||
}
|
||||
|
||||
automatons.push(original_automatons);
|
||||
|
||||
for n in 1..=NGRAMS {
|
||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||
while let Some((query_index, ngram_slice)) = ngrams.next() {
|
||||
|
||||
let query_range = query_index..query_index + n;
|
||||
let ngram_nb_words = ngram_slice.len();
|
||||
let ngram = ngram_slice.join(" ");
|
||||
|
||||
let has_following_word = ngrams.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
|
||||
// automaton of synonyms of the ngrams
|
||||
let normalized = normalize_str(&ngram);
|
||||
let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) };
|
||||
|
||||
let mut stream = synonyms.search(&lev).into_stream();
|
||||
while let Some(base) = stream.next() {
|
||||
|
||||
// only trigger alternatives when the last word has been typed
|
||||
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
||||
let base = std::str::from_utf8(base).unwrap();
|
||||
let base_nb_words = split_query_string(base).count();
|
||||
if ngram_nb_words != base_nb_words { continue }
|
||||
|
||||
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
|
||||
|
||||
let mut stream = synonyms.into_stream();
|
||||
while let Some(synonyms) = stream.next() {
|
||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
||||
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
|
||||
let nb_synonym_words = synonyms_words.len();
|
||||
|
||||
let real_query_index = automaton_index;
|
||||
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
|
||||
|
||||
for synonym in synonyms_words {
|
||||
let automaton = if nb_synonym_words == 1 {
|
||||
Automaton::exact(automaton_index, n, synonym)
|
||||
} else {
|
||||
Automaton::non_exact(automaton_index, n, synonym)
|
||||
};
|
||||
automaton_index += 1;
|
||||
automatons.push(vec![automaton]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if n != 1 {
|
||||
// automaton of concatenation of query words
|
||||
let concat = ngram_slice.concat();
|
||||
let normalized = normalize_str(&concat);
|
||||
|
||||
let real_query_index = automaton_index;
|
||||
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
|
||||
|
||||
let automaton = Automaton::exact(automaton_index, n, &normalized);
|
||||
automaton_index += 1;
|
||||
automatons.push(vec![automaton]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// order automatons, the most important first,
|
||||
// we keep the original automatons at the front.
|
||||
automatons[1..].sort_unstable_by_key(|a| {
|
||||
let a = a.first().unwrap();
|
||||
(Reverse(a.is_exact), Reverse(a.ngram))
|
||||
});
|
||||
|
||||
Ok((automatons, enhancer_builder.build()))
|
||||
}
|
398
meilidb-core/src/automaton/query_enhancer.rs
Normal file
398
meilidb-core/src/automaton/query_enhancer.rs
Normal file
|
@ -0,0 +1,398 @@
|
|||
use std::ops::Range;
|
||||
use std::cmp::Ordering::{Less, Greater, Equal};
|
||||
|
||||
/// Returns `true` if the specified range can accept the given replacement words.
///
/// Returns `false` when the replacement does not contain strictly more words
/// than the range it replaces, or when the original query already contains
/// the replacement words starting at `range.start`.
//
//
// ## Ignored because already present in original
//
//     new york city subway
//     -------- ^^^^
//    /          \
//   [new york city]
//
//
// ## Ignored because smaller than the original
//
//     new york city subway
//     -------------
//      \          /
//       [new york]
//
//
// ## Accepted because bigger than the original
//
//            NYC subway
//            ---
//           /   \
//          /     \
//         /       \
//        /         \
//       /           \
//      [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where
    S: AsRef<str>,
    T: AsRef<str>,
{
    // a replacement must be strictly longer than the range it replaces
    let grows = words.len() > range.len();
    if !grows {
        return false;
    }

    // the part of the query starting at the range,
    // cut to the length of the replacement
    let candidate = query.iter().skip(range.start).take(words.len());

    // the replacement is useless when the query already spells it
    let already_present = candidate
        .map(AsRef::as_ref)
        .eq(words.iter().map(AsRef::as_ref));

    !already_present
}
|
||||
|
||||
type Origin = usize;
type RealLength = usize;

/// A sorted list of intervals standing in for an interval tree.
///
/// Intervals may overlap and may even be identical; lookups resolve such
/// ambiguities deterministically (see `query`).
struct FakeIntervalTree {
    intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}

impl FakeIntervalTree {
    /// Sorts the intervals by `(start, end)`.
    ///
    /// The sort is stable so that intervals with identical bounds keep
    /// their insertion order, which `query` relies on to break ties.
    fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
        intervals.sort_by_key(|(r, _)| (r.start, r.end));
        FakeIntervalTree { intervals }
    }

    /// Returns the interval containing `point` along with its value,
    /// or `None` if no interval contains it.
    ///
    /// When several intervals contain `point` the last one in sorted order
    /// is returned: the one with the greatest start, then the greatest end,
    /// then the one inserted last. A plain binary search for "an interval
    /// containing the point" would return an unspecified one of them.
    fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
        // The comparator never answers `Equal`, so the search always fails
        // and reports the index of the first interval starting after `point`.
        let upper = self
            .intervals
            .binary_search_by(|(r, _)| match r.start.cmp(&point) {
                Less | Equal => Less,
                Greater => Greater,
            })
            .unwrap_or_else(|insertion| insertion);

        // Every interval before `upper` starts at or before `point`;
        // walk back to the last one that also ends after it.
        self.intervals[..upper]
            .iter()
            .rev()
            .find(|(r, _)| r.contains(&point))
            .map(|(r, value)| (r.clone(), *value))
    }
}
|
||||
|
||||
pub struct QueryEnhancerBuilder<'a, S> {
|
||||
query: &'a [S],
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
||||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
||||
// we initialize origins query indices based on their positions
|
||||
let origins: Vec<_> = (0..query.len() + 1).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
|
||||
|
||||
QueryEnhancerBuilder { query, origins, real_to_origin }
|
||||
}
|
||||
|
||||
/// Update the final real to origin query indices mapping.
|
||||
///
|
||||
/// `range` is the original words range that this `replacement` words replace
|
||||
/// and `real` is the first real query index of these replacement words.
|
||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
||||
where T: AsRef<str>,
|
||||
{
|
||||
// check if the range of original words
|
||||
// can be rewritten with the replacement words
|
||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
||||
|
||||
// this range can be replaced so we need to
|
||||
// modify the origins accordingly
|
||||
let offset = replacement.len() - range.len();
|
||||
|
||||
let previous_padding = self.origins[range.end - 1];
|
||||
let current_offset = (self.origins[range.end] - 1) - previous_padding;
|
||||
let diff = offset.saturating_sub(current_offset);
|
||||
self.origins[range.end] += diff;
|
||||
|
||||
for r in &mut self.origins[range.end + 1..] {
|
||||
*r += diff;
|
||||
}
|
||||
}
|
||||
|
||||
// we need to store the real number and origins relations
|
||||
// this way it will be possible to know by how many
|
||||
// we need to pad real query indices
|
||||
let real_range = real..real + replacement.len().max(range.len());
|
||||
let real_length = replacement.len();
|
||||
self.real_to_origin.push((real_range, (range.start, real_length)));
|
||||
}
|
||||
|
||||
pub fn build(self) -> QueryEnhancer {
|
||||
QueryEnhancer {
|
||||
origins: self.origins,
|
||||
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct QueryEnhancer {
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: FakeIntervalTree,
|
||||
}
|
||||
|
||||
impl QueryEnhancer {
|
||||
/// Returns the query indices to use to replace this real query index.
|
||||
pub fn replacement(&self, real: u32) -> Range<u32> {
|
||||
let real = real as usize;
|
||||
|
||||
// query the fake interval tree with the real query index
|
||||
let (range, (origin, real_length)) =
|
||||
self.real_to_origin
|
||||
.query(real)
|
||||
.expect("real has never been declared");
|
||||
|
||||
// if `real` is the end bound of the range
|
||||
if (range.start + real_length - 1) == real {
|
||||
let mut count = range.len();
|
||||
let mut new_origin = origin;
|
||||
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
|
||||
let len = slice[1] - slice[0];
|
||||
count = count.saturating_sub(len);
|
||||
if count == 0 { new_origin = origin + i; break }
|
||||
}
|
||||
|
||||
let n = real - range.start;
|
||||
let start = self.origins[origin];
|
||||
let end = self.origins[new_origin + 1];
|
||||
let remaining = (end - start) - n;
|
||||
|
||||
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
|
||||
|
||||
} else {
|
||||
// just return the origin along with
|
||||
// the real position of the word
|
||||
let n = real as usize - range.start;
|
||||
let origin = self.origins[origin];
|
||||
|
||||
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::ops::Range;

    use super::*;

    /// Checks `replacement(i)` against `expected[i]` for every real index `i`.
    /// The word next to each range is only used to label a failure.
    fn assert_replacements(enhancer: &QueryEnhancer, expected: &[(Range<u32>, &str)]) {
        for (real, (range, word)) in expected.iter().enumerate() {
            assert_eq!(
                &enhancer.replacement(real as u32),
                range,
                "real index {} ({})",
                real,
                word,
            );
        }
    }

    #[test]
    fn original_unmodified() {
        let query = ["new", "york", "city", "subway"];
        //             0       1       2        3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // new york = new york city (real indices 4 5 6)
        builder.declare(0..2, 4, &["new", "york", "city"]);

        let enhancer = builder.build();

        assert_replacements(&enhancer, &[
            (0..1, "new"),
            (1..2, "york"),
            (2..3, "city"),
            (3..4, "subway"),
            (0..1, "new"),
            (1..2, "york"),
            (2..3, "city"),
        ]);
    }

    #[test]
    fn simple_growing() {
        let query = ["new", "york", "subway"];
        //             0       1        2
        let mut builder = QueryEnhancerBuilder::new(&query);

        // new york = new york city (real indices 3 4 5)
        builder.declare(0..2, 3, &["new", "york", "city"]);

        let enhancer = builder.build();

        assert_replacements(&enhancer, &[
            (0..1, "new"),
            (1..3, "york"),
            (3..4, "subway"),
            (0..1, "new"),
            (1..2, "york"),
            (2..3, "city"),
        ]);
    }

    #[test]
    fn same_place_growings() {
        let query = ["NY", "subway"];
        //             0      1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NY = new york (real indices 2 3)
        builder.declare(0..1, 2, &["new", "york"]);

        // NY = new york city (real indices 4 5 6)
        builder.declare(0..1, 4, &["new", "york", "city"]);

        // NY = NYC (real index 7)
        builder.declare(0..1, 7, &["NYC"]);

        // NY = new york city (real indices 8 9 10)
        builder.declare(0..1, 8, &["new", "york", "city"]);

        // subway = underground train (real indices 11 12)
        builder.declare(1..2, 11, &["underground", "train"]);

        let enhancer = builder.build();

        assert_replacements(&enhancer, &[
            (0..3, "NY"),
            (3..5, "subway"),
            (0..1, "new"),
            (1..3, "york"),
            (0..1, "new"),
            (1..2, "york"),
            (2..3, "city"),
            (0..3, "NYC"),
            (0..1, "new"),
            (1..2, "york"),
            (2..3, "city"),
            (3..4, "underground"),
            (4..5, "train"),
        ]);
    }

    #[test]
    fn bigger_growing() {
        let query = ["NYC", "subway"];
        //             0        1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city (real indices 2 3 4)
        builder.declare(0..1, 2, &["new", "york", "city"]);

        let enhancer = builder.build();

        assert_replacements(&enhancer, &[
            (0..3, "NYC"),
            (3..4, "subway"),
            (0..1, "new"),
            (1..2, "york"),
            (2..3, "city"),
        ]);
    }

    #[test]
    fn middle_query_growing() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2        3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city (real indices 4 5 6)
        builder.declare(2..3, 4, &["new", "york", "city"]);

        let enhancer = builder.build();

        assert_replacements(&enhancer, &[
            (0..1, "great"),
            (1..2, "awesome"),
            (2..5, "NYC"),
            (5..6, "subway"),
            (2..3, "new"),
            (3..4, "york"),
            (4..5, "city"),
        ]);
    }

    #[test]
    fn end_query_growing() {
        let query = ["NYC", "subway"];
        //             0        1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // subway = underground train (real indices 2 3)
        builder.declare(1..2, 2, &["underground", "train"]);

        let enhancer = builder.build();

        assert_replacements(&enhancer, &[
            (0..1, "NYC"),
            (1..3, "subway"),
            (1..2, "underground"),
            (2..3, "train"),
        ]);
    }

    #[test]
    fn multiple_growings() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2        3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city (real indices 4 5 6)
        builder.declare(2..3, 4, &["new", "york", "city"]);

        // subway = underground train (real indices 7 8)
        builder.declare(3..4, 7, &["underground", "train"]);

        let enhancer = builder.build();

        assert_replacements(&enhancer, &[
            (0..1, "great"),
            (1..2, "awesome"),
            (2..5, "NYC"),
            (5..7, "subway"),
            (2..3, "new"),
            (3..4, "york"),
            (4..5, "city"),
            (5..6, "underground"),
            (6..7, "train"),
        ]);
    }

    #[test]
    fn multiple_probable_growings() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2        3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city (real indices 4 5 6)
        builder.declare(2..3, 4, &["new", "york", "city"]);

        // subway = underground train (real indices 7 8)
        builder.declare(3..4, 7, &["underground", "train"]);

        // great awesome = good (real index 9)
        builder.declare(0..2, 9, &["good"]);

        // awesome NYC = NY (real index 10)
        builder.declare(1..3, 10, &["NY"]);

        // NYC subway = metro (real index 11)
        builder.declare(2..4, 11, &["metro"]);

        let enhancer = builder.build();

        assert_replacements(&enhancer, &[
            (0..1, "great"),
            (1..2, "awesome"),
            (2..5, "NYC"),
            (5..7, "subway"),
            (2..3, "new"),
            (3..4, "york"),
            (4..5, "city"),
            (5..6, "underground"),
            (6..7, "train"),
            (0..2, "good"),
            (1..5, "NY"),
            (2..5, "metro"),
        ]);
    }
}
|
|
@ -10,7 +10,7 @@ impl Criterion for DocumentId {
|
|||
lhs.id.cmp(&rhs.id)
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
"DocumentId"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,7 +37,7 @@ impl Criterion for Exact {
|
|||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
"Exact"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ mod words_proximity;
|
|||
mod sum_of_words_attribute;
|
||||
mod sum_of_words_position;
|
||||
mod exact;
|
||||
mod sort_by_attr;
|
||||
mod document_id;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
|
@ -16,13 +17,14 @@ pub use self::{
|
|||
sum_of_words_attribute::SumOfWordsAttribute,
|
||||
sum_of_words_position::SumOfWordsPosition,
|
||||
exact::Exact,
|
||||
sort_by_attr::SortByAttr,
|
||||
document_id::DocumentId,
|
||||
};
|
||||
|
||||
pub trait Criterion: Send + Sync {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
|
||||
|
||||
fn name(&self) -> &'static str;
|
||||
fn name(&self) -> &str;
|
||||
|
||||
#[inline]
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
|
@ -35,7 +37,7 @@ impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
|
|||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
(**self).name()
|
||||
}
|
||||
|
||||
|
@ -49,7 +51,7 @@ impl<T: Criterion + ?Sized> Criterion for Box<T> {
|
|||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
(**self).name()
|
||||
}
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ impl Criterion for NumberOfWords {
|
|||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
"NumberOfWords"
|
||||
}
|
||||
}
|
||||
|
|
125
meilidb-core/src/criterion/sort_by_attr.rs
Normal file
125
meilidb-core/src/criterion/sort_by_attr.rs
Normal file
|
@ -0,0 +1,125 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use meilidb_schema::{Schema, SchemaAttr};
|
||||
use crate::criterion::Criterion;
|
||||
use crate::{RawDocument, RankedMap};
|
||||
|
||||
/// An helper struct that permit to sort documents by
|
||||
/// some of their stored attributes.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// If a document cannot be deserialized it will be considered [`None`][].
|
||||
///
|
||||
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
|
||||
/// so you must check the [`Ord`] of `Option` implementation.
|
||||
///
|
||||
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
|
||||
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```ignore
|
||||
/// use serde_derive::Deserialize;
|
||||
/// use meilidb::rank::criterion::*;
|
||||
///
|
||||
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
|
||||
///
|
||||
/// let builder = CriteriaBuilder::with_capacity(8)
|
||||
/// .add(SumOfTypos)
|
||||
/// .add(NumberOfWords)
|
||||
/// .add(WordsProximity)
|
||||
/// .add(SumOfWordsAttribute)
|
||||
/// .add(SumOfWordsPosition)
|
||||
/// .add(Exact)
|
||||
/// .add(custom_ranking)
|
||||
/// .add(DocumentId);
|
||||
///
|
||||
/// let criterion = builder.build();
|
||||
///
|
||||
/// ```
|
||||
pub struct SortByAttr<'a> {
|
||||
ranked_map: &'a RankedMap,
|
||||
attr: SchemaAttr,
|
||||
reversed: bool,
|
||||
}
|
||||
|
||||
impl<'a> SortByAttr<'a> {
|
||||
pub fn lower_is_better(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
SortByAttr::new(ranked_map, schema, attr_name, false)
|
||||
}
|
||||
|
||||
pub fn higher_is_better(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
SortByAttr::new(ranked_map, schema, attr_name, true)
|
||||
}
|
||||
|
||||
fn new(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
reversed: bool,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
let attr = match schema.attribute(attr_name) {
|
||||
Some(attr) => attr,
|
||||
None => return Err(SortByAttrError::AttributeNotFound),
|
||||
};
|
||||
|
||||
if !schema.props(attr).is_ranked() {
|
||||
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
|
||||
}
|
||||
|
||||
Ok(SortByAttr { ranked_map, attr, reversed })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Criterion for SortByAttr<'a> {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = self.ranked_map.get(lhs.id, self.attr);
|
||||
let rhs = self.ranked_map.get(rhs.id, self.attr);
|
||||
|
||||
match (lhs, rhs) {
|
||||
(Some(lhs), Some(rhs)) => {
|
||||
let order = lhs.cmp(&rhs);
|
||||
if self.reversed { order.reverse() } else { order }
|
||||
},
|
||||
(None, Some(_)) => Ordering::Greater,
|
||||
(Some(_), None) => Ordering::Less,
|
||||
(None, None) => Ordering::Equal,
|
||||
}
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SortByAttr"
|
||||
}
|
||||
}
|
||||
|
||||
/// Reasons why a `SortByAttr` criterion cannot be created.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
    /// The schema has no attribute with the requested name.
    AttributeNotFound,
    /// The attribute exists but is not ranked in the schema.
    AttributeNotRegisteredForRanking,
}

impl fmt::Display for SortByAttrError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            SortByAttrError::AttributeNotFound => "attribute not found in the schema",
            SortByAttrError::AttributeNotRegisteredForRanking => {
                "attribute not registered for ranking"
            }
        };

        f.write_str(message)
    }
}

impl Error for SortByAttrError {}
|
|
@ -54,7 +54,7 @@ impl Criterion for SumOfTypos {
|
|||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
"SumOfTypos"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -36,7 +36,7 @@ impl Criterion for SumOfWordsAttribute {
|
|||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
"SumOfWordsAttribute"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -36,7 +36,7 @@ impl Criterion for SumOfWordsPosition {
|
|||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
"SumOfWordsPosition"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -99,7 +99,7 @@ impl Criterion for WordsProximity {
|
|||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
fn name(&self) -> &str {
|
||||
"WordsProximity"
|
||||
}
|
||||
}
|
||||
|
|
177
meilidb-core/src/database.rs
Normal file
177
meilidb-core/src/database.rs
Normal file
|
@ -0,0 +1,177 @@
|
|||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{fs, thread};
|
||||
|
||||
use crossbeam_channel::Receiver;
|
||||
use log::{debug, error};
|
||||
|
||||
use crate::{store, update, Index, MResult};
|
||||
|
||||
pub type BoxUpdateFn = Box<dyn Fn(update::UpdateResult) + Send + Sync + 'static>;
|
||||
type ArcSwapFn = arc_swap::ArcSwapOption<BoxUpdateFn>;
|
||||
|
||||
pub struct Database {
|
||||
pub rkv: Arc<RwLock<rkv::Rkv>>,
|
||||
main_store: rkv::SingleStore,
|
||||
indexes_store: rkv::SingleStore,
|
||||
indexes: RwLock<HashMap<String, (Index, Arc<ArcSwapFn>, thread::JoinHandle<()>)>>,
|
||||
}
|
||||
|
||||
fn update_awaiter(
|
||||
receiver: Receiver<()>,
|
||||
rkv: Arc<RwLock<rkv::Rkv>>,
|
||||
update_fn: Arc<ArcSwapFn>,
|
||||
index: Index,
|
||||
)
|
||||
{
|
||||
for () in receiver {
|
||||
// consume all updates in order (oldest first)
|
||||
loop {
|
||||
let rkv = match rkv.read() {
|
||||
Ok(rkv) => rkv,
|
||||
Err(e) => { error!("rkv RwLock read failed: {}", e); break }
|
||||
};
|
||||
|
||||
let mut writer = match rkv.write() {
|
||||
Ok(writer) => writer,
|
||||
Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break }
|
||||
};
|
||||
|
||||
match update::update_task(&mut writer, index.clone()) {
|
||||
Ok(Some(status)) => {
|
||||
if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) }
|
||||
|
||||
if let Some(ref callback) = *update_fn.load() {
|
||||
(callback)(status);
|
||||
}
|
||||
},
|
||||
// no more updates to handle for now
|
||||
Ok(None) => { debug!("no more updates"); writer.abort(); break },
|
||||
Err(e) => { error!("update task failed: {}", e); writer.abort() },
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn open_or_create(path: impl AsRef<Path>) -> MResult<Database> {
|
||||
let manager = rkv::Manager::singleton();
|
||||
let mut rkv_write = manager.write().unwrap();
|
||||
|
||||
fs::create_dir_all(path.as_ref())?;
|
||||
|
||||
let rkv = rkv_write
|
||||
.get_or_create(path.as_ref(), |path| {
|
||||
let mut builder = rkv::Rkv::environment_builder();
|
||||
builder.set_max_dbs(3000).set_map_size(10 * 1024 * 1024 * 1024); // 10GB
|
||||
rkv::Rkv::from_env(path, builder)
|
||||
})?;
|
||||
|
||||
drop(rkv_write);
|
||||
|
||||
let rkv_read = rkv.read().unwrap();
|
||||
let create_options = rkv::store::Options::create();
|
||||
let main_store = rkv_read.open_single("main", create_options)?;
|
||||
let indexes_store = rkv_read.open_single("indexes", create_options)?;
|
||||
|
||||
// list all indexes that needs to be opened
|
||||
let mut must_open = Vec::new();
|
||||
let reader = rkv_read.read()?;
|
||||
for result in indexes_store.iter_start(&reader)? {
|
||||
let (key, _) = result?;
|
||||
if let Ok(index_name) = std::str::from_utf8(key) {
|
||||
must_open.push(index_name.to_owned());
|
||||
}
|
||||
}
|
||||
|
||||
drop(reader);
|
||||
|
||||
// open the previously aggregated indexes
|
||||
let mut indexes = HashMap::new();
|
||||
for index_name in must_open {
|
||||
|
||||
let (sender, receiver) = crossbeam_channel::bounded(100);
|
||||
let index = store::open(&rkv_read, &index_name, sender.clone())?;
|
||||
let update_fn = Arc::new(ArcSwapFn::empty());
|
||||
|
||||
let rkv_clone = rkv.clone();
|
||||
let index_clone = index.clone();
|
||||
let update_fn_clone = update_fn.clone();
|
||||
|
||||
let handle = thread::spawn(move || {
|
||||
update_awaiter(receiver, rkv_clone, update_fn_clone, index_clone)
|
||||
});
|
||||
|
||||
// send an update notification to make sure that
|
||||
// possible previous boot updates are consumed
|
||||
sender.send(()).unwrap();
|
||||
|
||||
let result = indexes.insert(index_name, (index, update_fn, handle));
|
||||
assert!(result.is_none(), "The index should not have been already open");
|
||||
}
|
||||
|
||||
drop(rkv_read);
|
||||
|
||||
Ok(Database { rkv, main_store, indexes_store, indexes: RwLock::new(indexes) })
|
||||
}
|
||||
|
||||
pub fn open_index(
|
||||
&self,
|
||||
name: impl Into<String>,
|
||||
update_fn: Option<BoxUpdateFn>,
|
||||
) -> MResult<Index>
|
||||
{
|
||||
let indexes_lock = self.indexes.read().unwrap();
|
||||
let name = name.into();
|
||||
|
||||
match indexes_lock.get(&name) {
|
||||
Some((index, old_update_fn, _)) => {
|
||||
old_update_fn.swap(update_fn.map(Arc::new));
|
||||
Ok(index.clone())
|
||||
},
|
||||
None => {
|
||||
drop(indexes_lock);
|
||||
|
||||
let rkv_lock = self.rkv.read().unwrap();
|
||||
let (sender, receiver) = crossbeam_channel::bounded(100);
|
||||
let index = store::create(&rkv_lock, &name, sender)?;
|
||||
|
||||
let mut writer = rkv_lock.write()?;
|
||||
let value = rkv::Value::Blob(&[]);
|
||||
self.indexes_store.put(&mut writer, &name, &value)?;
|
||||
|
||||
{
|
||||
let mut indexes_write = self.indexes.write().unwrap();
|
||||
indexes_write.entry(name).or_insert_with(|| {
|
||||
let rkv_clone = self.rkv.clone();
|
||||
let index_clone = index.clone();
|
||||
|
||||
let update_fn = update_fn.map(Arc::new);
|
||||
let update_fn = Arc::new(ArcSwapFn::new(update_fn));
|
||||
let update_fn_clone = update_fn.clone();
|
||||
|
||||
let handle = thread::spawn(move || {
|
||||
update_awaiter(receiver, rkv_clone, update_fn_clone, index_clone)
|
||||
});
|
||||
|
||||
(index.clone(), update_fn, handle)
|
||||
});
|
||||
}
|
||||
|
||||
writer.commit()?;
|
||||
|
||||
Ok(index)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn indexes_names(&self) -> MResult<Vec<String>> {
|
||||
let indexes = self.indexes.read().unwrap();
|
||||
Ok(indexes.keys().cloned().collect())
|
||||
}
|
||||
|
||||
pub fn main_store(&self) -> rkv::SingleStore {
|
||||
self.main_store
|
||||
}
|
||||
}
|
112
meilidb-core/src/error.rs
Normal file
112
meilidb-core/src/error.rs
Normal file
|
@ -0,0 +1,112 @@
|
|||
use std::{error, fmt, io};
|
||||
use crate::serde::{SerializerError, DeserializerError};
|
||||
|
||||
pub type MResult<T> = Result<T, Error>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Error {
|
||||
Io(io::Error),
|
||||
SchemaDiffer,
|
||||
SchemaMissing,
|
||||
WordIndexMissing,
|
||||
MissingDocumentId,
|
||||
Rkv(rkv::StoreError),
|
||||
Fst(fst::Error),
|
||||
RmpDecode(rmp_serde::decode::Error),
|
||||
RmpEncode(rmp_serde::encode::Error),
|
||||
Bincode(bincode::Error),
|
||||
Serializer(SerializerError),
|
||||
Deserializer(DeserializerError),
|
||||
UnsupportedOperation(UnsupportedOperation),
|
||||
}
|
||||
|
||||
impl From<io::Error> for Error {
|
||||
fn from(error: io::Error) -> Error {
|
||||
Error::Io(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<rkv::StoreError> for Error {
|
||||
fn from(error: rkv::StoreError) -> Error {
|
||||
Error::Rkv(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<fst::Error> for Error {
|
||||
fn from(error: fst::Error) -> Error {
|
||||
Error::Fst(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<rmp_serde::decode::Error> for Error {
|
||||
fn from(error: rmp_serde::decode::Error) -> Error {
|
||||
Error::RmpDecode(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<rmp_serde::encode::Error> for Error {
|
||||
fn from(error: rmp_serde::encode::Error) -> Error {
|
||||
Error::RmpEncode(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<bincode::Error> for Error {
|
||||
fn from(error: bincode::Error) -> Error {
|
||||
Error::Bincode(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SerializerError> for Error {
|
||||
fn from(error: SerializerError) -> Error {
|
||||
Error::Serializer(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DeserializerError> for Error {
|
||||
fn from(error: DeserializerError) -> Error {
|
||||
Error::Deserializer(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<UnsupportedOperation> for Error {
|
||||
fn from(op: UnsupportedOperation) -> Error {
|
||||
Error::UnsupportedOperation(op)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
use self::Error::*;
|
||||
match self {
|
||||
Io(e) => write!(f, "{}", e),
|
||||
SchemaDiffer => write!(f, "schemas differ"),
|
||||
SchemaMissing => write!(f, "this index does not have a schema"),
|
||||
WordIndexMissing => write!(f, "this index does not have a word index"),
|
||||
MissingDocumentId => write!(f, "document id is missing"),
|
||||
Rkv(e) => write!(f, "rkv error; {}", e),
|
||||
Fst(e) => write!(f, "fst error; {}", e),
|
||||
RmpDecode(e) => write!(f, "rmp decode error; {}", e),
|
||||
RmpEncode(e) => write!(f, "rmp encode error; {}", e),
|
||||
Bincode(e) => write!(f, "bincode error; {}", e),
|
||||
Serializer(e) => write!(f, "serializer error; {}", e),
|
||||
Deserializer(e) => write!(f, "deserializer error; {}", e),
|
||||
UnsupportedOperation(op) => write!(f, "unsupported operation; {}", op),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for Error { }
|
||||
|
||||
/// Operations that are refused by the database.
#[derive(Debug)]
pub enum UnsupportedOperation {
    /// The targeted index already has a schema.
    SchemaAlreadyExists,
}

impl fmt::Display for UnsupportedOperation {
    /// Writes the reason why the operation is not supported.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let reason = match self {
            UnsupportedOperation::SchemaAlreadyExists => {
                "Cannot update index which already have a schema"
            },
        };

        f.write_str(reason)
    }
}
|
|
@ -1,25 +1,31 @@
|
|||
#![feature(checked_duration_since)]
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use] extern crate assert_matches;
|
||||
|
||||
mod automaton;
|
||||
mod database;
|
||||
mod distinct_map;
|
||||
mod error;
|
||||
mod number;
|
||||
mod query_builder;
|
||||
mod query_enhancer;
|
||||
mod ranked_map;
|
||||
mod raw_document;
|
||||
mod reordered_attrs;
|
||||
mod store;
|
||||
mod update;
|
||||
pub mod criterion;
|
||||
pub mod raw_indexer;
|
||||
pub mod serde;
|
||||
pub mod store;
|
||||
|
||||
use serde::{Serialize, Deserialize};
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
|
||||
use self::raw_document::raw_documents_from;
|
||||
|
||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
|
||||
pub use self::database::{Database, BoxUpdateFn};
|
||||
pub use self::error::{Error, MResult};
|
||||
pub use self::number::{Number, ParseNumberError};
|
||||
pub use self::ranked_map::RankedMap;
|
||||
pub use self::raw_document::RawDocument;
|
||||
pub use self::store::Store;
|
||||
pub use self::store::Index;
|
||||
pub use self::update::{UpdateStatus, UpdateResult};
|
||||
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
use ::serde::{Serialize, Deserialize};
|
||||
|
||||
/// Represent an internally generated document unique identifier.
|
||||
///
|
||||
|
|
55
meilidb-core/src/number.rs
Normal file
55
meilidb-core/src/number.rs
Normal file
|
@ -0,0 +1,55 @@
|
|||
use std::num::{ParseIntError, ParseFloatError};
|
||||
use std::str::FromStr;
|
||||
use std::fmt;
|
||||
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde::{Serialize, Deserialize};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub enum Number {
|
||||
Unsigned(u64),
|
||||
Signed(i64),
|
||||
Float(OrderedFloat<f64>),
|
||||
}
|
||||
|
||||
impl FromStr for Number {
|
||||
type Err = ParseNumberError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let uint_error = match u64::from_str(s) {
|
||||
Ok(unsigned) => return Ok(Number::Unsigned(unsigned)),
|
||||
Err(error) => error,
|
||||
};
|
||||
|
||||
let int_error = match i64::from_str(s) {
|
||||
Ok(signed) => return Ok(Number::Signed(signed)),
|
||||
Err(error) => error,
|
||||
};
|
||||
|
||||
let float_error = match f64::from_str(s) {
|
||||
Ok(float) => return Ok(Number::Float(OrderedFloat(float))),
|
||||
Err(error) => error,
|
||||
};
|
||||
|
||||
Err(ParseNumberError { uint_error, int_error, float_error })
|
||||
}
|
||||
}
|
||||
|
||||
/// Error returned when a string is neither a valid `u64`, `i64` nor `f64`.
///
/// It keeps the error of each of the three parsing attempts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseNumberError {
    uint_error: ParseIntError,
    int_error: ParseIntError,
    float_error: ParseFloatError,
}

impl fmt::Display for ParseNumberError {
    /// Lists the underlying errors; the integer error is only printed once
    /// when the unsigned and signed attempts failed for the same reason.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.uint_error == self.int_error {
            write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
        } else {
            write!(f, "can not parse number: {}, {}, {}",
                self.uint_error, self.int_error, self.float_error)
        }
    }
}

// Lets the error be boxed as `dyn Error` and composed with other error types.
impl std::error::Error for ParseNumberError {}
|
File diff suppressed because it is too large
Load diff
35
meilidb-core/src/ranked_map.rs
Normal file
35
meilidb-core/src/ranked_map.rs
Normal file
|
@ -0,0 +1,35 @@
|
|||
use std::io::{Read, Write};
|
||||
|
||||
use hashbrown::HashMap;
|
||||
use meilidb_schema::SchemaAttr;
|
||||
|
||||
use crate::{DocumentId, Number};
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq)]
|
||||
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
|
||||
|
||||
impl RankedMap {
|
||||
pub fn len(&self) -> usize {
|
||||
self.0.len()
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) {
|
||||
self.0.insert((document, attribute), number);
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, document: DocumentId, attribute: SchemaAttr) {
|
||||
self.0.remove(&(document, attribute));
|
||||
}
|
||||
|
||||
pub fn get(&self, document: DocumentId, attribute: SchemaAttr) -> Option<Number> {
|
||||
self.0.get(&(document, attribute)).cloned()
|
||||
}
|
||||
|
||||
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<RankedMap> {
|
||||
bincode::deserialize_from(reader).map(RankedMap)
|
||||
}
|
||||
|
||||
pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
|
||||
bincode::serialize_into(writer, &self.0)
|
||||
}
|
||||
}
|
208
meilidb-core/src/raw_indexer.rs
Normal file
208
meilidb-core/src/raw_indexer.rs
Normal file
|
@ -0,0 +1,208 @@
|
|||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::convert::TryFrom;
|
||||
|
||||
use deunicode::deunicode_with_tofu;
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
|
||||
use sdset::SetBuf;
|
||||
|
||||
type Word = Vec<u8>; // TODO make it be a SmallVec
|
||||
|
||||
pub struct RawIndexer {
|
||||
word_limit: usize, // the maximum number of indexed words
|
||||
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
|
||||
docs_words: HashMap<DocumentId, Vec<Word>>,
|
||||
}
|
||||
|
||||
pub struct Indexed {
|
||||
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
|
||||
pub docs_words: HashMap<DocumentId, fst::Set>,
|
||||
}
|
||||
|
||||
impl RawIndexer {
|
||||
pub fn new() -> RawIndexer {
|
||||
RawIndexer::with_word_limit(1000)
|
||||
}
|
||||
|
||||
pub fn with_word_limit(limit: usize) -> RawIndexer {
|
||||
RawIndexer {
|
||||
word_limit: limit,
|
||||
words_doc_indexes: BTreeMap::new(),
|
||||
docs_words: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
|
||||
let lowercase_text = text.to_lowercase();
|
||||
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
|
||||
|
||||
// TODO compute the deunicoded version after the cjk check
|
||||
let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
|
||||
Some(deunicoded)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let iter = Some(lowercase_text).into_iter().chain(next);
|
||||
|
||||
for text in iter {
|
||||
for token in Tokenizer::new(&text) {
|
||||
let must_continue = index_token(
|
||||
token,
|
||||
id,
|
||||
attr,
|
||||
self.word_limit,
|
||||
&mut self.words_doc_indexes,
|
||||
&mut self.docs_words,
|
||||
);
|
||||
|
||||
if !must_continue { break }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
|
||||
where I: IntoIterator<Item=&'a str, IntoIter=IT>,
|
||||
IT: Iterator<Item = &'a str> + Clone,
|
||||
{
|
||||
// TODO serialize this to one call to the SeqTokenizer loop
|
||||
|
||||
let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
|
||||
let iter = lowercased.iter().map(|t| t.as_str());
|
||||
|
||||
for token in SeqTokenizer::new(iter) {
|
||||
let must_continue = index_token(
|
||||
token,
|
||||
id,
|
||||
attr,
|
||||
self.word_limit,
|
||||
&mut self.words_doc_indexes,
|
||||
&mut self.docs_words,
|
||||
);
|
||||
|
||||
if !must_continue { break }
|
||||
}
|
||||
|
||||
let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
|
||||
if lowercase_text.contains(is_cjk) { return lowercase_text }
|
||||
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
|
||||
if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
|
||||
}).collect();
|
||||
let iter = deunicoded.iter().map(|t| t.as_str());
|
||||
|
||||
for token in SeqTokenizer::new(iter) {
|
||||
let must_continue = index_token(
|
||||
token,
|
||||
id,
|
||||
attr,
|
||||
self.word_limit,
|
||||
&mut self.words_doc_indexes,
|
||||
&mut self.docs_words,
|
||||
);
|
||||
|
||||
if !must_continue { break }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(self) -> Indexed {
|
||||
let words_doc_indexes = self.words_doc_indexes
|
||||
.into_iter()
|
||||
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
|
||||
.collect();
|
||||
|
||||
let docs_words = self.docs_words
|
||||
.into_iter()
|
||||
.map(|(id, mut words)| {
|
||||
words.sort_unstable();
|
||||
words.dedup();
|
||||
(id, fst::Set::from_iter(words).unwrap())
|
||||
})
|
||||
.collect();
|
||||
|
||||
Indexed { words_doc_indexes, docs_words }
|
||||
}
|
||||
}
|
||||
|
||||
fn index_token(
|
||||
token: Token,
|
||||
id: DocumentId,
|
||||
attr: SchemaAttr,
|
||||
word_limit: usize,
|
||||
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
|
||||
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
|
||||
) -> bool
|
||||
{
|
||||
if token.word_index >= word_limit { return false }
|
||||
|
||||
match token_to_docindex(id, attr, token) {
|
||||
Some(docindex) => {
|
||||
let word = Vec::from(token.word);
|
||||
words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
|
||||
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
||||
},
|
||||
None => return false,
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
|
||||
let word_index = u16::try_from(token.word_index).ok()?;
|
||||
let char_index = u16::try_from(token.char_index).ok()?;
|
||||
let char_length = u16::try_from(token.word.chars().count()).ok()?;
|
||||
|
||||
let docindex = DocIndex {
|
||||
document_id: id,
|
||||
attribute: attr.0,
|
||||
word_index,
|
||||
char_index,
|
||||
char_length,
|
||||
};
|
||||
|
||||
Some(docindex)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Sentence written with typographic apostrophes (U+2019).
    const TEXT: &str = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";

    /// Checks that the words expected from `TEXT` have been indexed.
    fn assert_expected_words(words_doc_indexes: &BTreeMap<Word, SetBuf<DocIndex>>) {
        for word in &["l", "aspirateur", "ai", "eteindre"] {
            assert!(words_doc_indexes.get(word.as_bytes()).is_some());
        }

        // with the ugly apostrophe...
        assert!(words_doc_indexes.get("l’éteindre".as_bytes()).is_some());
    }

    #[test]
    fn strange_apostrophe() {
        let mut indexer = RawIndexer::new();
        indexer.index_text(DocumentId(0), SchemaAttr(0), TEXT);

        let Indexed { words_doc_indexes, .. } = indexer.build();
        assert_expected_words(&words_doc_indexes);
    }

    #[test]
    fn strange_apostrophe_in_sequence() {
        let mut indexer = RawIndexer::new();
        indexer.index_text_seq(DocumentId(0), SchemaAttr(0), vec![TEXT]);

        let Indexed { words_doc_indexes, .. } = indexer.build();
        assert_expected_words(&words_doc_indexes);
    }
}
|
180
meilidb-core/src/serde/convert_to_number.rs
Normal file
180
meilidb-core/src/serde/convert_to_number.rs
Normal file
|
@ -0,0 +1,180 @@
|
|||
use std::str::FromStr;
|
||||
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde::ser;
|
||||
use serde::Serialize;
|
||||
|
||||
use super::SerializerError;
|
||||
use crate::Number;
|
||||
|
||||
pub struct ConvertToNumber;
|
||||
|
||||
impl ser::Serializer for ConvertToNumber {
|
||||
type Ok = Number;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(u64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_char(self, _value: char) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "char" })
|
||||
}
|
||||
|
||||
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Signed(i64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Signed(i64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Signed(i64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Signed(value))
|
||||
}
|
||||
|
||||
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(u64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(u64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(u64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(value))
|
||||
}
|
||||
|
||||
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Float(OrderedFloat(f64::from(value))))
|
||||
}
|
||||
|
||||
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Float(OrderedFloat(value)))
|
||||
}
|
||||
|
||||
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::from_str(value)?)
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "struct variant" })
|
||||
}
|
||||
}
|
176
meilidb-core/src/serde/convert_to_string.rs
Normal file
176
meilidb-core/src/serde/convert_to_string.rs
Normal file
|
@ -0,0 +1,176 @@
|
|||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use super::SerializerError;
|
||||
|
||||
pub struct ConvertToString;
|
||||
|
||||
impl ser::Serializer for ConvertToString {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "boolean" })
|
||||
}
|
||||
|
||||
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "struct variant" })
|
||||
}
|
||||
}
|
142
meilidb-core/src/serde/deserializer.rs
Normal file
142
meilidb-core/src/serde/deserializer.rs
Normal file
|
@ -0,0 +1,142 @@
|
|||
use std::collections::HashSet;
|
||||
use std::io::Cursor;
|
||||
use std::{fmt, error::Error};
|
||||
|
||||
use meilidb_schema::{Schema, SchemaAttr};
|
||||
use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader};
|
||||
use rmp_serde::decode::{Error as RmpError};
|
||||
use serde::{de, forward_to_deserialize_any};
|
||||
|
||||
use crate::store::DocumentsFields;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum DeserializerError {
|
||||
RmpError(RmpError),
|
||||
RkvError(rkv::StoreError),
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl de::Error for DeserializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
DeserializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeserializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
DeserializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
|
||||
DeserializerError::RkvError(e) => write!(f, "rkv related error: {}", e),
|
||||
DeserializerError::Custom(s) => f.write_str(s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl: relies on the `Debug` and `Display` impls, no `source()` is exposed.
impl Error for DeserializerError {}
|
||||
|
||||
impl From<RmpError> for DeserializerError {
|
||||
fn from(error: RmpError) -> DeserializerError {
|
||||
DeserializerError::RmpError(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<rkv::StoreError> for DeserializerError {
|
||||
fn from(error: rkv::StoreError) -> DeserializerError {
|
||||
DeserializerError::RkvError(error)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Deserializer<'a, R> {
|
||||
pub document_id: DocumentId,
|
||||
pub reader: &'a R,
|
||||
pub documents_fields: DocumentsFields,
|
||||
pub schema: &'a Schema,
|
||||
pub attributes: Option<&'a HashSet<SchemaAttr>>,
|
||||
}
|
||||
|
||||
impl<'de, 'a, 'b, R: 'a> de::Deserializer<'de> for &'b mut Deserializer<'a, R>
|
||||
where R: rkv::Readable,
|
||||
{
|
||||
type Error = DeserializerError;
|
||||
|
||||
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where V: de::Visitor<'de>
|
||||
{
|
||||
self.deserialize_map(visitor)
|
||||
}
|
||||
|
||||
forward_to_deserialize_any! {
|
||||
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
|
||||
bytes byte_buf option unit unit_struct newtype_struct seq tuple
|
||||
tuple_struct struct enum identifier ignored_any
|
||||
}
|
||||
|
||||
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where V: de::Visitor<'de>
|
||||
{
|
||||
let mut error = None;
|
||||
|
||||
let iter = self.documents_fields
|
||||
.document_fields(self.reader, self.document_id)?
|
||||
.filter_map(|result| {
|
||||
let (attr, value) = match result {
|
||||
Ok(value) => value,
|
||||
Err(e) => { error = Some(e); return None },
|
||||
};
|
||||
|
||||
let is_displayed = self.schema.props(attr).is_displayed();
|
||||
if is_displayed && self.attributes.map_or(true, |f| f.contains(&attr)) {
|
||||
let attribute_name = self.schema.attribute_name(attr);
|
||||
Some((attribute_name, Value::new(value)))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
let map_deserializer = de::value::MapDeserializer::new(iter);
|
||||
let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from);
|
||||
|
||||
match error.take() {
|
||||
Some(error) => Err(error.into()),
|
||||
None => result,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Value<A>(RmpDeserializer<ReadReader<Cursor<A>>>) where A: AsRef<[u8]>;
|
||||
|
||||
impl<A> Value<A> where A: AsRef<[u8]>
|
||||
{
|
||||
fn new(value: A) -> Value<A> {
|
||||
Value(RmpDeserializer::new(Cursor::new(value)))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value<A>
|
||||
where A: AsRef<[u8]>,
|
||||
{
|
||||
type Deserializer = Self;
|
||||
|
||||
fn into_deserializer(self) -> Self::Deserializer {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, 'a, A> de::Deserializer<'de> for Value<A>
|
||||
where A: AsRef<[u8]>,
|
||||
{
|
||||
type Error = RmpError;
|
||||
|
||||
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where V: de::Visitor<'de>
|
||||
{
|
||||
self.0.deserialize_any(visitor)
|
||||
}
|
||||
|
||||
forward_to_deserialize_any! {
|
||||
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
|
||||
bytes byte_buf option unit unit_struct newtype_struct seq tuple
|
||||
tuple_struct map struct enum identifier ignored_any
|
||||
}
|
||||
}
|
273
meilidb-core/src/serde/extract_document_id.rs
Normal file
273
meilidb-core/src/serde/extract_document_id.rs
Normal file
|
@ -0,0 +1,273 @@
|
|||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use crate::DocumentId;
|
||||
use serde::{ser, Serialize};
|
||||
use serde_json::Value;
|
||||
use siphasher::sip::SipHasher;
|
||||
|
||||
use super::{SerializerError, ConvertToString};
|
||||
|
||||
pub fn extract_document_id<D>(
|
||||
identifier: &str,
|
||||
document: &D,
|
||||
) -> Result<Option<DocumentId>, SerializerError>
|
||||
where D: serde::Serialize,
|
||||
{
|
||||
let serializer = ExtractDocumentId { identifier };
|
||||
document.serialize(serializer)
|
||||
}
|
||||
|
||||
pub fn value_to_string(value: &Value) -> Option<String> {
|
||||
match value {
|
||||
Value::Null => None,
|
||||
Value::Bool(_) => None,
|
||||
Value::Number(value) => Some(value.to_string()),
|
||||
Value::String(value) => Some(value.to_string()),
|
||||
Value::Array(_) => None,
|
||||
Value::Object(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compute_document_id<H: Hash>(t: H) -> DocumentId {
|
||||
let mut s = SipHasher::new();
|
||||
t.hash(&mut s);
|
||||
let hash = s.finish();
|
||||
DocumentId(hash)
|
||||
}
|
||||
|
||||
struct ExtractDocumentId<'a> {
|
||||
identifier: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
type Ok = Option<DocumentId>;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ExtractDocumentIdMapSerializer<'a>;
|
||||
type SerializeStruct = ExtractDocumentIdStructSerializer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _value: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
let serializer = ExtractDocumentIdMapSerializer {
|
||||
identifier: self.identifier,
|
||||
document_id: None,
|
||||
current_key_name: None,
|
||||
};
|
||||
|
||||
Ok(serializer)
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
let serializer = ExtractDocumentIdStructSerializer {
|
||||
identifier: self.identifier,
|
||||
document_id: None,
|
||||
};
|
||||
|
||||
Ok(serializer)
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExtractDocumentIdMapSerializer<'a> {
|
||||
identifier: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
|
||||
type Ok = Option<DocumentId>;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V
|
||||
) -> Result<(), Self::Error>
|
||||
where K: Serialize, V: Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
|
||||
if self.identifier == key {
|
||||
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
|
||||
match value_to_string(&value).map(|s| compute_document_id(&s)) {
|
||||
Some(document_id) => self.document_id = Some(document_id),
|
||||
None => return Err(SerializerError::InvalidDocumentIdType),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(self.document_id)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExtractDocumentIdStructSerializer<'a> {
|
||||
identifier: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
|
||||
type Ok = Option<DocumentId>;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if self.identifier == key {
|
||||
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
|
||||
match value_to_string(&value).map(compute_document_id) {
|
||||
Some(document_id) => self.document_id = Some(document_id),
|
||||
None => return Err(SerializerError::InvalidDocumentIdType),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(self.document_id)
|
||||
}
|
||||
}
|
336
meilidb-core/src/serde/indexer.rs
Normal file
336
meilidb-core/src/serde/indexer.rs
Normal file
|
@ -0,0 +1,336 @@
|
|||
use meilidb_schema::SchemaAttr;
|
||||
use serde::ser;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::DocumentId;
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use super::{SerializerError, ConvertToString};
|
||||
|
||||
pub struct Indexer<'a> {
|
||||
pub attribute: SchemaAttr,
|
||||
pub indexer: &'a mut RawIndexer,
|
||||
pub document_id: DocumentId,
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for Indexer<'a> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = SeqIndexer<'a>;
|
||||
type SerializeTuple = TupleIndexer<'a>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapIndexer<'a>;
|
||||
type SerializeStruct = StructSerializer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "boolean" })
|
||||
}
|
||||
|
||||
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
|
||||
self.indexer.index_text(self.document_id, self.attribute, text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.indexer.index_text(self.document_id, self.attribute, &text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
let indexer = SeqIndexer {
|
||||
attribute: self.attribute,
|
||||
document_id: self.document_id,
|
||||
indexer: self.indexer,
|
||||
texts: Vec::new(),
|
||||
};
|
||||
|
||||
Ok(indexer)
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
let indexer = TupleIndexer {
|
||||
attribute: self.attribute,
|
||||
document_id: self.document_id,
|
||||
indexer: self.indexer,
|
||||
texts: Vec::new(),
|
||||
};
|
||||
|
||||
Ok(indexer)
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
let indexer = MapIndexer {
|
||||
attribute: self.attribute,
|
||||
document_id: self.document_id,
|
||||
indexer: self.indexer,
|
||||
texts: Vec::new(),
|
||||
};
|
||||
|
||||
Ok(indexer)
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SeqIndexer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapIndexer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
let text = key.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
let key_text = key.to_owned();
|
||||
let value_text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(key_text);
|
||||
self.texts.push(value_text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TupleIndexer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(())
|
||||
}
|
||||
}
|
130
meilidb-core/src/serde/mod.rs
Normal file
130
meilidb-core/src/serde/mod.rs
Normal file
|
@ -0,0 +1,130 @@
|
|||
/// Generates `Serializer` methods that reject a primitive type.
///
/// Each `ty => method` pair expands to
/// `fn method(self, _v: ty) -> Result<Self::Ok, Self::Error>` returning
/// `SerializerError::UnserializableType` whose `type_name` is the name of `ty`.
macro_rules! forward_to_unserializable_type {
    ($($ty:ident => $se_method:ident,)*) => {
        $(
            fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
                // `stringify!` is required: metavariables are not substituted
                // inside string literals, so "$ty" would be reported verbatim.
                Err(SerializerError::UnserializableType { type_name: stringify!($ty) })
            }
        )*
    }
}
|
||||
|
||||
mod convert_to_number;
|
||||
mod convert_to_string;
|
||||
mod deserializer;
|
||||
mod extract_document_id;
|
||||
mod indexer;
|
||||
mod serializer;
|
||||
|
||||
pub use self::deserializer::{Deserializer, DeserializerError};
|
||||
pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string};
|
||||
pub use self::convert_to_string::ConvertToString;
|
||||
pub use self::convert_to_number::ConvertToNumber;
|
||||
pub use self::indexer::Indexer;
|
||||
pub use self::serializer::Serializer;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::{fmt, error::Error};
|
||||
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use rmp_serde::encode::Error as RmpError;
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
use serde::ser;
|
||||
|
||||
use crate::{DocumentId, ParseNumberError};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SerializerError {
|
||||
DocumentIdNotFound,
|
||||
InvalidDocumentIdType,
|
||||
RmpError(RmpError),
|
||||
RkvError(rkv::StoreError),
|
||||
SerdeJsonError(SerdeJsonError),
|
||||
ParseNumberError(ParseNumberError),
|
||||
UnserializableType { type_name: &'static str },
|
||||
UnindexableType { type_name: &'static str },
|
||||
UnrankableType { type_name: &'static str },
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl ser::Error for SerializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
SerializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SerializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SerializerError::DocumentIdNotFound => {
|
||||
write!(f, "serialized document does not have an id according to the schema")
|
||||
},
|
||||
SerializerError::InvalidDocumentIdType => {
|
||||
write!(f, "document identifier can only be of type string or number")
|
||||
},
|
||||
SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
|
||||
SerializerError::RkvError(e) => write!(f, "rkv related error: {}", e),
|
||||
SerializerError::SerdeJsonError(e) => write!(f, "serde json error: {}", e),
|
||||
SerializerError::ParseNumberError(e) => {
|
||||
write!(f, "error while trying to parse a number: {}", e)
|
||||
},
|
||||
SerializerError::UnserializableType { type_name } => {
|
||||
write!(f, "{} is not a serializable type", type_name)
|
||||
},
|
||||
SerializerError::UnindexableType { type_name } => {
|
||||
write!(f, "{} is not an indexable type", type_name)
|
||||
},
|
||||
SerializerError::UnrankableType { type_name } => {
|
||||
write!(f, "{} types can not be used for ranking", type_name)
|
||||
},
|
||||
SerializerError::Custom(s) => f.write_str(s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl: relies on the `Debug` and `Display` impls, no `source()` is exposed.
impl Error for SerializerError {}
|
||||
|
||||
impl From<String> for SerializerError {
|
||||
fn from(value: String) -> SerializerError {
|
||||
SerializerError::Custom(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RmpError> for SerializerError {
|
||||
fn from(error: RmpError) -> SerializerError {
|
||||
SerializerError::RmpError(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SerdeJsonError> for SerializerError {
|
||||
fn from(error: SerdeJsonError) -> SerializerError {
|
||||
SerializerError::SerdeJsonError(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<rkv::StoreError> for SerializerError {
|
||||
fn from(error: rkv::StoreError) -> SerializerError {
|
||||
SerializerError::RkvError(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ParseNumberError> for SerializerError {
|
||||
fn from(error: ParseNumberError) -> SerializerError {
|
||||
SerializerError::ParseNumberError(error)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
|
||||
|
||||
impl RamDocumentStore {
|
||||
pub fn new() -> RamDocumentStore {
|
||||
RamDocumentStore(BTreeMap::new())
|
||||
}
|
||||
|
||||
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
|
||||
self.0.insert((id, attr), value);
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
|
||||
self.0
|
||||
}
|
||||
}
|
288
meilidb-core/src/serde/serializer.rs
Normal file
288
meilidb-core/src/serde/serializer.rs
Normal file
|
@ -0,0 +1,288 @@
|
|||
use meilidb_schema::Schema;
|
||||
use serde::ser;
|
||||
|
||||
use crate::{DocumentId, RankedMap};
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::serde::RamDocumentStore;
|
||||
|
||||
use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};
|
||||
|
||||
pub struct Serializer<'a> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_store: &'a mut RamDocumentStore,
|
||||
pub indexer: &'a mut RawIndexer,
|
||||
pub ranked_map: &'a mut RankedMap,
|
||||
pub document_id: DocumentId,
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for Serializer<'a> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapSerializer<'a>;
|
||||
type SerializeStruct = StructSerializer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(MapSerializer {
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
document_store: self.document_store,
|
||||
indexer: self.indexer,
|
||||
ranked_map: self.ranked_map,
|
||||
current_key_name: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
document_store: self.document_store,
|
||||
indexer: self.indexer,
|
||||
ranked_map: self.ranked_map,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapSerializer<'a> {
|
||||
schema: &'a Schema,
|
||||
document_id: DocumentId,
|
||||
document_store: &'a mut RamDocumentStore,
|
||||
indexer: &'a mut RawIndexer,
|
||||
ranked_map: &'a mut RankedMap,
|
||||
current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeMap for MapSerializer<'a> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V,
|
||||
) -> Result<(), Self::Error>
|
||||
where K: ser::Serialize, V: ser::Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
|
||||
serialize_value(
|
||||
self.schema,
|
||||
self.document_id,
|
||||
self.document_store,
|
||||
self.indexer,
|
||||
self.ranked_map,
|
||||
&key,
|
||||
value,
|
||||
)
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a> {
|
||||
schema: &'a Schema,
|
||||
document_id: DocumentId,
|
||||
document_store: &'a mut RamDocumentStore,
|
||||
indexer: &'a mut RawIndexer,
|
||||
ranked_map: &'a mut RankedMap,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
serialize_value(
|
||||
self.schema,
|
||||
self.document_id,
|
||||
self.document_store,
|
||||
self.indexer,
|
||||
self.ranked_map,
|
||||
key,
|
||||
value,
|
||||
)
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(
|
||||
schema: &Schema,
|
||||
document_id: DocumentId,
|
||||
document_store: &mut RamDocumentStore,
|
||||
indexer: &mut RawIndexer,
|
||||
ranked_map: &mut RankedMap,
|
||||
key: &str,
|
||||
value: &T,
|
||||
) -> Result<(), SerializerError>
|
||||
where T: ser::Serialize,
|
||||
{
|
||||
if let Some(attribute) = schema.attribute(key) {
|
||||
let props = schema.props(attribute);
|
||||
|
||||
let serialized = rmp_serde::to_vec_named(value)?;
|
||||
document_store.set_document_field(document_id, attribute, serialized);
|
||||
|
||||
if props.is_indexed() {
|
||||
let indexer = Indexer { attribute, indexer, document_id };
|
||||
value.serialize(indexer)?;
|
||||
}
|
||||
|
||||
if props.is_ranked() {
|
||||
let number = value.serialize(ConvertToNumber)?;
|
||||
ranked_map.insert(document_id, attribute, number);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -1,34 +0,0 @@
|
|||
use std::error::Error;
|
||||
use fst::Set;
|
||||
use sdset::SetBuf;
|
||||
use crate::DocIndex;
|
||||
|
||||
pub trait Store {
|
||||
type Error: Error;
|
||||
|
||||
fn words(&self) -> Result<&Set, Self::Error>;
|
||||
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error>;
|
||||
|
||||
fn synonyms(&self) -> Result<&Set, Self::Error>;
|
||||
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error>;
|
||||
}
|
||||
|
||||
impl<T> Store for &'_ T where T: Store {
|
||||
type Error = T::Error;
|
||||
|
||||
fn words(&self) -> Result<&Set, Self::Error> {
|
||||
(*self).words()
|
||||
}
|
||||
|
||||
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
|
||||
(*self).word_indexes(word)
|
||||
}
|
||||
|
||||
fn synonyms(&self) -> Result<&Set, Self::Error> {
|
||||
(*self).synonyms()
|
||||
}
|
||||
|
||||
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error> {
|
||||
(*self).alternatives_to(word)
|
||||
}
|
||||
}
|
55
meilidb-core/src/store/docs_words.rs
Normal file
55
meilidb-core/src/store/docs_words.rs
Normal file
|
@ -0,0 +1,55 @@
|
|||
use std::sync::Arc;
|
||||
use rkv::{Value, StoreError};
|
||||
use crate::{DocumentId, MResult};
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocsWords {
|
||||
pub(crate) docs_words: rkv::SingleStore,
|
||||
}
|
||||
|
||||
impl DocsWords {
|
||||
pub fn put_doc_words(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
document_id: DocumentId,
|
||||
words: &fst::Set,
|
||||
) -> Result<(), rkv::StoreError>
|
||||
{
|
||||
let document_id_bytes = document_id.0.to_be_bytes();
|
||||
let bytes = words.as_fst().as_bytes();
|
||||
self.docs_words.put(writer, document_id_bytes, &Value::Blob(bytes))
|
||||
}
|
||||
|
||||
pub fn del_doc_words(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
document_id: DocumentId,
|
||||
) -> Result<bool, rkv::StoreError>
|
||||
{
|
||||
let document_id_bytes = document_id.0.to_be_bytes();
|
||||
match self.docs_words.delete(writer, document_id_bytes) {
|
||||
Ok(()) => Ok(true),
|
||||
Err(StoreError::LmdbError(lmdb::Error::NotFound)) => Ok(false),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn doc_words<T: rkv::Readable>(
|
||||
&self,
|
||||
reader: &T,
|
||||
document_id: DocumentId,
|
||||
) -> MResult<Option<fst::Set>>
|
||||
{
|
||||
let document_id_bytes = document_id.0.to_be_bytes();
|
||||
match self.docs_words.get(reader, document_id_bytes)? {
|
||||
Some(Value::Blob(bytes)) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::from(bytes);
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len)?;
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
127
meilidb-core/src/store/documents_fields.rs
Normal file
127
meilidb-core/src/store/documents_fields.rs
Normal file
|
@ -0,0 +1,127 @@
|
|||
use std::convert::TryFrom;
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocumentsFields {
|
||||
pub(crate) documents_fields: rkv::SingleStore,
|
||||
}
|
||||
|
||||
/// Builds the 10-byte store key of a document field: the big-endian document id
/// (8 bytes) followed by the big-endian attribute (2 bytes).
fn document_attribute_into_key(document_id: DocumentId, attribute: SchemaAttr) -> [u8; 10] {
    let mut key = [0u8; 10];

    let (id_part, attr_part) = key.split_at_mut(8);
    id_part.copy_from_slice(&document_id.0.to_be_bytes());
    attr_part.copy_from_slice(&attribute.0.to_be_bytes());

    key
}
|
||||
|
||||
fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) {
|
||||
let document_id = {
|
||||
let array = TryFrom::try_from(&key[0..8]).unwrap();
|
||||
DocumentId(u64::from_be_bytes(array))
|
||||
};
|
||||
|
||||
let schema_attr = {
|
||||
let array = TryFrom::try_from(&key[8..8+2]).unwrap();
|
||||
SchemaAttr(u16::from_be_bytes(array))
|
||||
};
|
||||
|
||||
(document_id, schema_attr)
|
||||
}
|
||||
|
||||
impl DocumentsFields {
|
||||
pub fn put_document_field(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
value: &[u8],
|
||||
) -> Result<(), rkv::StoreError>
|
||||
{
|
||||
let key = document_attribute_into_key(document_id, attribute);
|
||||
self.documents_fields.put(writer, key, &rkv::Value::Blob(value))
|
||||
}
|
||||
|
||||
pub fn del_all_document_fields(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
document_id: DocumentId,
|
||||
) -> Result<usize, rkv::StoreError>
|
||||
{
|
||||
let document_id_bytes = document_id.0.to_be_bytes();
|
||||
let mut keys_to_delete = Vec::new();
|
||||
|
||||
// WARN we can not delete the keys using the iterator
|
||||
// so we store them and delete them just after
|
||||
let iter = self.documents_fields.iter_from(writer, document_id_bytes)?;
|
||||
for result in iter {
|
||||
let (key, _) = result?;
|
||||
let array = TryFrom::try_from(key).unwrap();
|
||||
let (current_document_id, _) = document_attribute_from_key(array);
|
||||
if current_document_id != document_id { break }
|
||||
|
||||
keys_to_delete.push(key.to_owned());
|
||||
}
|
||||
|
||||
let count = keys_to_delete.len();
|
||||
for key in keys_to_delete {
|
||||
self.documents_fields.delete(writer, key)?;
|
||||
}
|
||||
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
pub fn document_attribute<'a>(
|
||||
&self,
|
||||
reader: &'a impl rkv::Readable,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
) -> Result<Option<&'a [u8]>, rkv::StoreError>
|
||||
{
|
||||
let key = document_attribute_into_key(document_id, attribute);
|
||||
|
||||
match self.documents_fields.get(reader, key)? {
|
||||
Some(rkv::Value::Blob(bytes)) => Ok(Some(bytes)),
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn document_fields<'r, T: rkv::Readable>(
|
||||
&self,
|
||||
reader: &'r T,
|
||||
document_id: DocumentId,
|
||||
) -> Result<DocumentFieldsIter<'r>, rkv::StoreError>
|
||||
{
|
||||
let document_id_bytes = document_id.0.to_be_bytes();
|
||||
let iter = self.documents_fields.iter_from(reader, document_id_bytes)?;
|
||||
Ok(DocumentFieldsIter { document_id, iter })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentFieldsIter<'r> {
|
||||
document_id: DocumentId,
|
||||
iter: rkv::store::single::Iter<'r>,
|
||||
}
|
||||
|
||||
impl<'r> Iterator for DocumentFieldsIter<'r> {
|
||||
type Item = Result<(SchemaAttr, &'r [u8]), rkv::StoreError>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok((key, Some(rkv::Value::Blob(bytes))))) => {
|
||||
let array = TryFrom::try_from(key).unwrap();
|
||||
let (current_document_id, attr) = document_attribute_from_key(array);
|
||||
if current_document_id != self.document_id { return None; }
|
||||
|
||||
Some(Ok((attr, bytes)))
|
||||
},
|
||||
Some(Ok((key, data))) => panic!("{:?}, {:?}", key, data),
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
154
meilidb-core/src/store/main.rs
Normal file
154
meilidb-core/src/store/main.rs
Normal file
|
@ -0,0 +1,154 @@
|
|||
use std::sync::Arc;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use meilidb_schema::Schema;
|
||||
use rkv::Value;
|
||||
use crate::{RankedMap, MResult};
|
||||
|
||||
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
|
||||
const RANKED_MAP_KEY: &str = "ranked-map";
|
||||
const SCHEMA_KEY: &str = "schema";
|
||||
const SYNONYMS_KEY: &str = "synonyms";
|
||||
const WORDS_KEY: &str = "words";
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Main {
|
||||
pub(crate) main: rkv::SingleStore,
|
||||
}
|
||||
|
||||
impl Main {
|
||||
pub fn put_words_fst(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
fst: &fst::Set,
|
||||
) -> Result<(), rkv::StoreError>
|
||||
{
|
||||
let blob = rkv::Value::Blob(fst.as_fst().as_bytes());
|
||||
self.main.put(writer, WORDS_KEY, &blob)
|
||||
}
|
||||
|
||||
pub fn words_fst(
|
||||
&self,
|
||||
reader: &impl rkv::Readable,
|
||||
) -> MResult<Option<fst::Set>>
|
||||
{
|
||||
match self.main.get(reader, WORDS_KEY)? {
|
||||
Some(Value::Blob(bytes)) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::from(bytes);
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len)?;
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_schema(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
schema: &Schema,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let bytes = bincode::serialize(schema)?;
|
||||
let blob = Value::Blob(&bytes[..]);
|
||||
self.main.put(writer, SCHEMA_KEY, &blob)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn schema(
|
||||
&self,
|
||||
reader: &impl rkv::Readable,
|
||||
) -> MResult<Option<Schema>>
|
||||
{
|
||||
match self.main.get(reader, SCHEMA_KEY)? {
|
||||
Some(Value::Blob(bytes)) => {
|
||||
let schema = bincode::deserialize_from(bytes.as_ref())?;
|
||||
Ok(Some(schema))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_ranked_map(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
ranked_map: &RankedMap,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let mut bytes = Vec::new();
|
||||
ranked_map.write_to_bin(&mut bytes)?;
|
||||
let blob = Value::Blob(&bytes[..]);
|
||||
self.main.put(writer, RANKED_MAP_KEY, &blob)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn ranked_map(
|
||||
&self,
|
||||
reader: &impl rkv::Readable,
|
||||
) -> MResult<Option<RankedMap>>
|
||||
{
|
||||
match self.main.get(reader, RANKED_MAP_KEY)? {
|
||||
Some(Value::Blob(bytes)) => {
|
||||
let ranked_map = RankedMap::read_from_bin(bytes)?;
|
||||
Ok(Some(ranked_map))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_synonyms_fst(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
fst: &fst::Set,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let blob = rkv::Value::Blob(fst.as_fst().as_bytes());
|
||||
Ok(self.main.put(writer, SYNONYMS_KEY, &blob)?)
|
||||
}
|
||||
|
||||
pub fn synonyms_fst(
|
||||
&self,
|
||||
reader: &impl rkv::Readable,
|
||||
) -> MResult<Option<fst::Set>>
|
||||
{
|
||||
match self.main.get(reader, SYNONYMS_KEY)? {
|
||||
Some(Value::Blob(bytes)) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::from(bytes);
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len)?;
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_number_of_documents<F: Fn(u64) -> u64>(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
f: F,
|
||||
) -> Result<u64, rkv::StoreError>
|
||||
{
|
||||
let new = self.number_of_documents(writer).map(f)?;
|
||||
self.main.put(writer, NUMBER_OF_DOCUMENTS_KEY, &Value::Blob(&new.to_be_bytes()))?;
|
||||
Ok(new)
|
||||
}
|
||||
|
||||
pub fn number_of_documents(
|
||||
&self,
|
||||
reader: &impl rkv::Readable,
|
||||
) -> Result<u64, rkv::StoreError>
|
||||
{
|
||||
match self.main.get(reader, NUMBER_OF_DOCUMENTS_KEY)? {
|
||||
Some(Value::Blob(bytes)) => {
|
||||
let array = bytes.try_into().unwrap();
|
||||
Ok(u64::from_be_bytes(array))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(0),
|
||||
}
|
||||
}
|
||||
}
|
224
meilidb-core/src/store/mod.rs
Normal file
224
meilidb-core/src/store/mod.rs
Normal file
|
@ -0,0 +1,224 @@
|
|||
mod docs_words;
|
||||
mod documents_fields;
|
||||
mod main;
|
||||
mod postings_lists;
|
||||
mod synonyms;
|
||||
mod updates;
|
||||
mod updates_results;
|
||||
|
||||
pub use self::docs_words::DocsWords;
|
||||
pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter};
|
||||
pub use self::main::Main;
|
||||
pub use self::postings_lists::PostingsLists;
|
||||
pub use self::synonyms::Synonyms;
|
||||
pub use self::updates::Updates;
|
||||
pub use self::updates_results::UpdatesResults;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use meilidb_schema::{Schema, SchemaAttr};
|
||||
use serde::de;
|
||||
use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
|
||||
use crate::serde::Deserializer;
|
||||
|
||||
/// Tells whether the first byte of `bytes` sits at an address that is a multiple of `align`.
///
/// # Panics
///
/// Panics if `align` is `0` (remainder by zero).
fn aligned_to(bytes: &[u8], align: usize) -> bool {
    let address = bytes.as_ptr() as usize;
    address % align == 0
}
|
||||
|
||||
/// Name of the main store of the index called `name`: `store-<name>`.
fn main_name(name: &str) -> String {
    ["store-", name].concat()
}
|
||||
|
||||
/// Name of the postings lists store of the index called `name`:
/// `store-<name>-postings-lists`.
fn postings_lists_name(name: &str) -> String {
    ["store-", name, "-postings-lists"].concat()
}
|
||||
|
||||
/// Name of the documents fields store of the index called `name`:
/// `store-<name>-documents-fields`.
fn documents_fields_name(name: &str) -> String {
    ["store-", name, "-documents-fields"].concat()
}
|
||||
|
||||
/// Name of the synonyms store of the index called `name`: `store-<name>-synonyms`.
fn synonyms_name(name: &str) -> String {
    ["store-", name, "-synonyms"].concat()
}
|
||||
|
||||
/// Name of the docs words store of the index called `name`: `store-<name>-docs-words`.
fn docs_words_name(name: &str) -> String {
    ["store-", name, "-docs-words"].concat()
}
|
||||
|
||||
/// Name of the updates store of the index called `name`: `store-<name>-updates`.
fn updates_name(name: &str) -> String {
    ["store-", name, "-updates"].concat()
}
|
||||
|
||||
/// Name of the updates results store of the index called `name`:
/// `store-<name>-updates-results`.
fn updates_results_name(name: &str) -> String {
    ["store-", name, "-updates-results"].concat()
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Index {
|
||||
pub main: Main,
|
||||
pub postings_lists: PostingsLists,
|
||||
pub documents_fields: DocumentsFields,
|
||||
pub synonyms: Synonyms,
|
||||
pub docs_words: DocsWords,
|
||||
|
||||
pub updates: Updates,
|
||||
pub updates_results: UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
pub fn document<T: de::DeserializeOwned, R: rkv::Readable>(
|
||||
&self,
|
||||
reader: &R,
|
||||
attributes: Option<&HashSet<&str>>,
|
||||
document_id: DocumentId,
|
||||
) -> MResult<Option<T>>
|
||||
{
|
||||
let schema = self.main.schema(reader)?;
|
||||
let schema = schema.ok_or(Error::SchemaMissing)?;
|
||||
|
||||
let attributes = match attributes {
|
||||
Some(attributes) => attributes.into_iter().map(|name| schema.attribute(name)).collect(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let mut deserializer = Deserializer {
|
||||
document_id,
|
||||
reader,
|
||||
documents_fields: self.documents_fields,
|
||||
schema: &schema,
|
||||
attributes: attributes.as_ref(),
|
||||
};
|
||||
|
||||
// TODO: currently we return an error if all document fields are missing,
|
||||
// returning None would have been better
|
||||
Ok(T::deserialize(&mut deserializer).map(Some)?)
|
||||
}
|
||||
|
||||
pub fn document_attribute<T: de::DeserializeOwned, R: rkv::Readable>(
|
||||
&self,
|
||||
reader: &R,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
) -> MResult<Option<T>>
|
||||
{
|
||||
let bytes = self.documents_fields.document_attribute(reader, document_id, attribute)?;
|
||||
match bytes {
|
||||
Some(bytes) => Ok(Some(rmp_serde::from_read_ref(bytes)?)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn schema_update(&self, mut writer: rkv::Writer, schema: Schema) -> MResult<()> {
|
||||
update::push_schema_update(&mut writer, self.updates, self.updates_results, schema)?;
|
||||
writer.commit()?;
|
||||
let _ = self.updates_notifier.send(());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn documents_addition<D>(&self) -> update::DocumentsAddition<D> {
|
||||
update::DocumentsAddition::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn documents_deletion<D>(&self) -> update::DocumentsDeletion {
|
||||
update::DocumentsDeletion::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn synonyms_addition(&self) -> update::SynonymsAddition {
|
||||
update::SynonymsAddition::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn synonyms_deletion(&self) -> update::SynonymsDeletion {
|
||||
update::SynonymsDeletion::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn update_status<T: rkv::Readable>(
|
||||
&self,
|
||||
reader: &T,
|
||||
update_id: u64,
|
||||
) -> MResult<update::UpdateStatus>
|
||||
{
|
||||
update::update_status(
|
||||
reader,
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
update_id,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn query_builder(&self) -> QueryBuilder {
|
||||
QueryBuilder::new(self.main, self.postings_lists, self.synonyms)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create(
|
||||
env: &rkv::Rkv,
|
||||
name: &str,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> Result<Index, rkv::StoreError>
|
||||
{
|
||||
open_options(env, name, rkv::StoreOptions::create(), updates_notifier)
|
||||
}
|
||||
|
||||
pub fn open(
|
||||
env: &rkv::Rkv,
|
||||
name: &str,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> Result<Index, rkv::StoreError>
|
||||
{
|
||||
let mut options = rkv::StoreOptions::default();
|
||||
options.create = false;
|
||||
open_options(env, name, options, updates_notifier)
|
||||
}
|
||||
|
||||
fn open_options(
|
||||
env: &rkv::Rkv,
|
||||
name: &str,
|
||||
options: rkv::StoreOptions,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> Result<Index, rkv::StoreError>
|
||||
{
|
||||
// create all the store names
|
||||
let main_name = main_name(name);
|
||||
let postings_lists_name = postings_lists_name(name);
|
||||
let documents_fields_name = documents_fields_name(name);
|
||||
let synonyms_name = synonyms_name(name);
|
||||
let docs_words_name = docs_words_name(name);
|
||||
let updates_name = updates_name(name);
|
||||
let updates_results_name = updates_results_name(name);
|
||||
|
||||
// open all the stores
|
||||
let main = env.open_single(main_name.as_str(), options)?;
|
||||
let postings_lists = env.open_single(postings_lists_name.as_str(), options)?;
|
||||
let documents_fields = env.open_single(documents_fields_name.as_str(), options)?;
|
||||
let synonyms = env.open_single(synonyms_name.as_str(), options)?;
|
||||
let docs_words = env.open_single(docs_words_name.as_str(), options)?;
|
||||
let updates = env.open_single(updates_name.as_str(), options)?;
|
||||
let updates_results = env.open_single(updates_results_name.as_str(), options)?;
|
||||
|
||||
Ok(Index {
|
||||
main: Main { main },
|
||||
postings_lists: PostingsLists { postings_lists },
|
||||
documents_fields: DocumentsFields { documents_fields },
|
||||
synonyms: Synonyms { synonyms },
|
||||
docs_words: DocsWords { docs_words },
|
||||
updates: Updates { updates },
|
||||
updates_results: UpdatesResults { updates_results },
|
||||
updates_notifier,
|
||||
})
|
||||
}
|
81
meilidb-core/src/store/postings_lists.rs
Normal file
81
meilidb-core/src/store/postings_lists.rs
Normal file
|
@ -0,0 +1,81 @@
|
|||
use std::borrow::Cow;
|
||||
use std::{mem, ptr};
|
||||
|
||||
use zerocopy::{AsBytes, LayoutVerified};
|
||||
use rkv::StoreError;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::store::aligned_to;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct PostingsLists {
|
||||
pub(crate) postings_lists: rkv::SingleStore,
|
||||
}
|
||||
|
||||
impl PostingsLists {
|
||||
pub fn put_postings_list(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
word: &[u8],
|
||||
words_indexes: &[DocIndex],
|
||||
) -> Result<(), rkv::StoreError>
|
||||
{
|
||||
let blob = rkv::Value::Blob(words_indexes.as_bytes());
|
||||
self.postings_lists.put(writer, word, &blob)
|
||||
}
|
||||
|
||||
pub fn del_postings_list(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
word: &[u8],
|
||||
) -> Result<bool, rkv::StoreError>
|
||||
{
|
||||
match self.postings_lists.delete(writer, word) {
|
||||
Ok(()) => Ok(true),
|
||||
Err(StoreError::LmdbError(lmdb::Error::NotFound)) => Ok(false),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn postings_list<'a>(
|
||||
&self,
|
||||
reader: &'a impl rkv::Readable,
|
||||
word: &[u8],
|
||||
) -> Result<Option<Cow<'a, sdset::Set<DocIndex>>>, rkv::StoreError>
|
||||
{
|
||||
let bytes = match self.postings_lists.get(reader, word)? {
|
||||
Some(rkv::Value::Blob(bytes)) => bytes,
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
match LayoutVerified::new_slice(bytes) {
|
||||
Some(layout) => {
|
||||
let set = sdset::Set::new(layout.into_slice()).unwrap();
|
||||
Ok(Some(Cow::Borrowed(set)))
|
||||
},
|
||||
None => {
|
||||
let len = bytes.len();
|
||||
let elem_size = mem::size_of::<DocIndex>();
|
||||
|
||||
// ensure that it is the alignment that is wrong
|
||||
// and the length is valid
|
||||
if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::<DocIndex>()) {
|
||||
let elems = len / elem_size;
|
||||
let mut vec = Vec::<DocIndex>::with_capacity(elems);
|
||||
|
||||
unsafe {
|
||||
let dst = vec.as_mut_ptr() as *mut u8;
|
||||
ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len);
|
||||
vec.set_len(elems);
|
||||
}
|
||||
|
||||
let setbuf = sdset::SetBuf::new(vec).unwrap();
|
||||
return Ok(Some(Cow::Owned(setbuf)))
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
52
meilidb-core/src/store/synonyms.rs
Normal file
52
meilidb-core/src/store/synonyms.rs
Normal file
|
@ -0,0 +1,52 @@
|
|||
use std::sync::Arc;
|
||||
use rkv::StoreError;
|
||||
use crate::error::MResult;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Synonyms {
|
||||
pub(crate) synonyms: rkv::SingleStore,
|
||||
}
|
||||
|
||||
impl Synonyms {
|
||||
pub fn put_synonyms(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
word: &[u8],
|
||||
synonyms: &fst::Set,
|
||||
) -> Result<(), rkv::StoreError>
|
||||
{
|
||||
let blob = rkv::Value::Blob(synonyms.as_fst().as_bytes());
|
||||
self.synonyms.put(writer, word, &blob)
|
||||
}
|
||||
|
||||
pub fn del_synonyms(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
word: &[u8],
|
||||
) -> Result<bool, rkv::StoreError>
|
||||
{
|
||||
match self.synonyms.delete(writer, word) {
|
||||
Ok(()) => Ok(true),
|
||||
Err(StoreError::LmdbError(lmdb::Error::NotFound)) => Ok(false),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn synonyms(
|
||||
&self,
|
||||
reader: &impl rkv::Readable,
|
||||
word: &[u8],
|
||||
) -> MResult<Option<fst::Set>>
|
||||
{
|
||||
match self.synonyms.get(reader, word)? {
|
||||
Some(rkv::Value::Blob(bytes)) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::from(bytes);
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len)?;
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
101
meilidb-core/src/store/updates.rs
Normal file
101
meilidb-core/src/store/updates.rs
Normal file
|
@ -0,0 +1,101 @@
|
|||
use std::convert::TryInto;
|
||||
use rkv::Value;
|
||||
use crate::{update::Update, MResult};
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Updates {
|
||||
pub(crate) updates: rkv::SingleStore,
|
||||
}
|
||||
|
||||
impl Updates {
|
||||
// TODO we should use the MDB_LAST op but
|
||||
// it is not exposed by the rkv library
|
||||
pub fn last_update_id<'a>(
|
||||
&self,
|
||||
reader: &'a impl rkv::Readable,
|
||||
) -> Result<Option<(u64, Option<Value<'a>>)>, rkv::StoreError>
|
||||
{
|
||||
let mut last = None;
|
||||
let iter = self.updates.iter_start(reader)?;
|
||||
for result in iter {
|
||||
let (key, data) = result?;
|
||||
last = Some((key, data));
|
||||
}
|
||||
|
||||
let (last_key, last_data) = match last {
|
||||
Some(entry) => entry,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let array = last_key.try_into().unwrap();
|
||||
let number = u64::from_be_bytes(array);
|
||||
|
||||
Ok(Some((number, last_data)))
|
||||
}
|
||||
|
||||
fn first_update_id<'a>(
|
||||
&self,
|
||||
reader: &'a impl rkv::Readable,
|
||||
) -> Result<Option<(u64, Option<Value<'a>>)>, rkv::StoreError>
|
||||
{
|
||||
let mut iter = self.updates.iter_start(reader)?;
|
||||
let (first_key, first_data) = match iter.next() {
|
||||
Some(result) => result?,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let array = first_key.try_into().unwrap();
|
||||
let number = u64::from_be_bytes(array);
|
||||
|
||||
Ok(Some((number, first_data)))
|
||||
}
|
||||
|
||||
pub fn contains(
|
||||
&self,
|
||||
reader: &impl rkv::Readable,
|
||||
update_id: u64,
|
||||
) -> Result<bool, rkv::StoreError>
|
||||
{
|
||||
let update_id_bytes = update_id.to_be_bytes();
|
||||
self.updates.get(reader, update_id_bytes).map(|v| v.is_some())
|
||||
}
|
||||
|
||||
pub fn put_update(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
update_id: u64,
|
||||
update: &Update,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let update_id_bytes = update_id.to_be_bytes();
|
||||
let update = rmp_serde::to_vec_named(&update)?;
|
||||
let blob = Value::Blob(&update);
|
||||
self.updates.put(writer, update_id_bytes, &blob)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn pop_front(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
) -> MResult<Option<(u64, Update)>>
|
||||
{
|
||||
let (first_id, first_data) = match self.first_update_id(writer)? {
|
||||
Some(entry) => entry,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
match first_data {
|
||||
Some(Value::Blob(bytes)) => {
|
||||
let update = rmp_serde::from_read_ref(&bytes)?;
|
||||
|
||||
// remove it from the database now
|
||||
let first_id_bytes = first_id.to_be_bytes();
|
||||
self.updates.delete(writer, first_id_bytes)?;
|
||||
|
||||
Ok(Some((first_id, update)))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
67
meilidb-core/src/store/updates_results.rs
Normal file
67
meilidb-core/src/store/updates_results.rs
Normal file
|
@ -0,0 +1,67 @@
|
|||
use std::convert::TryInto;
|
||||
use rkv::Value;
|
||||
use crate::{update::UpdateResult, MResult};
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct UpdatesResults {
|
||||
pub(crate) updates_results: rkv::SingleStore,
|
||||
}
|
||||
|
||||
impl UpdatesResults {
|
||||
// TODO we should use the MDB_LAST op but
|
||||
// it is not exposed by the rkv library
|
||||
pub fn last_update_id<'a>(
|
||||
&self,
|
||||
reader: &'a impl rkv::Readable,
|
||||
) -> Result<Option<(u64, Option<Value<'a>>)>, rkv::StoreError>
|
||||
{
|
||||
let mut last = None;
|
||||
let iter = self.updates_results.iter_start(reader)?;
|
||||
for result in iter {
|
||||
let (key, data) = result?;
|
||||
last = Some((key, data));
|
||||
}
|
||||
|
||||
let (last_key, last_data) = match last {
|
||||
Some(entry) => entry,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let array = last_key.try_into().unwrap();
|
||||
let number = u64::from_be_bytes(array);
|
||||
|
||||
Ok(Some((number, last_data)))
|
||||
}
|
||||
|
||||
pub fn put_update_result(
|
||||
&self,
|
||||
writer: &mut rkv::Writer,
|
||||
update_id: u64,
|
||||
update_result: &UpdateResult,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let update_id_bytes = update_id.to_be_bytes();
|
||||
let update_result = bincode::serialize(&update_result)?;
|
||||
let blob = Value::Blob(&update_result);
|
||||
self.updates_results.put(writer, update_id_bytes, &blob)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn update_result(
|
||||
&self,
|
||||
reader: &impl rkv::Readable,
|
||||
update_id: u64,
|
||||
) -> MResult<Option<UpdateResult>>
|
||||
{
|
||||
let update_id_bytes = update_id.to_be_bytes();
|
||||
|
||||
match self.updates_results.get(reader, update_id_bytes)? {
|
||||
Some(Value::Blob(bytes)) => {
|
||||
let update_result = bincode::deserialize(&bytes)?;
|
||||
Ok(Some(update_result))
|
||||
},
|
||||
Some(value) => panic!("invalid type {:?}", value),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
189
meilidb-core/src/update/documents_addition.rs
Normal file
189
meilidb-core/src/update/documents_addition.rs
Normal file
|
@ -0,0 +1,189 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use sdset::{SetOperation, duo::Union};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::serde::{extract_document_id, Serializer, RamDocumentStore};
|
||||
use crate::store;
|
||||
use crate::update::{Update, next_update_id, apply_documents_deletion};
|
||||
use crate::{MResult, Error, RankedMap};
|
||||
|
||||
pub struct DocumentsAddition<D> {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
documents: Vec<D>,
|
||||
}
|
||||
|
||||
impl<D> DocumentsAddition<D> {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> DocumentsAddition<D>
|
||||
{
|
||||
DocumentsAddition {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
documents: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_document(&mut self, document: D) {
|
||||
self.documents.push(document);
|
||||
}
|
||||
|
||||
pub fn finalize(self, mut writer: rkv::Writer) -> MResult<u64>
|
||||
where D: serde::Serialize
|
||||
{
|
||||
let update_id = push_documents_addition(
|
||||
&mut writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.documents,
|
||||
)?;
|
||||
writer.commit()?;
|
||||
let _ = self.updates_notifier.send(());
|
||||
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<D> Extend<D> for DocumentsAddition<D> {
|
||||
fn extend<T: IntoIterator<Item=D>>(&mut self, iter: T) {
|
||||
self.documents.extend(iter)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_documents_addition<D: serde::Serialize>(
|
||||
writer: &mut rkv::Writer,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
addition: Vec<D>,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
let mut values = Vec::with_capacity(addition.len());
|
||||
for add in addition {
|
||||
let vec = rmp_serde::to_vec_named(&add)?;
|
||||
let add = rmp_serde::from_read(&vec[..])?;
|
||||
values.push(add);
|
||||
}
|
||||
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::DocumentsAddition(values);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_documents_addition(
|
||||
writer: &mut rkv::Writer,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
mut ranked_map: RankedMap,
|
||||
addition: Vec<rmpv::Value>,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let mut document_ids = HashSet::new();
|
||||
let mut document_store = RamDocumentStore::new();
|
||||
let mut indexer = RawIndexer::new();
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
let identifier = schema.identifier_name();
|
||||
|
||||
for document in addition {
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
// 1. store the document id for future deletion
|
||||
document_ids.insert(document_id);
|
||||
|
||||
// 2. index the document fields in ram stores
|
||||
let serializer = Serializer {
|
||||
schema: &schema,
|
||||
document_store: &mut document_store,
|
||||
indexer: &mut indexer,
|
||||
ranked_map: &mut ranked_map,
|
||||
document_id,
|
||||
};
|
||||
|
||||
document.serialize(serializer)?;
|
||||
}
|
||||
|
||||
// 1. remove the previous documents match indexes
|
||||
let documents_to_insert = document_ids.iter().cloned().collect();
|
||||
apply_documents_deletion(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
ranked_map.clone(),
|
||||
documents_to_insert,
|
||||
)?;
|
||||
|
||||
// 2. insert new document attributes in the database
|
||||
for ((id, attr), value) in document_store.into_inner() {
|
||||
documents_fields_store.put_document_field(writer, id, attr, &value)?;
|
||||
}
|
||||
|
||||
let indexed = indexer.build();
|
||||
let mut delta_words_builder = SetBuilder::memory();
|
||||
|
||||
for (word, delta_set) in indexed.words_doc_indexes {
|
||||
delta_words_builder.insert(&word).unwrap();
|
||||
|
||||
let set = match postings_lists_store.postings_list(writer, &word)? {
|
||||
Some(set) => Union::new(&set, &delta_set).into_set_buf(),
|
||||
None => delta_set,
|
||||
};
|
||||
|
||||
postings_lists_store.put_postings_list(writer, &word, &set)?;
|
||||
}
|
||||
|
||||
for (id, words) in indexed.docs_words {
|
||||
docs_words_store.put_doc_words(writer, id, &words)?;
|
||||
}
|
||||
|
||||
let delta_words = delta_words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let words = match main_store.words_fst(writer)? {
|
||||
Some(words) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(words.stream())
|
||||
.add(delta_words.stream())
|
||||
.r#union();
|
||||
|
||||
let mut words_builder = SetBuilder::memory();
|
||||
words_builder.extend_stream(op).unwrap();
|
||||
words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
None => delta_words,
|
||||
};
|
||||
|
||||
main_store.put_words_fst(writer, &words)?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
|
||||
let inserted_documents_len = document_ids.len() as u64;
|
||||
main_store.put_number_of_documents(writer, |old| old + inserted_documents_len)?;
|
||||
|
||||
Ok(())
|
||||
}
|
180
meilidb-core/src/update/documents_deletion.rs
Normal file
180
meilidb-core/src/update/documents_deletion.rs
Normal file
|
@ -0,0 +1,180 @@
|
|||
use std::collections::{HashMap, HashSet, BTreeSet};
|
||||
|
||||
use fst::{SetBuilder, Streamer};
|
||||
use meilidb_schema::Schema;
|
||||
use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
|
||||
|
||||
use crate::{DocumentId, RankedMap, MResult, Error};
|
||||
use crate::serde::extract_document_id;
|
||||
use crate::update::{Update, next_update_id};
|
||||
use crate::store;
|
||||
|
||||
pub struct DocumentsDeletion {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
documents: Vec<DocumentId>,
|
||||
}
|
||||
|
||||
impl DocumentsDeletion {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> DocumentsDeletion
|
||||
{
|
||||
DocumentsDeletion {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
documents: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete_document_by_id(&mut self, document_id: DocumentId) {
|
||||
self.documents.push(document_id);
|
||||
}
|
||||
|
||||
pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()>
|
||||
where D: serde::Serialize,
|
||||
{
|
||||
let identifier = schema.identifier_name();
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
self.delete_document_by_id(document_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finalize(self, mut writer: rkv::Writer) -> MResult<u64> {
|
||||
let update_id = push_documents_deletion(
|
||||
&mut writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.documents,
|
||||
)?;
|
||||
writer.commit()?;
|
||||
let _ = self.updates_notifier.send(());
|
||||
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
/// Lets several document ids be marked for deletion at once.
impl Extend<DocumentId> for DocumentsDeletion {
    fn extend<T>(&mut self, iter: T)
    where
        T: IntoIterator<Item = DocumentId>,
    {
        self.documents.extend(iter);
    }
}
|
||||
|
||||
pub fn push_documents_deletion(
|
||||
writer: &mut rkv::Writer,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
deletion: Vec<DocumentId>,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::DocumentsDeletion(deletion);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_documents_deletion(
|
||||
writer: &mut rkv::Writer,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
mut ranked_map: RankedMap,
|
||||
deletion: Vec<DocumentId>,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let idset = SetBuf::from_dirty(deletion);
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
// collect the ranked attributes according to the schema
|
||||
let ranked_attrs: Vec<_> = schema.iter()
|
||||
.filter_map(|(_, attr, prop)| {
|
||||
if prop.is_ranked() { Some(attr) } else { None }
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut words_document_ids = HashMap::new();
|
||||
for id in idset {
|
||||
// remove all the ranked attributes from the ranked_map
|
||||
for ranked_attr in &ranked_attrs {
|
||||
ranked_map.remove(id, *ranked_attr);
|
||||
}
|
||||
|
||||
if let Some(words) = docs_words_store.doc_words(writer, id)? {
|
||||
let mut stream = words.stream();
|
||||
while let Some(word) = stream.next() {
|
||||
let word = word.to_vec();
|
||||
words_document_ids.entry(word).or_insert_with(Vec::new).push(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut deleted_documents = HashSet::new();
|
||||
let mut removed_words = BTreeSet::new();
|
||||
for (word, document_ids) in words_document_ids {
|
||||
let document_ids = SetBuf::from_dirty(document_ids);
|
||||
|
||||
if let Some(doc_indexes) = postings_lists_store.postings_list(writer, &word)? {
|
||||
let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id);
|
||||
let doc_indexes = op.into_set_buf();
|
||||
|
||||
if !doc_indexes.is_empty() {
|
||||
postings_lists_store.put_postings_list(writer, &word, &doc_indexes)?;
|
||||
} else {
|
||||
postings_lists_store.del_postings_list(writer, &word)?;
|
||||
removed_words.insert(word);
|
||||
}
|
||||
}
|
||||
|
||||
for id in document_ids {
|
||||
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
||||
deleted_documents.insert(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let deleted_documents_len = deleted_documents.len() as u64;
|
||||
for id in deleted_documents {
|
||||
docs_words_store.del_doc_words(writer, id)?;
|
||||
}
|
||||
|
||||
let removed_words = fst::Set::from_iter(removed_words).unwrap();
|
||||
let words = match main_store.words_fst(writer)? {
|
||||
Some(words_set) => {
|
||||
let op = fst::set::OpBuilder::new()
|
||||
.add(words_set.stream())
|
||||
.add(removed_words.stream())
|
||||
.difference();
|
||||
|
||||
let mut words_builder = SetBuilder::memory();
|
||||
words_builder.extend_stream(op).unwrap();
|
||||
words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
main_store.put_words_fst(writer, &words)?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
|
||||
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
||||
|
||||
Ok(())
|
||||
}
|
202
meilidb-core/src/update/mod.rs
Normal file
202
meilidb-core/src/update/mod.rs
Normal file
|
@ -0,0 +1,202 @@
|
|||
mod documents_addition;
|
||||
mod documents_deletion;
|
||||
mod schema_update;
|
||||
mod synonyms_addition;
|
||||
mod synonyms_deletion;
|
||||
|
||||
pub use self::documents_addition::{DocumentsAddition, apply_documents_addition};
|
||||
pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion};
|
||||
pub use self::schema_update::{apply_schema_update, push_schema_update};
|
||||
pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition};
|
||||
pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion};
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
use std::collections::BTreeMap;
|
||||
use std::cmp;
|
||||
|
||||
use log::debug;
|
||||
use serde::{Serialize, Deserialize};
|
||||
|
||||
use crate::{store, MResult, DocumentId, RankedMap};
|
||||
use meilidb_schema::Schema;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum Update {
|
||||
SchemaUpdate(Schema),
|
||||
DocumentsAddition(Vec<rmpv::Value>),
|
||||
DocumentsDeletion(Vec<DocumentId>),
|
||||
SynonymsAddition(BTreeMap<String, Vec<String>>),
|
||||
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum UpdateType {
|
||||
SchemaUpdate { schema: Schema },
|
||||
DocumentsAddition { number: usize },
|
||||
DocumentsDeletion { number: usize },
|
||||
SynonymsAddition { number: usize },
|
||||
SynonymsDeletion { number: usize },
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct DetailedDuration {
|
||||
pub main: Duration,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct UpdateResult {
|
||||
pub update_id: u64,
|
||||
pub update_type: UpdateType,
|
||||
pub result: Result<(), String>,
|
||||
pub detailed_duration: DetailedDuration,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum UpdateStatus {
|
||||
Enqueued,
|
||||
Processed(UpdateResult),
|
||||
Unknown,
|
||||
}
|
||||
|
||||
pub fn update_status<T: rkv::Readable>(
|
||||
reader: &T,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
update_id: u64,
|
||||
) -> MResult<UpdateStatus>
|
||||
{
|
||||
match updates_results_store.update_result(reader, update_id)? {
|
||||
Some(result) => Ok(UpdateStatus::Processed(result)),
|
||||
None => {
|
||||
if updates_store.contains(reader, update_id)? {
|
||||
Ok(UpdateStatus::Enqueued)
|
||||
} else {
|
||||
Ok(UpdateStatus::Unknown)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn next_update_id(
|
||||
writer: &mut rkv::Writer,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
let last_update_id = updates_store.last_update_id(writer)?;
|
||||
let last_update_id = last_update_id.map(|(n, _)| n);
|
||||
|
||||
let last_update_results_id = updates_results_store.last_update_id(writer)?;
|
||||
let last_update_results_id = last_update_results_id.map(|(n, _)| n);
|
||||
|
||||
let max_update_id = cmp::max(last_update_id, last_update_results_id);
|
||||
let new_update_id = max_update_id.map_or(0, |n| n + 1);
|
||||
|
||||
Ok(new_update_id)
|
||||
}
|
||||
|
||||
pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult<Option<UpdateResult>> {
|
||||
let (update_id, update) = match index.updates.pop_front(writer)? {
|
||||
Some(value) => value,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
debug!("Processing update number {}", update_id);
|
||||
|
||||
let (update_type, result, duration) = match update {
|
||||
Update::SchemaUpdate(schema) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::SchemaUpdate { schema: schema.clone() };
|
||||
let result = apply_schema_update(writer, index.main, &schema);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
Update::DocumentsAddition(documents) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let ranked_map = match index.main.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let update_type = UpdateType::DocumentsAddition { number: documents.len() };
|
||||
|
||||
let result = apply_documents_addition(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
ranked_map,
|
||||
documents,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
Update::DocumentsDeletion(documents) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let ranked_map = match index.main.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let update_type = UpdateType::DocumentsDeletion { number: documents.len() };
|
||||
|
||||
let result = apply_documents_deletion(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
ranked_map,
|
||||
documents,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
Update::SynonymsAddition(synonyms) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::SynonymsAddition { number: synonyms.len() };
|
||||
|
||||
let result = apply_synonyms_addition(
|
||||
writer,
|
||||
index.main,
|
||||
index.synonyms,
|
||||
synonyms,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
Update::SynonymsDeletion(synonyms) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() };
|
||||
|
||||
let result = apply_synonyms_deletion(
|
||||
writer,
|
||||
index.main,
|
||||
index.synonyms,
|
||||
synonyms,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
};
|
||||
|
||||
debug!("Processed update number {} {:?} {:?}", update_id, update_type, result);
|
||||
|
||||
let detailed_duration = DetailedDuration { main: duration };
|
||||
let status = UpdateResult {
|
||||
update_id,
|
||||
update_type,
|
||||
result: result.map_err(|e| e.to_string()),
|
||||
detailed_duration,
|
||||
};
|
||||
|
||||
index.updates_results.put_update_result(writer, update_id, &status)?;
|
||||
|
||||
Ok(Some(status))
|
||||
}
|
31
meilidb-core/src/update/schema_update.rs
Normal file
31
meilidb-core/src/update/schema_update.rs
Normal file
|
@ -0,0 +1,31 @@
|
|||
use meilidb_schema::Schema;
|
||||
use crate::{store, error::UnsupportedOperation, MResult};
|
||||
use crate::update::{Update, next_update_id};
|
||||
|
||||
pub fn apply_schema_update(
|
||||
writer: &mut rkv::Writer,
|
||||
main_store: store::Main,
|
||||
new_schema: &Schema,
|
||||
) -> MResult<()>
|
||||
{
|
||||
if let Some(_) = main_store.schema(writer)? {
|
||||
return Err(UnsupportedOperation::SchemaAlreadyExists.into())
|
||||
}
|
||||
|
||||
main_store.put_schema(writer, new_schema)
|
||||
}
|
||||
|
||||
pub fn push_schema_update(
|
||||
writer: &mut rkv::Writer,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
schema: Schema,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::SchemaUpdate(schema);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
119
meilidb-core/src/update/synonyms_addition.rs
Normal file
119
meilidb-core/src/update/synonyms_addition.rs
Normal file
|
@ -0,0 +1,119 @@
|
|||
use std::collections::BTreeMap;
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::update::{Update, next_update_id};
|
||||
use crate::{store, MResult};
|
||||
|
||||
pub struct SynonymsAddition {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
synonyms: BTreeMap<String, Vec<String>>,
|
||||
}
|
||||
|
||||
impl SynonymsAddition {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> SynonymsAddition
|
||||
{
|
||||
SynonymsAddition {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
synonyms: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: IntoIterator<Item=T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
|
||||
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
|
||||
}
|
||||
|
||||
pub fn finalize(self, mut writer: rkv::Writer) -> MResult<u64> {
|
||||
let update_id = push_synonyms_addition(
|
||||
&mut writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.synonyms,
|
||||
)?;
|
||||
writer.commit()?;
|
||||
let _ = self.updates_notifier.send(());
|
||||
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_synonyms_addition(
|
||||
writer: &mut rkv::Writer,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
addition: BTreeMap<String, Vec<String>>,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::SynonymsAddition(addition);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_synonyms_addition(
|
||||
writer: &mut rkv::Writer,
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
addition: BTreeMap<String, Vec<String>>,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
|
||||
for (word, alternatives) in addition {
|
||||
synonyms_builder.insert(&word).unwrap();
|
||||
|
||||
let alternatives = {
|
||||
let alternatives = SetBuf::from_dirty(alternatives);
|
||||
let mut alternatives_builder = SetBuilder::memory();
|
||||
alternatives_builder.extend_iter(alternatives).unwrap();
|
||||
let bytes = alternatives_builder.into_inner().unwrap();
|
||||
fst::Set::from_bytes(bytes).unwrap()
|
||||
};
|
||||
|
||||
synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
|
||||
}
|
||||
|
||||
let delta_synonyms = synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let synonyms = match main_store.synonyms_fst(writer)? {
|
||||
Some(synonyms) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(synonyms.stream())
|
||||
.add(delta_synonyms.stream())
|
||||
.r#union();
|
||||
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
synonyms_builder.extend_stream(op).unwrap();
|
||||
synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
None => delta_synonyms,
|
||||
};
|
||||
|
||||
main_store.put_synonyms_fst(writer, &synonyms)?;
|
||||
|
||||
Ok(())
|
||||
}
|
162
meilidb-core/src/update/synonyms_deletion.rs
Normal file
162
meilidb-core/src/update/synonyms_deletion.rs
Normal file
|
@ -0,0 +1,162 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::update::{Update, next_update_id};
|
||||
use crate::{store, MResult};
|
||||
|
||||
pub struct SynonymsDeletion {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
synonyms: BTreeMap<String, Option<Vec<String>>>,
|
||||
}
|
||||
|
||||
impl SynonymsDeletion {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> SynonymsDeletion
|
||||
{
|
||||
SynonymsDeletion {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
synonyms: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete_all_alternatives_of<S: AsRef<str>>(&mut self, synonym: S) {
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
self.synonyms.insert(synonym, None);
|
||||
}
|
||||
|
||||
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: Iterator<Item=T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let value = self.synonyms.entry(synonym).or_insert(None);
|
||||
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
|
||||
match value {
|
||||
Some(v) => v.extend(alternatives),
|
||||
None => *value = Some(Vec::from_iter(alternatives)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finalize(self, mut writer: rkv::Writer) -> MResult<u64> {
|
||||
let update_id = push_synonyms_deletion(
|
||||
&mut writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.synonyms,
|
||||
)?;
|
||||
writer.commit()?;
|
||||
let _ = self.updates_notifier.send(());
|
||||
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_synonyms_deletion(
|
||||
writer: &mut rkv::Writer,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
deletion: BTreeMap<String, Option<Vec<String>>>,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::SynonymsDeletion(deletion);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_synonyms_deletion(
|
||||
writer: &mut rkv::Writer,
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
deletion: BTreeMap<String, Option<Vec<String>>>,
|
||||
) -> MResult<()>
|
||||
{
|
||||
let mut delete_whole_synonym_builder = SetBuilder::memory();
|
||||
|
||||
for (synonym, alternatives) in deletion {
|
||||
match alternatives {
|
||||
Some(alternatives) => {
|
||||
let prev_alternatives = synonyms_store.synonyms(writer, synonym.as_bytes())?;
|
||||
let prev_alternatives = match prev_alternatives {
|
||||
Some(alternatives) => alternatives,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let delta_alternatives = {
|
||||
let alternatives = SetBuf::from_dirty(alternatives);
|
||||
let mut builder = SetBuilder::memory();
|
||||
builder.extend_iter(alternatives).unwrap();
|
||||
builder.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let op = OpBuilder::new()
|
||||
.add(prev_alternatives.stream())
|
||||
.add(delta_alternatives.stream())
|
||||
.difference();
|
||||
|
||||
let (alternatives, empty_alternatives) = {
|
||||
let mut builder = SetBuilder::memory();
|
||||
let len = builder.get_ref().len();
|
||||
builder.extend_stream(op).unwrap();
|
||||
let is_empty = len == builder.get_ref().len();
|
||||
let bytes = builder.into_inner().unwrap();
|
||||
let alternatives = fst::Set::from_bytes(bytes).unwrap();
|
||||
|
||||
(alternatives, is_empty)
|
||||
};
|
||||
|
||||
if empty_alternatives {
|
||||
delete_whole_synonym_builder.insert(synonym.as_bytes())?;
|
||||
} else {
|
||||
synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
delete_whole_synonym_builder.insert(&synonym).unwrap();
|
||||
synonyms_store.del_synonyms(writer, synonym.as_bytes())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let delta_synonyms = delete_whole_synonym_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let synonyms = match main_store.synonyms_fst(writer)? {
|
||||
Some(synonyms) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(synonyms.stream())
|
||||
.add(delta_synonyms.stream())
|
||||
.difference();
|
||||
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
synonyms_builder.extend_stream(op).unwrap();
|
||||
synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
main_store.put_synonyms_fst(writer, &synonyms)?;
|
||||
|
||||
Ok(())
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue