mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Rename MeiliDB into MeiliSearch
This commit is contained in:
parent
58eaf78dc4
commit
7cc096e0a2
94 changed files with 126 additions and 126 deletions
48
meilisearch-core/src/automaton/dfa.rs
Normal file
48
meilisearch-core/src/automaton/dfa.rs
Normal file
|
@ -0,0 +1,48 @@
|
|||
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
|
||||
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
|
||||
static LEVDIST2: OnceCell<LevBuilder> = OnceCell::new();
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
enum PrefixSetting {
|
||||
Prefix,
|
||||
NoPrefix,
|
||||
}
|
||||
|
||||
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
|
||||
use PrefixSetting::{NoPrefix, Prefix};
|
||||
|
||||
match query.len() {
|
||||
0..=4 => {
|
||||
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
}
|
||||
5..=8 => {
|
||||
let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build_prefix_dfa(query: &str) -> DFA {
|
||||
build_dfa_with_setting(query, PrefixSetting::Prefix)
|
||||
}
|
||||
|
||||
pub fn build_dfa(query: &str) -> DFA {
|
||||
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
|
||||
}
|
295
meilisearch-core/src/automaton/mod.rs
Normal file
295
meilisearch-core/src/automaton/mod.rs
Normal file
|
@ -0,0 +1,295 @@
|
|||
mod dfa;
|
||||
mod query_enhancer;
|
||||
|
||||
use std::cmp::Reverse;
|
||||
use std::{cmp, vec};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use levenshtein_automata::DFA;
|
||||
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
||||
|
||||
use crate::error::MResult;
|
||||
use crate::store;
|
||||
|
||||
use self::dfa::{build_dfa, build_prefix_dfa};
|
||||
pub use self::query_enhancer::QueryEnhancer;
|
||||
use self::query_enhancer::QueryEnhancerBuilder;
|
||||
|
||||
const NGRAMS: usize = 3;
|
||||
|
||||
pub struct AutomatonProducer {
|
||||
automatons: Vec<AutomatonGroup>,
|
||||
}
|
||||
|
||||
impl AutomatonProducer {
|
||||
pub fn new(
|
||||
reader: &heed::RoTxn,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
postings_list_store: store::PostingsLists,
|
||||
synonyms_store: store::Synonyms,
|
||||
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
|
||||
let (automatons, query_enhancer) = generate_automatons(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
postings_list_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
|
||||
Ok((AutomatonProducer { automatons }, query_enhancer))
|
||||
}
|
||||
|
||||
pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
|
||||
self.automatons.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct AutomatonGroup {
|
||||
pub is_phrase_query: bool,
|
||||
pub automatons: Vec<Automaton>,
|
||||
}
|
||||
|
||||
impl AutomatonGroup {
|
||||
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
|
||||
AutomatonGroup {
|
||||
is_phrase_query: false,
|
||||
automatons,
|
||||
}
|
||||
}
|
||||
|
||||
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
|
||||
AutomatonGroup {
|
||||
is_phrase_query: true,
|
||||
automatons,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Automaton {
|
||||
pub index: usize,
|
||||
pub ngram: usize,
|
||||
pub query_len: usize,
|
||||
pub is_exact: bool,
|
||||
pub is_prefix: bool,
|
||||
pub query: String,
|
||||
}
|
||||
|
||||
impl Automaton {
|
||||
pub fn dfa(&self) -> DFA {
|
||||
if self.is_prefix {
|
||||
build_prefix_dfa(&self.query)
|
||||
} else {
|
||||
build_dfa(&self.query)
|
||||
}
|
||||
}
|
||||
|
||||
fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
|
||||
Automaton {
|
||||
index,
|
||||
ngram,
|
||||
query_len: query.len(),
|
||||
is_exact: true,
|
||||
is_prefix: false,
|
||||
query: query.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
|
||||
Automaton {
|
||||
index,
|
||||
ngram,
|
||||
query_len: query.len(),
|
||||
is_exact: true,
|
||||
is_prefix: true,
|
||||
query: query.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
|
||||
Automaton {
|
||||
index,
|
||||
ngram,
|
||||
query_len: query.len(),
|
||||
is_exact: false,
|
||||
is_prefix: false,
|
||||
query: query.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn normalize_str(string: &str) -> String {
|
||||
let mut string = string.to_lowercase();
|
||||
|
||||
if !string.contains(is_cjk) {
|
||||
string = deunicode::deunicode_with_tofu(&string, "");
|
||||
}
|
||||
|
||||
string
|
||||
}
|
||||
|
||||
fn split_best_frequency<'a>(
|
||||
reader: &heed::RoTxn,
|
||||
word: &'a str,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
) -> MResult<Option<(&'a str, &'a str)>> {
|
||||
let chars = word.char_indices().skip(1);
|
||||
let mut best = None;
|
||||
|
||||
for (i, _) in chars {
|
||||
let (left, right) = word.split_at(i);
|
||||
|
||||
let left_freq = postings_lists_store
|
||||
.postings_list(reader, left.as_ref())?
|
||||
.map_or(0, |i| i.len());
|
||||
|
||||
let right_freq = postings_lists_store
|
||||
.postings_list(reader, right.as_ref())?
|
||||
.map_or(0, |i| i.len());
|
||||
|
||||
let min_freq = cmp::min(left_freq, right_freq);
|
||||
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||
best = Some((min_freq, left, right));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, l, r)| (l, r)))
|
||||
}
|
||||
|
||||
fn generate_automatons(
|
||||
reader: &heed::RoTxn,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
synonym_store: store::Synonyms,
|
||||
) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||
Some(synonym) => synonym,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
let mut automaton_index = 0;
|
||||
let mut automatons = Vec::new();
|
||||
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
|
||||
|
||||
// We must not declare the original words to the query enhancer
|
||||
// *but* we need to push them in the automatons list first
|
||||
let mut original_automatons = Vec::new();
|
||||
let mut original_words = query_words.iter().peekable();
|
||||
while let Some(word) = original_words.next() {
|
||||
let has_following_word = original_words.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
||||
|
||||
let automaton = if not_prefix_dfa {
|
||||
Automaton::exact(automaton_index, 1, word)
|
||||
} else {
|
||||
Automaton::prefix_exact(automaton_index, 1, word)
|
||||
};
|
||||
automaton_index += 1;
|
||||
original_automatons.push(automaton);
|
||||
}
|
||||
|
||||
automatons.push(AutomatonGroup::normal(original_automatons));
|
||||
|
||||
for n in 1..=NGRAMS {
|
||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||
while let Some((query_index, ngram_slice)) = ngrams.next() {
|
||||
let query_range = query_index..query_index + n;
|
||||
let ngram_nb_words = ngram_slice.len();
|
||||
let ngram = ngram_slice.join(" ");
|
||||
|
||||
let has_following_word = ngrams.peek().is_some();
|
||||
let not_prefix_dfa =
|
||||
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
|
||||
// automaton of synonyms of the ngrams
|
||||
let normalized = normalize_str(&ngram);
|
||||
let lev = if not_prefix_dfa {
|
||||
build_dfa(&normalized)
|
||||
} else {
|
||||
build_prefix_dfa(&normalized)
|
||||
};
|
||||
|
||||
let mut stream = synonyms.search(&lev).into_stream();
|
||||
while let Some(base) = stream.next() {
|
||||
// only trigger alternatives when the last word has been typed
|
||||
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
||||
let base = std::str::from_utf8(base).unwrap();
|
||||
let base_nb_words = split_query_string(base).count();
|
||||
if ngram_nb_words != base_nb_words {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
|
||||
let mut stream = synonyms.into_stream();
|
||||
while let Some(synonyms) = stream.next() {
|
||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
||||
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
|
||||
let nb_synonym_words = synonyms_words.len();
|
||||
|
||||
let real_query_index = automaton_index;
|
||||
enhancer_builder.declare(
|
||||
query_range.clone(),
|
||||
real_query_index,
|
||||
&synonyms_words,
|
||||
);
|
||||
|
||||
for synonym in synonyms_words {
|
||||
let automaton = if nb_synonym_words == 1 {
|
||||
Automaton::exact(automaton_index, n, synonym)
|
||||
} else {
|
||||
Automaton::non_exact(automaton_index, n, synonym)
|
||||
};
|
||||
automaton_index += 1;
|
||||
automatons.push(AutomatonGroup::normal(vec![automaton]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if n == 1 {
|
||||
if let Some((left, right)) =
|
||||
split_best_frequency(reader, &normalized, postings_lists_store)?
|
||||
{
|
||||
let a = Automaton::exact(automaton_index, 1, left);
|
||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||
automaton_index += 1;
|
||||
|
||||
let b = Automaton::exact(automaton_index, 1, right);
|
||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||
automaton_index += 1;
|
||||
|
||||
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
|
||||
}
|
||||
} else {
|
||||
// automaton of concatenation of query words
|
||||
let concat = ngram_slice.concat();
|
||||
let normalized = normalize_str(&concat);
|
||||
|
||||
let real_query_index = automaton_index;
|
||||
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
|
||||
|
||||
let automaton = Automaton::exact(automaton_index, n, &normalized);
|
||||
automaton_index += 1;
|
||||
automatons.push(AutomatonGroup::normal(vec![automaton]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// order automatons, the most important first,
|
||||
// we keep the original automatons at the front.
|
||||
automatons[1..].sort_by_key(|group| {
|
||||
let a = group.automatons.first().unwrap();
|
||||
(
|
||||
Reverse(a.is_exact),
|
||||
a.ngram,
|
||||
Reverse(group.automatons.len()),
|
||||
)
|
||||
});
|
||||
|
||||
Ok((automatons, enhancer_builder.build()))
|
||||
}
|
423
meilisearch-core/src/automaton/query_enhancer.rs
Normal file
423
meilisearch-core/src/automaton/query_enhancer.rs
Normal file
|
@ -0,0 +1,423 @@
|
|||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
use std::ops::Range;
|
||||
|
||||
/// Return `true` if the specified range can accept the given replacements words.
|
||||
/// Returns `false` if the replacements words are already present in the original query
|
||||
/// or if there is fewer replacement words than the range to replace.
|
||||
//
|
||||
//
|
||||
// ## Ignored because already present in original
|
||||
//
|
||||
// new york city subway
|
||||
// -------- ^^^^
|
||||
// / \
|
||||
// [new york city]
|
||||
//
|
||||
//
|
||||
// ## Ignored because smaller than the original
|
||||
//
|
||||
// new york city subway
|
||||
// -------------
|
||||
// \ /
|
||||
// [new york]
|
||||
//
|
||||
//
|
||||
// ## Accepted because bigger than the original
|
||||
//
|
||||
// NYC subway
|
||||
// ---
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// [new york city]
|
||||
//
|
||||
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
||||
where
|
||||
S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
{
|
||||
if words.len() <= range.len() {
|
||||
// there is fewer or equal replacement words
|
||||
// than there is already in the replaced range
|
||||
return false;
|
||||
}
|
||||
|
||||
// retrieve the part to rewrite but with the length
|
||||
// of the replacement part
|
||||
let original = query.iter().skip(range.start).take(words.len());
|
||||
|
||||
// check if the original query doesn't already contain
|
||||
// the replacement words
|
||||
!original
|
||||
.map(AsRef::as_ref)
|
||||
.eq(words.iter().map(AsRef::as_ref))
|
||||
}
|
||||
|
||||
type Origin = usize;
|
||||
type RealLength = usize;
|
||||
|
||||
struct FakeIntervalTree {
|
||||
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl FakeIntervalTree {
|
||||
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
|
||||
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
|
||||
FakeIntervalTree { intervals }
|
||||
}
|
||||
|
||||
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
||||
let element = self.intervals.binary_search_by(|(r, _)| {
|
||||
if point >= r.start {
|
||||
if point < r.end {
|
||||
Equal
|
||||
} else {
|
||||
Less
|
||||
}
|
||||
} else {
|
||||
Greater
|
||||
}
|
||||
});
|
||||
|
||||
let n = match element {
|
||||
Ok(n) => n,
|
||||
Err(n) => n,
|
||||
};
|
||||
|
||||
match self.intervals.get(n) {
|
||||
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct QueryEnhancerBuilder<'a, S> {
|
||||
query: &'a [S],
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
||||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
||||
// we initialize origins query indices based on their positions
|
||||
let origins: Vec<_> = (0..=query.len()).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
|
||||
|
||||
QueryEnhancerBuilder {
|
||||
query,
|
||||
origins,
|
||||
real_to_origin,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the final real to origin query indices mapping.
|
||||
///
|
||||
/// `range` is the original words range that this `replacement` words replace
|
||||
/// and `real` is the first real query index of these replacement words.
|
||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
||||
where
|
||||
T: AsRef<str>,
|
||||
{
|
||||
// check if the range of original words
|
||||
// can be rewritten with the replacement words
|
||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
||||
// this range can be replaced so we need to
|
||||
// modify the origins accordingly
|
||||
let offset = replacement.len() - range.len();
|
||||
|
||||
let previous_padding = self.origins[range.end - 1];
|
||||
let current_offset = (self.origins[range.end] - 1) - previous_padding;
|
||||
let diff = offset.saturating_sub(current_offset);
|
||||
self.origins[range.end] += diff;
|
||||
|
||||
for r in &mut self.origins[range.end + 1..] {
|
||||
*r += diff;
|
||||
}
|
||||
}
|
||||
|
||||
// we need to store the real number and origins relations
|
||||
// this way it will be possible to know by how many
|
||||
// we need to pad real query indices
|
||||
let real_range = real..real + replacement.len().max(range.len());
|
||||
let real_length = replacement.len();
|
||||
self.real_to_origin
|
||||
.push((real_range, (range.start, real_length)));
|
||||
}
|
||||
|
||||
pub fn build(self) -> QueryEnhancer {
|
||||
QueryEnhancer {
|
||||
origins: self.origins,
|
||||
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct QueryEnhancer {
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: FakeIntervalTree,
|
||||
}
|
||||
|
||||
impl QueryEnhancer {
|
||||
/// Returns the query indices to use to replace this real query index.
|
||||
pub fn replacement(&self, real: u32) -> Range<u32> {
|
||||
let real = real as usize;
|
||||
|
||||
// query the fake interval tree with the real query index
|
||||
let (range, (origin, real_length)) = self
|
||||
.real_to_origin
|
||||
.query(real)
|
||||
.expect("real has never been declared");
|
||||
|
||||
// if `real` is the end bound of the range
|
||||
if (range.start + real_length - 1) == real {
|
||||
let mut count = range.len();
|
||||
let mut new_origin = origin;
|
||||
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
|
||||
let len = slice[1] - slice[0];
|
||||
count = count.saturating_sub(len);
|
||||
if count == 0 {
|
||||
new_origin = origin + i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let n = real - range.start;
|
||||
let start = self.origins[origin];
|
||||
let end = self.origins[new_origin + 1];
|
||||
let remaining = (end - start) - n;
|
||||
|
||||
Range {
|
||||
start: (start + n) as u32,
|
||||
end: (start + n + remaining) as u32,
|
||||
}
|
||||
} else {
|
||||
// just return the origin along with
|
||||
// the real position of the word
|
||||
let n = real as usize - range.start;
|
||||
let origin = self.origins[origin];
|
||||
|
||||
Range {
|
||||
start: (origin + n) as u32,
|
||||
end: (origin + n + 1) as u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn original_unmodified() {
|
||||
let query = ["new", "york", "city", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(2), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(3), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn simple_growing() {
|
||||
let query = ["new", "york", "subway"];
|
||||
// 0 1 2
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 3, &["new", "york", "city"]);
|
||||
// ^ 3 4 5
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(1), 1..3); // york
|
||||
assert_eq!(enhancer.replacement(2), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(3), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(4), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(5), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn same_place_growings() {
|
||||
let query = ["NY", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NY = new york
|
||||
builder.declare(0..1, 2, &["new", "york"]);
|
||||
// ^ 2 3
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// NY = NYC
|
||||
builder.declare(0..1, 7, &["NYC"]);
|
||||
// ^ 7
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 8, &["new", "york", "city"]);
|
||||
// ^ 8 9 10
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(1..2, 11, &["underground", "train"]);
|
||||
// ^ 11 12
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..3); // NY
|
||||
assert_eq!(enhancer.replacement(1), 3..5); // subway
|
||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(3), 1..3); // york
|
||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(7), 0..3); // NYC
|
||||
assert_eq!(enhancer.replacement(8), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(9), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(10), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(11), 3..4); // underground
|
||||
assert_eq!(enhancer.replacement(12), 4..5); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bigger_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(0..1, 2, &["new", "york", "city"]);
|
||||
// ^ 2 3 4
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..3); // NYC
|
||||
assert_eq!(enhancer.replacement(1), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(3), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn middle_query_growing() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..6); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn end_query_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(1..2, 2, &["underground", "train"]);
|
||||
// ^ 2 3
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // NYC
|
||||
assert_eq!(enhancer.replacement(1), 1..3); // subway
|
||||
assert_eq!(enhancer.replacement(2), 1..2); // underground
|
||||
assert_eq!(enhancer.replacement(3), 2..3); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_probable_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
// great awesome = good
|
||||
builder.declare(0..2, 9, &["good"]);
|
||||
// ^ 9
|
||||
|
||||
// awesome NYC = NY
|
||||
builder.declare(1..3, 10, &["NY"]);
|
||||
// ^^ 10
|
||||
|
||||
// NYC subway = metro
|
||||
builder.declare(2..4, 11, &["metro"]);
|
||||
// ^^ 11
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
||||
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
||||
}
|
||||
}
|
16
meilisearch-core/src/criterion/document_id.rs
Normal file
16
meilisearch-core/src/criterion/document_id.rs
Normal file
|
@ -0,0 +1,16 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct DocumentId;
|
||||
|
||||
impl Criterion for DocumentId {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
lhs.id.cmp(&rhs.id)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"DocumentId"
|
||||
}
|
||||
}
|
132
meilisearch-core/src/criterion/exact.rs
Normal file
132
meilisearch-core/src/criterion/exact.rs
Normal file
|
@ -0,0 +1,132 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use meilisearch_schema::SchemaAttr;
|
||||
use sdset::Set;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
#[inline]
|
||||
fn number_exact_matches(
|
||||
query_index: &[u32],
|
||||
attribute: &[u16],
|
||||
is_exact: &[bool],
|
||||
fields_counts: &Set<(SchemaAttr, u64)>,
|
||||
) -> usize {
|
||||
let mut count = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
let len = group.len();
|
||||
|
||||
let mut found_exact = false;
|
||||
for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
|
||||
if *is_exact {
|
||||
found_exact = true;
|
||||
let attr = &attribute[index + pos];
|
||||
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
|
||||
let (_, count) = fields_counts[pos];
|
||||
if count == 1 {
|
||||
return usize::max_value();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
count += found_exact as usize;
|
||||
index += len;
|
||||
}
|
||||
|
||||
count
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Exact;
|
||||
|
||||
impl Criterion for Exact {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let is_exact = lhs.is_exact();
|
||||
let attribute = lhs.attribute();
|
||||
let fields_counts = &lhs.fields_counts;
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let is_exact = rhs.is_exact();
|
||||
let attribute = rhs.attribute();
|
||||
let fields_counts = &rhs.fields_counts;
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"Exact"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// typing: "soulier"
|
||||
//
|
||||
// doc0: "Soulier bleu"
|
||||
// doc1: "souliereres rouge"
|
||||
#[test]
|
||||
fn easy_case() {
|
||||
let doc0 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[false];
|
||||
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
|
||||
// typing: "soulier"
|
||||
//
|
||||
// doc0: { 0. "soulier" }
|
||||
// doc1: { 0. "soulier bleu et blanc" }
|
||||
#[test]
|
||||
fn basic() {
|
||||
let doc0 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
}
|
121
meilisearch-core/src/criterion/mod.rs
Normal file
121
meilisearch-core/src/criterion/mod.rs
Normal file
|
@ -0,0 +1,121 @@
|
|||
mod document_id;
|
||||
mod exact;
|
||||
mod number_of_words;
|
||||
mod sort_by_attr;
|
||||
mod sum_of_typos;
|
||||
mod sum_of_words_attribute;
|
||||
mod sum_of_words_position;
|
||||
mod words_proximity;
|
||||
|
||||
use crate::RawDocument;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
pub use self::{
|
||||
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
|
||||
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
|
||||
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
|
||||
words_proximity::WordsProximity,
|
||||
};
|
||||
|
||||
pub trait Criterion: Send + Sync {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
|
||||
|
||||
fn name(&self) -> &str;
|
||||
|
||||
#[inline]
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
self.evaluate(lhs, rhs) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
(**self).name()
|
||||
}
|
||||
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
(**self).eq(lhs, rhs)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Criterion + ?Sized> Criterion for Box<T> {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
(**self).name()
|
||||
}
|
||||
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
(**self).eq(lhs, rhs)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct CriteriaBuilder<'a> {
|
||||
inner: Vec<Box<dyn Criterion + 'a>>,
|
||||
}
|
||||
|
||||
impl<'a> CriteriaBuilder<'a> {
|
||||
pub fn new() -> CriteriaBuilder<'a> {
|
||||
CriteriaBuilder { inner: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
|
||||
CriteriaBuilder {
|
||||
inner: Vec::with_capacity(capacity),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reserve(&mut self, additional: usize) {
|
||||
self.inner.reserve(additional)
|
||||
}
|
||||
|
||||
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
|
||||
where
|
||||
C: Criterion,
|
||||
{
|
||||
self.push(criterion);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push<C: 'a>(&mut self, criterion: C)
|
||||
where
|
||||
C: Criterion,
|
||||
{
|
||||
self.inner.push(Box::new(criterion));
|
||||
}
|
||||
|
||||
pub fn build(self) -> Criteria<'a> {
|
||||
Criteria { inner: self.inner }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Criteria<'a> {
|
||||
inner: Vec<Box<dyn Criterion + 'a>>,
|
||||
}
|
||||
|
||||
impl<'a> Default for Criteria<'a> {
|
||||
fn default() -> Self {
|
||||
CriteriaBuilder::with_capacity(7)
|
||||
.add(SumOfTypos)
|
||||
.add(NumberOfWords)
|
||||
.add(WordsProximity)
|
||||
.add(SumOfWordsAttribute)
|
||||
.add(SumOfWordsPosition)
|
||||
.add(Exact)
|
||||
.add(DocumentId)
|
||||
.build()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
|
||||
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
|
||||
&self.inner
|
||||
}
|
||||
}
|
31
meilisearch-core/src/criterion/number_of_words.rs
Normal file
31
meilisearch-core/src/criterion/number_of_words.rs
Normal file
|
@ -0,0 +1,31 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn number_of_query_words(query_index: &[u32]) -> usize {
|
||||
query_index.linear_group().count()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct NumberOfWords;
|
||||
|
||||
impl Criterion for NumberOfWords {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
number_of_query_words(query_index)
|
||||
};
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
number_of_query_words(query_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"NumberOfWords"
|
||||
}
|
||||
}
|
130
meilisearch-core/src/criterion/sort_by_attr.rs
Normal file
130
meilisearch-core/src/criterion/sort_by_attr.rs
Normal file
|
@ -0,0 +1,130 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use crate::criterion::Criterion;
|
||||
use crate::{RankedMap, RawDocument};
|
||||
use meilisearch_schema::{Schema, SchemaAttr};
|
||||
|
||||
/// An helper struct that permit to sort documents by
|
||||
/// some of their stored attributes.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// If a document cannot be deserialized it will be considered [`None`][].
|
||||
///
|
||||
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
|
||||
/// so you must check the [`Ord`] of `Option` implementation.
|
||||
///
|
||||
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
|
||||
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```ignore
|
||||
/// use serde_derive::Deserialize;
|
||||
/// use meilisearch::rank::criterion::*;
|
||||
///
|
||||
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
|
||||
///
|
||||
/// let builder = CriteriaBuilder::with_capacity(8)
|
||||
/// .add(SumOfTypos)
|
||||
/// .add(NumberOfWords)
|
||||
/// .add(WordsProximity)
|
||||
/// .add(SumOfWordsAttribute)
|
||||
/// .add(SumOfWordsPosition)
|
||||
/// .add(Exact)
|
||||
/// .add(custom_ranking)
|
||||
/// .add(DocumentId);
|
||||
///
|
||||
/// let criterion = builder.build();
|
||||
///
|
||||
/// ```
|
||||
pub struct SortByAttr<'a> {
|
||||
ranked_map: &'a RankedMap,
|
||||
attr: SchemaAttr,
|
||||
reversed: bool,
|
||||
}
|
||||
|
||||
impl<'a> SortByAttr<'a> {
|
||||
pub fn lower_is_better(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||
SortByAttr::new(ranked_map, schema, attr_name, false)
|
||||
}
|
||||
|
||||
pub fn higher_is_better(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||
SortByAttr::new(ranked_map, schema, attr_name, true)
|
||||
}
|
||||
|
||||
fn new(
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
reversed: bool,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||
let attr = match schema.attribute(attr_name) {
|
||||
Some(attr) => attr,
|
||||
None => return Err(SortByAttrError::AttributeNotFound),
|
||||
};
|
||||
|
||||
if !schema.props(attr).is_ranked() {
|
||||
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
|
||||
}
|
||||
|
||||
Ok(SortByAttr {
|
||||
ranked_map,
|
||||
attr,
|
||||
reversed,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Criterion for SortByAttr<'a> {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = self.ranked_map.get(lhs.id, self.attr);
|
||||
let rhs = self.ranked_map.get(rhs.id, self.attr);
|
||||
|
||||
match (lhs, rhs) {
|
||||
(Some(lhs), Some(rhs)) => {
|
||||
let order = lhs.cmp(&rhs);
|
||||
if self.reversed {
|
||||
order.reverse()
|
||||
} else {
|
||||
order
|
||||
}
|
||||
}
|
||||
(None, Some(_)) => Ordering::Greater,
|
||||
(Some(_), None) => Ordering::Less,
|
||||
(None, None) => Ordering::Equal,
|
||||
}
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SortByAttr"
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum SortByAttrError {
|
||||
AttributeNotFound,
|
||||
AttributeNotRegisteredForRanking,
|
||||
}
|
||||
|
||||
impl fmt::Display for SortByAttrError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
use SortByAttrError::*;
|
||||
match self {
|
||||
AttributeNotFound => f.write_str("attribute not found in the schema"),
|
||||
AttributeNotRegisteredForRanking => f.write_str("attribute not registered for ranking"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SortByAttrError {}
|
116
meilisearch-core/src/criterion/sum_of_typos.rs
Normal file
116
meilisearch-core/src/criterion/sum_of_typos.rs
Normal file
|
@ -0,0 +1,116 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
// This function is a wrong logarithmic 10 function.
|
||||
// It is safe to panic on input number higher than 3,
|
||||
// the number of typos is never bigger than that.
|
||||
#[inline]
|
||||
fn custom_log10(n: u8) -> f32 {
|
||||
match n {
|
||||
0 => 0.0, // log(1)
|
||||
1 => 0.30102, // log(2)
|
||||
2 => 0.47712, // log(3)
|
||||
3 => 0.60205, // log(4)
|
||||
_ => panic!("invalid number"),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
|
||||
let mut number_words: usize = 0;
|
||||
let mut sum_typos = 0.0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_typos += custom_log10(distance[index]);
|
||||
number_words += 1;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfTypos;
|
||||
|
||||
impl Criterion for SumOfTypos {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let distance = lhs.distance();
|
||||
sum_matches_typos(query_index, distance)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let distance = rhs.distance();
|
||||
sum_matches_typos(query_index, distance)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SumOfTypos"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// typing: "Geox CEO"
|
||||
//
|
||||
// doc0: "Geox SpA: CEO and Executive"
|
||||
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
|
||||
#[test]
|
||||
fn one_typo_reference() {
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 0];
|
||||
|
||||
let query_index1 = &[0, 1];
|
||||
let distance1 = &[1, 0];
|
||||
|
||||
let doc0 = sum_matches_typos(query_index0, distance0);
|
||||
let doc1 = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
|
||||
// typing: "bouton manchette"
|
||||
//
|
||||
// doc0: "bouton manchette"
|
||||
// doc1: "bouton"
|
||||
#[test]
|
||||
fn no_typo() {
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 0];
|
||||
|
||||
let query_index1 = &[0];
|
||||
let distance1 = &[0];
|
||||
|
||||
let doc0 = sum_matches_typos(query_index0, distance0);
|
||||
let doc1 = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
|
||||
// typing: "bouton manchztte"
|
||||
//
|
||||
// doc0: "bouton manchette"
|
||||
// doc1: "bouton"
|
||||
#[test]
|
||||
fn one_typo() {
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 1];
|
||||
|
||||
let query_index1 = &[0];
|
||||
let distance1 = &[0];
|
||||
|
||||
let doc0 = sum_matches_typos(query_index0, distance0);
|
||||
let doc1 = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
}
|
64
meilisearch-core/src/criterion/sum_of_words_attribute.rs
Normal file
64
meilisearch-core/src/criterion/sum_of_words_attribute.rs
Normal file
|
@ -0,0 +1,64 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
||||
let mut sum_attributes = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_attributes += attribute[index] as usize;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
sum_attributes
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfWordsAttribute;
|
||||
|
||||
impl Criterion for SumOfWordsAttribute {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let attribute = lhs.attribute();
|
||||
sum_matches_attributes(query_index, attribute)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let attribute = rhs.attribute();
|
||||
sum_matches_attributes(query_index, attribute)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SumOfWordsAttribute"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// typing: "soulier"
|
||||
//
|
||||
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
|
||||
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
|
||||
#[test]
|
||||
fn title_vs_description() {
|
||||
let query_index0 = &[0];
|
||||
let attribute0 = &[0];
|
||||
|
||||
let query_index1 = &[0];
|
||||
let attribute1 = &[1];
|
||||
|
||||
let doc0 = sum_matches_attributes(query_index0, attribute0);
|
||||
let doc1 = sum_matches_attributes(query_index1, attribute1);
|
||||
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
|
||||
}
|
||||
}
|
64
meilisearch-core/src/criterion/sum_of_words_position.rs
Normal file
64
meilisearch-core/src/criterion/sum_of_words_position.rs
Normal file
|
@ -0,0 +1,64 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
||||
let mut sum_word_index = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_word_index += word_index[index] as usize;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
sum_word_index
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfWordsPosition;
|
||||
|
||||
impl Criterion for SumOfWordsPosition {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let word_index = lhs.word_index();
|
||||
sum_matches_attribute_index(query_index, word_index)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let word_index = rhs.word_index();
|
||||
sum_matches_attribute_index(query_index, word_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SumOfWordsPosition"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// typing: "soulier"
|
||||
//
|
||||
// doc0: "Soulier bleu"
|
||||
// doc1: "Botte rouge et soulier noir"
|
||||
#[test]
|
||||
fn easy_case() {
|
||||
let query_index0 = &[0];
|
||||
let word_index0 = &[0];
|
||||
|
||||
let query_index1 = &[0];
|
||||
let word_index1 = &[3];
|
||||
|
||||
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
|
||||
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
|
||||
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
|
||||
}
|
||||
}
|
164
meilisearch-core/src/criterion/words_proximity.rs
Normal file
164
meilisearch-core/src/criterion/words_proximity.rs
Normal file
|
@ -0,0 +1,164 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::{self, Ordering};
|
||||
|
||||
const MAX_DISTANCE: u16 = 8;
|
||||
|
||||
#[inline]
|
||||
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
|
||||
(a.clone(), b.clone())
|
||||
}
|
||||
|
||||
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
|
||||
if lhs < rhs {
|
||||
cmp::min(rhs - lhs, MAX_DISTANCE)
|
||||
} else {
|
||||
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
|
||||
}
|
||||
}
|
||||
|
||||
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
|
||||
if lattr != rattr {
|
||||
return MAX_DISTANCE;
|
||||
}
|
||||
index_proximity(lwi, rwi)
|
||||
}
|
||||
|
||||
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
|
||||
let mut min_prox = u16::max_value();
|
||||
|
||||
for a in lattr.iter().zip(lwi) {
|
||||
for b in rattr.iter().zip(rwi) {
|
||||
let a = clone_tuple(a);
|
||||
let b = clone_tuple(b);
|
||||
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
|
||||
}
|
||||
}
|
||||
|
||||
min_prox
|
||||
}
|
||||
|
||||
fn matches_proximity(
|
||||
query_index: &[u32],
|
||||
distance: &[u8],
|
||||
attribute: &[u16],
|
||||
word_index: &[u16],
|
||||
) -> u16 {
|
||||
let mut query_index_groups = query_index.linear_group();
|
||||
let mut proximity = 0;
|
||||
let mut index = 0;
|
||||
|
||||
let get_attr_wi = |index: usize, group_len: usize| {
|
||||
// retrieve the first distance group (with the lowest values)
|
||||
let len = distance[index..index + group_len]
|
||||
.linear_group()
|
||||
.next()
|
||||
.unwrap()
|
||||
.len();
|
||||
|
||||
let rattr = &attribute[index..index + len];
|
||||
let rwi = &word_index[index..index + len];
|
||||
|
||||
(rattr, rwi)
|
||||
};
|
||||
|
||||
let mut last = query_index_groups.next().map(|group| {
|
||||
let attr_wi = get_attr_wi(index, group.len());
|
||||
index += group.len();
|
||||
attr_wi
|
||||
});
|
||||
|
||||
// iter by windows of size 2
|
||||
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
|
||||
let attr_wi = get_attr_wi(index, rhs.len());
|
||||
proximity += min_proximity(lhs, attr_wi);
|
||||
last = Some(attr_wi);
|
||||
index += rhs.len();
|
||||
}
|
||||
|
||||
proximity
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct WordsProximity;
|
||||
|
||||
impl Criterion for WordsProximity {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let distance = lhs.distance();
|
||||
let attribute = lhs.attribute();
|
||||
let word_index = lhs.word_index();
|
||||
matches_proximity(query_index, distance, attribute, word_index)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let distance = rhs.distance();
|
||||
let attribute = rhs.attribute();
|
||||
let word_index = rhs.word_index();
|
||||
matches_proximity(query_index, distance, attribute, word_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"WordsProximity"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn three_different_attributes() {
|
||||
// "soup" "of the" "the day"
|
||||
//
|
||||
// { id: 0, attr: 0, attr_index: 0 }
|
||||
// { id: 1, attr: 1, attr_index: 0 }
|
||||
// { id: 2, attr: 1, attr_index: 1 }
|
||||
// { id: 2, attr: 2, attr_index: 0 }
|
||||
// { id: 3, attr: 3, attr_index: 1 }
|
||||
|
||||
let query_index = &[0, 1, 2, 2, 3];
|
||||
let distance = &[0, 0, 0, 0, 0];
|
||||
let attribute = &[0, 1, 1, 2, 3];
|
||||
let word_index = &[0, 0, 1, 0, 1];
|
||||
|
||||
// soup -> of = 8
|
||||
// + of -> the = 1
|
||||
// + the -> day = 8 (not 1)
|
||||
assert_eq!(
|
||||
matches_proximity(query_index, distance, attribute, word_index),
|
||||
17
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn two_different_attributes() {
|
||||
// "soup day" "soup of the day"
|
||||
//
|
||||
// { id: 0, attr: 0, attr_index: 0 }
|
||||
// { id: 0, attr: 1, attr_index: 0 }
|
||||
// { id: 1, attr: 1, attr_index: 1 }
|
||||
// { id: 2, attr: 1, attr_index: 2 }
|
||||
// { id: 3, attr: 0, attr_index: 1 }
|
||||
// { id: 3, attr: 1, attr_index: 3 }
|
||||
|
||||
let query_index = &[0, 0, 1, 2, 3, 3];
|
||||
let distance = &[0, 0, 0, 0, 0, 0];
|
||||
let attribute = &[0, 1, 1, 1, 0, 1];
|
||||
let word_index = &[0, 0, 1, 2, 1, 3];
|
||||
|
||||
// soup -> of = 1
|
||||
// + of -> the = 1
|
||||
// + the -> day = 1
|
||||
assert_eq!(
|
||||
matches_proximity(query_index, distance, attribute, word_index),
|
||||
3
|
||||
);
|
||||
}
|
||||
}
|
1011
meilisearch-core/src/database.rs
Normal file
1011
meilisearch-core/src/database.rs
Normal file
File diff suppressed because it is too large
Load diff
103
meilisearch-core/src/distinct_map.rs
Normal file
103
meilisearch-core/src/distinct_map.rs
Normal file
|
@ -0,0 +1,103 @@
|
|||
use hashbrown::HashMap;
|
||||
use std::hash::Hash;
|
||||
|
||||
pub struct DistinctMap<K> {
|
||||
inner: HashMap<K, usize>,
|
||||
limit: usize,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq> DistinctMap<K> {
|
||||
pub fn new(limit: usize) -> Self {
|
||||
DistinctMap {
|
||||
inner: HashMap::new(),
|
||||
limit,
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BufferedDistinctMap<'a, K> {
|
||||
internal: &'a mut DistinctMap<K>,
|
||||
inner: HashMap<K, usize>,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> {
|
||||
pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> {
|
||||
BufferedDistinctMap {
|
||||
internal,
|
||||
inner: HashMap::new(),
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn register(&mut self, key: K) -> bool {
|
||||
let internal_seen = self.internal.inner.get(&key).unwrap_or(&0);
|
||||
let inner_seen = self.inner.entry(key).or_insert(0);
|
||||
let seen = *internal_seen + *inner_seen;
|
||||
|
||||
if seen < self.internal.limit {
|
||||
*inner_seen += 1;
|
||||
self.len += 1;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub fn register_without_key(&mut self) -> bool {
|
||||
self.len += 1;
|
||||
true
|
||||
}
|
||||
|
||||
pub fn transfert_to_internal(&mut self) {
|
||||
for (k, v) in self.inner.drain() {
|
||||
let value = self.internal.inner.entry(k).or_insert(0);
|
||||
*value += v;
|
||||
}
|
||||
|
||||
self.internal.len += self.len;
|
||||
self.len = 0;
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.internal.len() + self.len
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn easy_distinct_map() {
|
||||
let mut map = DistinctMap::new(2);
|
||||
let mut buffered = BufferedDistinctMap::new(&mut map);
|
||||
|
||||
for x in &[1, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6] {
|
||||
buffered.register(x);
|
||||
}
|
||||
buffered.transfert_to_internal();
|
||||
assert_eq!(map.len(), 8);
|
||||
|
||||
let mut map = DistinctMap::new(2);
|
||||
let mut buffered = BufferedDistinctMap::new(&mut map);
|
||||
assert_eq!(buffered.register(1), true);
|
||||
assert_eq!(buffered.register(1), true);
|
||||
assert_eq!(buffered.register(1), false);
|
||||
assert_eq!(buffered.register(1), false);
|
||||
|
||||
assert_eq!(buffered.register(2), true);
|
||||
assert_eq!(buffered.register(3), true);
|
||||
assert_eq!(buffered.register(2), true);
|
||||
assert_eq!(buffered.register(2), false);
|
||||
|
||||
buffered.transfert_to_internal();
|
||||
assert_eq!(map.len(), 5);
|
||||
}
|
||||
}
|
117
meilisearch-core/src/error.rs
Normal file
117
meilisearch-core/src/error.rs
Normal file
|
@ -0,0 +1,117 @@
|
|||
use crate::serde::{DeserializerError, SerializerError};
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
use std::{error, fmt, io};
|
||||
|
||||
pub type MResult<T> = Result<T, Error>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Error {
|
||||
Io(io::Error),
|
||||
IndexAlreadyExists,
|
||||
SchemaDiffer,
|
||||
SchemaMissing,
|
||||
WordIndexMissing,
|
||||
MissingDocumentId,
|
||||
Zlmdb(heed::Error),
|
||||
Fst(fst::Error),
|
||||
SerdeJson(SerdeJsonError),
|
||||
Bincode(bincode::Error),
|
||||
Serializer(SerializerError),
|
||||
Deserializer(DeserializerError),
|
||||
UnsupportedOperation(UnsupportedOperation),
|
||||
}
|
||||
|
||||
impl From<io::Error> for Error {
|
||||
fn from(error: io::Error) -> Error {
|
||||
Error::Io(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<heed::Error> for Error {
|
||||
fn from(error: heed::Error) -> Error {
|
||||
Error::Zlmdb(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<fst::Error> for Error {
|
||||
fn from(error: fst::Error) -> Error {
|
||||
Error::Fst(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SerdeJsonError> for Error {
|
||||
fn from(error: SerdeJsonError) -> Error {
|
||||
Error::SerdeJson(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<bincode::Error> for Error {
|
||||
fn from(error: bincode::Error) -> Error {
|
||||
Error::Bincode(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SerializerError> for Error {
|
||||
fn from(error: SerializerError) -> Error {
|
||||
Error::Serializer(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DeserializerError> for Error {
|
||||
fn from(error: DeserializerError) -> Error {
|
||||
Error::Deserializer(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<UnsupportedOperation> for Error {
|
||||
fn from(op: UnsupportedOperation) -> Error {
|
||||
Error::UnsupportedOperation(op)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
use self::Error::*;
|
||||
match self {
|
||||
Io(e) => write!(f, "{}", e),
|
||||
IndexAlreadyExists => write!(f, "index already exists"),
|
||||
SchemaDiffer => write!(f, "schemas differ"),
|
||||
SchemaMissing => write!(f, "this index does not have a schema"),
|
||||
WordIndexMissing => write!(f, "this index does not have a word index"),
|
||||
MissingDocumentId => write!(f, "document id is missing"),
|
||||
Zlmdb(e) => write!(f, "heed error; {}", e),
|
||||
Fst(e) => write!(f, "fst error; {}", e),
|
||||
SerdeJson(e) => write!(f, "serde json error; {}", e),
|
||||
Bincode(e) => write!(f, "bincode error; {}", e),
|
||||
Serializer(e) => write!(f, "serializer error; {}", e),
|
||||
Deserializer(e) => write!(f, "deserializer error; {}", e),
|
||||
UnsupportedOperation(op) => write!(f, "unsupported operation; {}", op),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for Error {}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum UnsupportedOperation {
|
||||
SchemaAlreadyExists,
|
||||
CannotUpdateSchemaIdentifier,
|
||||
CannotReorderSchemaAttribute,
|
||||
CanOnlyIntroduceNewSchemaAttributesAtEnd,
|
||||
CannotRemoveSchemaAttribute,
|
||||
}
|
||||
|
||||
impl fmt::Display for UnsupportedOperation {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
use self::UnsupportedOperation::*;
|
||||
match self {
|
||||
SchemaAlreadyExists => write!(f, "Cannot update index which already have a schema"),
|
||||
CannotUpdateSchemaIdentifier => write!(f, "Cannot update the identifier of a schema"),
|
||||
CannotReorderSchemaAttribute => write!(f, "Cannot reorder the attributes of a schema"),
|
||||
CanOnlyIntroduceNewSchemaAttributesAtEnd => {
|
||||
write!(f, "Can only introduce new attributes at end of a schema")
|
||||
}
|
||||
CannotRemoveSchemaAttribute => write!(f, "Cannot remove attributes from a schema"),
|
||||
}
|
||||
}
|
||||
}
|
134
meilisearch-core/src/levenshtein.rs
Normal file
134
meilisearch-core/src/levenshtein.rs
Normal file
|
@ -0,0 +1,134 @@
|
|||
use std::cmp::min;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::{Index, IndexMut};
|
||||
|
||||
// A simple wrapper around vec so we can get contiguous but index it like it's 2D array.
|
||||
struct N2Array<T> {
|
||||
y_size: usize,
|
||||
buf: Vec<T>,
|
||||
}
|
||||
|
||||
impl<T: Clone> N2Array<T> {
|
||||
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
|
||||
N2Array {
|
||||
y_size: y,
|
||||
buf: vec![value; x * y],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Index<(usize, usize)> for N2Array<T> {
|
||||
type Output = T;
|
||||
|
||||
#[inline]
|
||||
fn index(&self, (x, y): (usize, usize)) -> &T {
|
||||
&self.buf[(x * self.y_size) + y]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
|
||||
#[inline]
|
||||
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
|
||||
&mut self.buf[(x * self.y_size) + y]
|
||||
}
|
||||
}
|
||||
|
||||
pub fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
|
||||
let (n, m) = (source.len(), target.len());
|
||||
|
||||
assert!(
|
||||
n <= m,
|
||||
"the source string must be shorter than the target one"
|
||||
);
|
||||
|
||||
if n == 0 {
|
||||
return (m as u32, 0);
|
||||
}
|
||||
if m == 0 {
|
||||
return (n as u32, 0);
|
||||
}
|
||||
|
||||
if n == m && source == target {
|
||||
return (0, m);
|
||||
}
|
||||
|
||||
let inf = n + m;
|
||||
let mut matrix = N2Array::new(n + 2, m + 2, 0);
|
||||
|
||||
matrix[(0, 0)] = inf;
|
||||
for i in 0..n + 1 {
|
||||
matrix[(i + 1, 0)] = inf;
|
||||
matrix[(i + 1, 1)] = i;
|
||||
}
|
||||
for j in 0..m + 1 {
|
||||
matrix[(0, j + 1)] = inf;
|
||||
matrix[(1, j + 1)] = j;
|
||||
}
|
||||
|
||||
let mut last_row = BTreeMap::new();
|
||||
|
||||
for (row, char_s) in source.iter().enumerate() {
|
||||
let mut last_match_col = 0;
|
||||
let row = row + 1;
|
||||
|
||||
for (col, char_t) in target.iter().enumerate() {
|
||||
let col = col + 1;
|
||||
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
|
||||
let cost = if char_s == char_t { 0 } else { 1 };
|
||||
|
||||
let dist_add = matrix[(row, col + 1)] + 1;
|
||||
let dist_del = matrix[(row + 1, col)] + 1;
|
||||
let dist_sub = matrix[(row, col)] + cost;
|
||||
let dist_trans = matrix[(last_match_row, last_match_col)]
|
||||
+ (row - last_match_row - 1)
|
||||
+ 1
|
||||
+ (col - last_match_col - 1);
|
||||
|
||||
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
|
||||
|
||||
matrix[(row + 1, col + 1)] = dist;
|
||||
|
||||
if cost == 0 {
|
||||
last_match_col = col;
|
||||
}
|
||||
}
|
||||
|
||||
last_row.insert(char_s, row);
|
||||
}
|
||||
|
||||
let mut minimum = (u32::max_value(), 0);
|
||||
|
||||
for x in n..=m {
|
||||
let dist = matrix[(n + 1, x + 1)] as u32;
|
||||
if dist < minimum.0 {
|
||||
minimum = (dist, x)
|
||||
}
|
||||
}
|
||||
|
||||
minimum
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn matched_length() {
|
||||
let query = "Levenste";
|
||||
let text = "Levenshtein";
|
||||
|
||||
let (dist, length) = prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
|
||||
assert_eq!(dist, 1);
|
||||
assert_eq!(&text[..length], "Levenshte");
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn matched_length_panic() {
|
||||
let query = "Levenshtein";
|
||||
let text = "Levenste";
|
||||
|
||||
// this function will panic if source if longer than target
|
||||
prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
|
||||
}
|
||||
}
|
97
meilisearch-core/src/lib.rs
Normal file
97
meilisearch-core/src/lib.rs
Normal file
|
@ -0,0 +1,97 @@
|
|||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate assert_matches;
|
||||
|
||||
mod automaton;
|
||||
pub mod criterion;
|
||||
mod database;
|
||||
mod distinct_map;
|
||||
mod error;
|
||||
mod levenshtein;
|
||||
mod number;
|
||||
mod query_builder;
|
||||
mod ranked_map;
|
||||
mod raw_document;
|
||||
pub mod raw_indexer;
|
||||
mod reordered_attrs;
|
||||
pub mod serde;
|
||||
pub mod store;
|
||||
mod update;
|
||||
|
||||
pub use self::database::{BoxUpdateFn, Database};
|
||||
pub use self::error::{Error, MResult};
|
||||
pub use self::number::{Number, ParseNumberError};
|
||||
pub use self::ranked_map::RankedMap;
|
||||
pub use self::raw_document::RawDocument;
|
||||
pub use self::store::Index;
|
||||
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
|
||||
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
|
||||
|
||||
#[doc(hidden)]
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct TmpMatch {
|
||||
pub query_index: u32,
|
||||
pub distance: u8,
|
||||
pub attribute: u16,
|
||||
pub word_index: u16,
|
||||
pub is_exact: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct Document {
|
||||
pub id: DocumentId,
|
||||
pub highlights: Vec<Highlight>,
|
||||
|
||||
#[cfg(test)]
|
||||
pub matches: Vec<TmpMatch>,
|
||||
}
|
||||
|
||||
impl Document {
|
||||
#[cfg(not(test))]
|
||||
fn from_raw(raw: RawDocument) -> Document {
|
||||
Document {
|
||||
id: raw.id,
|
||||
highlights: raw.highlights,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn from_raw(raw: RawDocument) -> Document {
|
||||
let len = raw.query_index().len();
|
||||
let mut matches = Vec::with_capacity(len);
|
||||
|
||||
let query_index = raw.query_index();
|
||||
let distance = raw.distance();
|
||||
let attribute = raw.attribute();
|
||||
let word_index = raw.word_index();
|
||||
let is_exact = raw.is_exact();
|
||||
|
||||
for i in 0..len {
|
||||
let match_ = TmpMatch {
|
||||
query_index: query_index[i],
|
||||
distance: distance[i],
|
||||
attribute: attribute[i],
|
||||
word_index: word_index[i],
|
||||
is_exact: is_exact[i],
|
||||
};
|
||||
matches.push(match_);
|
||||
}
|
||||
|
||||
Document {
|
||||
id: raw.id,
|
||||
matches,
|
||||
highlights: raw.highlights,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::mem;
|
||||
|
||||
#[test]
|
||||
fn docindex_mem_size() {
|
||||
assert_eq!(mem::size_of::<DocIndex>(), 16);
|
||||
}
|
||||
}
|
110
meilisearch-core/src/number.rs
Normal file
110
meilisearch-core/src/number.rs
Normal file
|
@ -0,0 +1,110 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::str::FromStr;
|
||||
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Copy, Clone, Hash)]
|
||||
pub enum Number {
|
||||
Unsigned(u64),
|
||||
Signed(i64),
|
||||
Float(OrderedFloat<f64>),
|
||||
}
|
||||
|
||||
impl FromStr for Number {
|
||||
type Err = ParseNumberError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let uint_error = match u64::from_str(s) {
|
||||
Ok(unsigned) => return Ok(Number::Unsigned(unsigned)),
|
||||
Err(error) => error,
|
||||
};
|
||||
|
||||
let int_error = match i64::from_str(s) {
|
||||
Ok(signed) => return Ok(Number::Signed(signed)),
|
||||
Err(error) => error,
|
||||
};
|
||||
|
||||
let float_error = match f64::from_str(s) {
|
||||
Ok(float) => return Ok(Number::Float(OrderedFloat(float))),
|
||||
Err(error) => error,
|
||||
};
|
||||
|
||||
Err(ParseNumberError {
|
||||
uint_error,
|
||||
int_error,
|
||||
float_error,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Number {
|
||||
fn eq(&self, other: &Number) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Number {}
|
||||
|
||||
impl PartialOrd for Number {
|
||||
fn partial_cmp(&self, other: &Number) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Number {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
use Number::{Float, Signed, Unsigned};
|
||||
|
||||
match (*self, *other) {
|
||||
(Unsigned(a), Unsigned(b)) => a.cmp(&b),
|
||||
(Unsigned(a), Signed(b)) => {
|
||||
if b < 0 {
|
||||
Ordering::Greater
|
||||
} else {
|
||||
a.cmp(&(b as u64))
|
||||
}
|
||||
}
|
||||
(Unsigned(a), Float(b)) => (OrderedFloat(a as f64)).cmp(&b),
|
||||
(Signed(a), Unsigned(b)) => {
|
||||
if a < 0 {
|
||||
Ordering::Less
|
||||
} else {
|
||||
(a as u64).cmp(&b)
|
||||
}
|
||||
}
|
||||
(Signed(a), Signed(b)) => a.cmp(&b),
|
||||
(Signed(a), Float(b)) => OrderedFloat(a as f64).cmp(&b),
|
||||
(Float(a), Unsigned(b)) => a.cmp(&OrderedFloat(b as f64)),
|
||||
(Float(a), Signed(b)) => a.cmp(&OrderedFloat(b as f64)),
|
||||
(Float(a), Float(b)) => a.cmp(&b),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ParseNumberError {
|
||||
uint_error: ParseIntError,
|
||||
int_error: ParseIntError,
|
||||
float_error: ParseFloatError,
|
||||
}
|
||||
|
||||
impl fmt::Display for ParseNumberError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
if self.uint_error == self.int_error {
|
||||
write!(
|
||||
f,
|
||||
"can not parse number: {}, {}",
|
||||
self.uint_error, self.float_error
|
||||
)
|
||||
} else {
|
||||
write!(
|
||||
f,
|
||||
"can not parse number: {}, {}, {}",
|
||||
self.uint_error, self.int_error, self.float_error
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
1825
meilisearch-core/src/query_builder.rs
Normal file
1825
meilisearch-core/src/query_builder.rs
Normal file
File diff suppressed because it is too large
Load diff
398
meilisearch-core/src/query_enhancer.rs
Normal file
398
meilisearch-core/src/query_enhancer.rs
Normal file
|
@ -0,0 +1,398 @@
|
|||
use std::ops::Range;
|
||||
use std::cmp::Ordering::{Less, Greater, Equal};
|
||||
|
||||
/// Return `true` if the specified range can accept the given replacements words.
|
||||
/// Returns `false` if the replacements words are already present in the original query
|
||||
/// or if there is fewer replacement words than the range to replace.
|
||||
//
|
||||
//
|
||||
// ## Ignored because already present in original
|
||||
//
|
||||
// new york city subway
|
||||
// -------- ^^^^
|
||||
// / \
|
||||
// [new york city]
|
||||
//
|
||||
//
|
||||
// ## Ignored because smaller than the original
|
||||
//
|
||||
// new york city subway
|
||||
// -------------
|
||||
// \ /
|
||||
// [new york]
|
||||
//
|
||||
//
|
||||
// ## Accepted because bigger than the original
|
||||
//
|
||||
// NYC subway
|
||||
// ---
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// / \
|
||||
// [new york city]
|
||||
//
|
||||
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
{
|
||||
if words.len() <= range.len() {
|
||||
// there is fewer or equal replacement words
|
||||
// than there is already in the replaced range
|
||||
return false
|
||||
}
|
||||
|
||||
// retrieve the part to rewrite but with the length
|
||||
// of the replacement part
|
||||
let original = query.iter().skip(range.start).take(words.len());
|
||||
|
||||
// check if the original query doesn't already contain
|
||||
// the replacement words
|
||||
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
|
||||
}
|
||||
|
||||
type Origin = usize;
|
||||
type RealLength = usize;
|
||||
|
||||
struct FakeIntervalTree {
|
||||
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl FakeIntervalTree {
|
||||
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
|
||||
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
|
||||
FakeIntervalTree { intervals }
|
||||
}
|
||||
|
||||
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
||||
let element = self.intervals.binary_search_by(|(r, _)| {
|
||||
if point >= r.start {
|
||||
if point < r.end { Equal } else { Less }
|
||||
} else { Greater }
|
||||
});
|
||||
|
||||
let n = match element { Ok(n) => n, Err(n) => n };
|
||||
|
||||
match self.intervals.get(n) {
|
||||
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct QueryEnhancerBuilder<'a, S> {
|
||||
query: &'a [S],
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
|
||||
}
|
||||
|
||||
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
||||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
||||
// we initialize origins query indices based on their positions
|
||||
let origins: Vec<_> = (0..query.len() + 1).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
|
||||
|
||||
QueryEnhancerBuilder { query, origins, real_to_origin }
|
||||
}
|
||||
|
||||
/// Update the final real to origin query indices mapping.
|
||||
///
|
||||
/// `range` is the original words range that this `replacement` words replace
|
||||
/// and `real` is the first real query index of these replacement words.
|
||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
||||
where T: AsRef<str>,
|
||||
{
|
||||
// check if the range of original words
|
||||
// can be rewritten with the replacement words
|
||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
||||
|
||||
// this range can be replaced so we need to
|
||||
// modify the origins accordingly
|
||||
let offset = replacement.len() - range.len();
|
||||
|
||||
let previous_padding = self.origins[range.end - 1];
|
||||
let current_offset = (self.origins[range.end] - 1) - previous_padding;
|
||||
let diff = offset.saturating_sub(current_offset);
|
||||
self.origins[range.end] += diff;
|
||||
|
||||
for r in &mut self.origins[range.end + 1..] {
|
||||
*r += diff;
|
||||
}
|
||||
}
|
||||
|
||||
// we need to store the real number and origins relations
|
||||
// this way it will be possible to know by how many
|
||||
// we need to pad real query indices
|
||||
let real_range = real..real + replacement.len().max(range.len());
|
||||
let real_length = replacement.len();
|
||||
self.real_to_origin.push((real_range, (range.start, real_length)));
|
||||
}
|
||||
|
||||
pub fn build(self) -> QueryEnhancer {
|
||||
QueryEnhancer {
|
||||
origins: self.origins,
|
||||
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct QueryEnhancer {
|
||||
origins: Vec<usize>,
|
||||
real_to_origin: FakeIntervalTree,
|
||||
}
|
||||
|
||||
impl QueryEnhancer {
|
||||
/// Returns the query indices to use to replace this real query index.
|
||||
pub fn replacement(&self, real: u32) -> Range<u32> {
|
||||
let real = real as usize;
|
||||
|
||||
// query the fake interval tree with the real query index
|
||||
let (range, (origin, real_length)) =
|
||||
self.real_to_origin
|
||||
.query(real)
|
||||
.expect("real has never been declared");
|
||||
|
||||
// if `real` is the end bound of the range
|
||||
if (range.start + real_length - 1) == real {
|
||||
let mut count = range.len();
|
||||
let mut new_origin = origin;
|
||||
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
|
||||
let len = slice[1] - slice[0];
|
||||
count = count.saturating_sub(len);
|
||||
if count == 0 { new_origin = origin + i; break }
|
||||
}
|
||||
|
||||
let n = real - range.start;
|
||||
let start = self.origins[origin];
|
||||
let end = self.origins[new_origin + 1];
|
||||
let remaining = (end - start) - n;
|
||||
|
||||
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
|
||||
|
||||
} else {
|
||||
// just return the origin along with
|
||||
// the real position of the word
|
||||
let n = real as usize - range.start;
|
||||
let origin = self.origins[origin];
|
||||
|
||||
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn original_unmodified() {
|
||||
let query = ["new", "york", "city", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(2), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(3), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn simple_growing() {
|
||||
let query = ["new", "york", "subway"];
|
||||
// 0 1 2
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// new york = new york city
|
||||
builder.declare(0..2, 3, &["new", "york", "city"]);
|
||||
// ^ 3 4 5
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(1), 1..3); // york
|
||||
assert_eq!(enhancer.replacement(2), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(3), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(4), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(5), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn same_place_growings() {
|
||||
let query = ["NY", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NY = new york
|
||||
builder.declare(0..1, 2, &["new", "york"]);
|
||||
// ^ 2 3
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// NY = NYC
|
||||
builder.declare(0..1, 7, &["NYC"]);
|
||||
// ^ 7
|
||||
|
||||
// NY = new york city
|
||||
builder.declare(0..1, 8, &["new", "york", "city"]);
|
||||
// ^ 8 9 10
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(1..2, 11, &["underground", "train"]);
|
||||
// ^ 11 12
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..3); // NY
|
||||
assert_eq!(enhancer.replacement(1), 3..5); // subway
|
||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(3), 1..3); // york
|
||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(7), 0..3); // NYC
|
||||
assert_eq!(enhancer.replacement(8), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(9), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(10), 2..3); // city
|
||||
assert_eq!(enhancer.replacement(11), 3..4); // underground
|
||||
assert_eq!(enhancer.replacement(12), 4..5); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bigger_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(0..1, 2, &["new", "york", "city"]);
|
||||
// ^ 2 3 4
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..3); // NYC
|
||||
assert_eq!(enhancer.replacement(1), 3..4); // subway
|
||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
||||
assert_eq!(enhancer.replacement(3), 1..2); // york
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn middle_query_growing() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..6); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn end_query_growing() {
|
||||
let query = ["NYC", "subway"];
|
||||
// 0 1
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(1..2, 2, &["underground", "train"]);
|
||||
// ^ 2 3
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // NYC
|
||||
assert_eq!(enhancer.replacement(1), 1..3); // subway
|
||||
assert_eq!(enhancer.replacement(2), 1..2); // underground
|
||||
assert_eq!(enhancer.replacement(3), 2..3); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_probable_growings() {
|
||||
let query = ["great", "awesome", "NYC", "subway"];
|
||||
// 0 1 2 3
|
||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
||||
|
||||
// NYC = new york city
|
||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
||||
// ^ 4 5 6
|
||||
|
||||
// subway = underground train
|
||||
builder.declare(3..4, 7, &["underground", "train"]);
|
||||
// ^ 7 8
|
||||
|
||||
// great awesome = good
|
||||
builder.declare(0..2, 9, &["good"]);
|
||||
// ^ 9
|
||||
|
||||
// awesome NYC = NY
|
||||
builder.declare(1..3, 10, &["NY"]);
|
||||
// ^^ 10
|
||||
|
||||
// NYC subway = metro
|
||||
builder.declare(2..4, 11, &["metro"]);
|
||||
// ^^ 11
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
||||
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
||||
}
|
||||
}
|
41
meilisearch-core/src/ranked_map.rs
Normal file
41
meilisearch-core/src/ranked_map.rs
Normal file
|
@ -0,0 +1,41 @@
|
|||
use std::io::{Read, Write};
|
||||
|
||||
use hashbrown::HashMap;
|
||||
use meilisearch_schema::SchemaAttr;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{DocumentId, Number};
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
|
||||
|
||||
impl RankedMap {
|
||||
pub fn len(&self) -> usize {
|
||||
self.0.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) {
|
||||
self.0.insert((document, attribute), number);
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, document: DocumentId, attribute: SchemaAttr) {
|
||||
self.0.remove(&(document, attribute));
|
||||
}
|
||||
|
||||
pub fn get(&self, document: DocumentId, attribute: SchemaAttr) -> Option<Number> {
|
||||
self.0.get(&(document, attribute)).cloned()
|
||||
}
|
||||
|
||||
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<RankedMap> {
|
||||
bincode::deserialize_from(reader).map(RankedMap)
|
||||
}
|
||||
|
||||
pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
|
||||
bincode::serialize_into(writer, &self.0)
|
||||
}
|
||||
}
|
186
meilisearch-core/src/raw_document.rs
Normal file
186
meilisearch-core/src/raw_document.rs
Normal file
|
@ -0,0 +1,186 @@
|
|||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
use meilisearch_schema::SchemaAttr;
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::{DocumentId, Highlight, TmpMatch};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RawDocument {
|
||||
pub id: DocumentId,
|
||||
pub matches: SharedMatches,
|
||||
pub highlights: Vec<Highlight>,
|
||||
pub fields_counts: SetBuf<(SchemaAttr, u64)>,
|
||||
}
|
||||
|
||||
impl RawDocument {
|
||||
pub fn query_index(&self) -> &[u32] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe {
|
||||
&self
|
||||
.matches
|
||||
.matches
|
||||
.query_index
|
||||
.get_unchecked(r.start..r.end)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distance(&self) -> &[u8] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn attribute(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn word_index(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe {
|
||||
&self
|
||||
.matches
|
||||
.matches
|
||||
.word_index
|
||||
.get_unchecked(r.start..r.end)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_exact(&self) -> &[bool] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for RawDocument {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.write_str("RawDocument {\r\n")?;
|
||||
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"query_index",
|
||||
self.query_index()
|
||||
))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"distance",
|
||||
self.distance()
|
||||
))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"attribute",
|
||||
self.attribute()
|
||||
))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"word_index",
|
||||
self.word_index()
|
||||
))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"is_exact",
|
||||
self.is_exact()
|
||||
))?;
|
||||
f.write_str("}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn raw_documents_from(
|
||||
matches: SetBuf<(DocumentId, TmpMatch)>,
|
||||
highlights: SetBuf<(DocumentId, Highlight)>,
|
||||
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
|
||||
) -> Vec<RawDocument> {
|
||||
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
|
||||
let mut matches2 = Matches::with_capacity(matches.len());
|
||||
|
||||
let matches = matches.linear_group_by_key(|(id, _)| *id);
|
||||
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
|
||||
let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
|
||||
|
||||
for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
|
||||
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
|
||||
debug_assert_eq!(mgroup[0].0, fgroup[0].0);
|
||||
|
||||
let document_id = mgroup[0].0;
|
||||
let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
|
||||
let end = start + mgroup.len();
|
||||
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
|
||||
let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
|
||||
|
||||
docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
|
||||
matches2.extend_from_slice(mgroup);
|
||||
}
|
||||
|
||||
let matches = Arc::new(matches2);
|
||||
docs_ranges
|
||||
.into_iter()
|
||||
.map(|(id, range, highlights, fields_counts)| {
|
||||
let matches = SharedMatches {
|
||||
range,
|
||||
matches: matches.clone(),
|
||||
};
|
||||
RawDocument {
|
||||
id,
|
||||
matches,
|
||||
highlights,
|
||||
fields_counts,
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
struct Range {
|
||||
start: usize,
|
||||
end: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SharedMatches {
|
||||
range: Range,
|
||||
matches: Arc<Matches>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Matches {
|
||||
query_index: Vec<u32>,
|
||||
distance: Vec<u8>,
|
||||
attribute: Vec<u16>,
|
||||
word_index: Vec<u16>,
|
||||
is_exact: Vec<bool>,
|
||||
}
|
||||
|
||||
impl Matches {
|
||||
fn with_capacity(cap: usize) -> Matches {
|
||||
Matches {
|
||||
query_index: Vec::with_capacity(cap),
|
||||
distance: Vec::with_capacity(cap),
|
||||
attribute: Vec::with_capacity(cap),
|
||||
word_index: Vec::with_capacity(cap),
|
||||
is_exact: Vec::with_capacity(cap),
|
||||
}
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
|
||||
for (_, match_) in matches {
|
||||
self.query_index.push(match_.query_index);
|
||||
self.distance.push(match_.distance);
|
||||
self.attribute.push(match_.attribute);
|
||||
self.word_index.push(match_.word_index);
|
||||
self.is_exact.push(match_.is_exact);
|
||||
}
|
||||
}
|
||||
}
|
271
meilisearch-core/src/raw_indexer.rs
Normal file
271
meilisearch-core/src/raw_indexer.rs
Normal file
|
@ -0,0 +1,271 @@
|
|||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::convert::TryFrom;
|
||||
|
||||
use crate::{DocIndex, DocumentId};
|
||||
use deunicode::deunicode_with_tofu;
|
||||
use meilisearch_schema::SchemaAttr;
|
||||
use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
|
||||
use sdset::SetBuf;
|
||||
|
||||
const WORD_LENGTH_LIMIT: usize = 80;
|
||||
|
||||
type Word = Vec<u8>; // TODO make it be a SmallVec
|
||||
|
||||
pub struct RawIndexer {
|
||||
word_limit: usize, // the maximum number of indexed words
|
||||
stop_words: fst::Set,
|
||||
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
|
||||
docs_words: HashMap<DocumentId, Vec<Word>>,
|
||||
}
|
||||
|
||||
pub struct Indexed {
|
||||
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
|
||||
pub docs_words: HashMap<DocumentId, fst::Set>,
|
||||
}
|
||||
|
||||
impl RawIndexer {
|
||||
pub fn new(stop_words: fst::Set) -> RawIndexer {
|
||||
RawIndexer::with_word_limit(stop_words, 1000)
|
||||
}
|
||||
|
||||
pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
|
||||
RawIndexer {
|
||||
word_limit: limit,
|
||||
stop_words,
|
||||
words_doc_indexes: BTreeMap::new(),
|
||||
docs_words: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
|
||||
let mut number_of_words = 0;
|
||||
|
||||
for token in Tokenizer::new(text) {
|
||||
let must_continue = index_token(
|
||||
token,
|
||||
id,
|
||||
attr,
|
||||
self.word_limit,
|
||||
&self.stop_words,
|
||||
&mut self.words_doc_indexes,
|
||||
&mut self.docs_words,
|
||||
);
|
||||
|
||||
number_of_words += 1;
|
||||
|
||||
if !must_continue {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
number_of_words
|
||||
}
|
||||
|
||||
pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
|
||||
where
|
||||
I: IntoIterator<Item = &'a str>,
|
||||
{
|
||||
let iter = iter.into_iter();
|
||||
for token in SeqTokenizer::new(iter) {
|
||||
let must_continue = index_token(
|
||||
token,
|
||||
id,
|
||||
attr,
|
||||
self.word_limit,
|
||||
&self.stop_words,
|
||||
&mut self.words_doc_indexes,
|
||||
&mut self.docs_words,
|
||||
);
|
||||
|
||||
if !must_continue {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(self) -> Indexed {
|
||||
let words_doc_indexes = self
|
||||
.words_doc_indexes
|
||||
.into_iter()
|
||||
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
|
||||
.collect();
|
||||
|
||||
let docs_words = self
|
||||
.docs_words
|
||||
.into_iter()
|
||||
.map(|(id, mut words)| {
|
||||
words.sort_unstable();
|
||||
words.dedup();
|
||||
(id, fst::Set::from_iter(words).unwrap())
|
||||
})
|
||||
.collect();
|
||||
|
||||
Indexed {
|
||||
words_doc_indexes,
|
||||
docs_words,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn index_token(
|
||||
token: Token,
|
||||
id: DocumentId,
|
||||
attr: SchemaAttr,
|
||||
word_limit: usize,
|
||||
stop_words: &fst::Set,
|
||||
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
|
||||
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
|
||||
) -> bool {
|
||||
if token.word_index >= word_limit {
|
||||
return false;
|
||||
}
|
||||
|
||||
let lower = token.word.to_lowercase();
|
||||
let token = Token {
|
||||
word: &lower,
|
||||
..token
|
||||
};
|
||||
|
||||
if !stop_words.contains(&token.word) {
|
||||
match token_to_docindex(id, attr, token) {
|
||||
Some(docindex) => {
|
||||
let word = Vec::from(token.word);
|
||||
|
||||
if word.len() <= WORD_LENGTH_LIMIT {
|
||||
words_doc_indexes
|
||||
.entry(word.clone())
|
||||
.or_insert_with(Vec::new)
|
||||
.push(docindex);
|
||||
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
||||
|
||||
if !lower.contains(is_cjk) {
|
||||
let unidecoded = deunicode_with_tofu(&lower, "");
|
||||
if unidecoded != lower && !unidecoded.is_empty() {
|
||||
let word = Vec::from(unidecoded);
|
||||
if word.len() <= WORD_LENGTH_LIMIT {
|
||||
words_doc_indexes
|
||||
.entry(word.clone())
|
||||
.or_insert_with(Vec::new)
|
||||
.push(docindex);
|
||||
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None => return false,
|
||||
}
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
|
||||
let word_index = u16::try_from(token.word_index).ok()?;
|
||||
let char_index = u16::try_from(token.char_index).ok()?;
|
||||
let char_length = u16::try_from(token.word.chars().count()).ok()?;
|
||||
|
||||
let docindex = DocIndex {
|
||||
document_id: id,
|
||||
attribute: attr.0,
|
||||
word_index,
|
||||
char_index,
|
||||
char_length,
|
||||
};
|
||||
|
||||
Some(docindex)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn strange_apostrophe() {
|
||||
let mut indexer = RawIndexer::new(fst::Set::default());
|
||||
|
||||
let docid = DocumentId(0);
|
||||
let attr = SchemaAttr(0);
|
||||
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
|
||||
indexer.index_text(docid, attr, text);
|
||||
|
||||
let Indexed {
|
||||
words_doc_indexes, ..
|
||||
} = indexer.build();
|
||||
|
||||
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
|
||||
assert!(words_doc_indexes
|
||||
.get(&"éteindre".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn strange_apostrophe_in_sequence() {
|
||||
let mut indexer = RawIndexer::new(fst::Set::default());
|
||||
|
||||
let docid = DocumentId(0);
|
||||
let attr = SchemaAttr(0);
|
||||
let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
|
||||
indexer.index_text_seq(docid, attr, text);
|
||||
|
||||
let Indexed {
|
||||
words_doc_indexes, ..
|
||||
} = indexer.build();
|
||||
|
||||
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
|
||||
assert!(words_doc_indexes
|
||||
.get(&"éteindre".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic_stop_words() {
|
||||
let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
|
||||
let stop_words = fst::Set::from_iter(stop_words).unwrap();
|
||||
|
||||
let mut indexer = RawIndexer::new(stop_words);
|
||||
|
||||
let docid = DocumentId(0);
|
||||
let attr = SchemaAttr(0);
|
||||
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
|
||||
indexer.index_text(docid, attr, text);
|
||||
|
||||
let Indexed {
|
||||
words_doc_indexes, ..
|
||||
} = indexer.build();
|
||||
|
||||
assert!(words_doc_indexes.get(&b"l"[..]).is_none());
|
||||
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"j"[..]).is_none());
|
||||
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
|
||||
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
|
||||
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
|
||||
assert!(words_doc_indexes
|
||||
.get(&"éteindre".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_empty_unidecode() {
|
||||
let mut indexer = RawIndexer::new(fst::Set::default());
|
||||
|
||||
let docid = DocumentId(0);
|
||||
let attr = SchemaAttr(0);
|
||||
let text = "🇯🇵";
|
||||
indexer.index_text(docid, attr, text);
|
||||
|
||||
let Indexed {
|
||||
words_doc_indexes, ..
|
||||
} = indexer.build();
|
||||
|
||||
assert!(words_doc_indexes
|
||||
.get(&"🇯🇵".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
}
|
27
meilisearch-core/src/reordered_attrs.rs
Normal file
27
meilisearch-core/src/reordered_attrs.rs
Normal file
|
@ -0,0 +1,27 @@
|
|||
#[derive(Default, Clone)]
|
||||
pub struct ReorderedAttrs {
|
||||
count: usize,
|
||||
reorders: Vec<Option<u16>>,
|
||||
}
|
||||
|
||||
impl ReorderedAttrs {
|
||||
pub fn new() -> ReorderedAttrs {
|
||||
ReorderedAttrs {
|
||||
count: 0,
|
||||
reorders: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_attribute(&mut self, attribute: u16) {
|
||||
self.reorders.resize(attribute as usize + 1, None);
|
||||
self.reorders[attribute as usize] = Some(self.count as u16);
|
||||
self.count += 1;
|
||||
}
|
||||
|
||||
pub fn get(&self, attribute: u16) -> Option<u16> {
|
||||
match self.reorders.get(attribute as usize) {
|
||||
Some(Some(attribute)) => Some(*attribute),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
198
meilisearch-core/src/serde/convert_to_number.rs
Normal file
198
meilisearch-core/src/serde/convert_to_number.rs
Normal file
|
@ -0,0 +1,198 @@
|
|||
use std::str::FromStr;
|
||||
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde::ser;
|
||||
use serde::Serialize;
|
||||
|
||||
use super::SerializerError;
|
||||
use crate::Number;
|
||||
|
||||
pub struct ConvertToNumber;
|
||||
|
||||
impl ser::Serializer for ConvertToNumber {
|
||||
type Ok = Number;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(u64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_char(self, _value: char) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "char" })
|
||||
}
|
||||
|
||||
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Signed(i64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Signed(i64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Signed(i64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Signed(value))
|
||||
}
|
||||
|
||||
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(u64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(u64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(u64::from(value)))
|
||||
}
|
||||
|
||||
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Unsigned(value))
|
||||
}
|
||||
|
||||
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Float(OrderedFloat(f64::from(value))))
|
||||
}
|
||||
|
||||
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::Float(OrderedFloat(value)))
|
||||
}
|
||||
|
||||
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(Number::from_str(value)?)
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "sequence",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
258
meilisearch-core/src/serde/convert_to_string.rs
Normal file
258
meilisearch-core/src/serde/convert_to_string.rs
Normal file
|
@ -0,0 +1,258 @@
|
|||
use serde::ser;
|
||||
use serde::Serialize;
|
||||
|
||||
use super::SerializerError;
|
||||
|
||||
pub struct ConvertToString;
|
||||
|
||||
impl ser::Serializer for ConvertToString {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapConvertToString;
|
||||
type SerializeStruct = StructConvertToString;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "boolean",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "sequence",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(MapConvertToString {
|
||||
text: String::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Ok(StructConvertToString {
|
||||
text: String::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapConvertToString {
|
||||
text: String,
|
||||
}
|
||||
|
||||
impl ser::SerializeMap for MapConvertToString {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = key.serialize(ConvertToString)?;
|
||||
self.text.push_str(&text);
|
||||
self.text.push_str(" ");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.text.push_str(&text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(self.text)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructConvertToString {
|
||||
text: String,
|
||||
}
|
||||
|
||||
impl ser::SerializeStruct for StructConvertToString {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let value = value.serialize(ConvertToString)?;
|
||||
self.text.push_str(key);
|
||||
self.text.push_str(" ");
|
||||
self.text.push_str(&value);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(self.text)
|
||||
}
|
||||
}
|
158
meilisearch-core/src/serde/deserializer.rs
Normal file
158
meilisearch-core/src/serde/deserializer.rs
Normal file
|
@ -0,0 +1,158 @@
|
|||
use std::collections::HashSet;
|
||||
use std::io::Cursor;
|
||||
use std::{error::Error, fmt};
|
||||
|
||||
use meilisearch_schema::{Schema, SchemaAttr};
|
||||
use serde::{de, forward_to_deserialize_any};
|
||||
use serde_json::de::IoRead as SerdeJsonIoRead;
|
||||
use serde_json::Deserializer as SerdeJsonDeserializer;
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
|
||||
use crate::store::DocumentsFields;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum DeserializerError {
|
||||
SerdeJson(SerdeJsonError),
|
||||
Zlmdb(heed::Error),
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl de::Error for DeserializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
DeserializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeserializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
DeserializerError::SerdeJson(e) => write!(f, "serde json related error: {}", e),
|
||||
DeserializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
|
||||
DeserializerError::Custom(s) => f.write_str(s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for DeserializerError {}
|
||||
|
||||
impl From<SerdeJsonError> for DeserializerError {
|
||||
fn from(error: SerdeJsonError) -> DeserializerError {
|
||||
DeserializerError::SerdeJson(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<heed::Error> for DeserializerError {
|
||||
fn from(error: heed::Error) -> DeserializerError {
|
||||
DeserializerError::Zlmdb(error)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Deserializer<'a> {
|
||||
pub document_id: DocumentId,
|
||||
pub reader: &'a heed::RoTxn,
|
||||
pub documents_fields: DocumentsFields,
|
||||
pub schema: &'a Schema,
|
||||
pub attributes: Option<&'a HashSet<SchemaAttr>>,
|
||||
}
|
||||
|
||||
impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
|
||||
type Error = DeserializerError;
|
||||
|
||||
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where
|
||||
V: de::Visitor<'de>,
|
||||
{
|
||||
self.deserialize_option(visitor)
|
||||
}
|
||||
|
||||
fn deserialize_option<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where
|
||||
V: de::Visitor<'de>,
|
||||
{
|
||||
self.deserialize_map(visitor)
|
||||
}
|
||||
|
||||
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where
|
||||
V: de::Visitor<'de>,
|
||||
{
|
||||
let mut error = None;
|
||||
|
||||
let iter = self
|
||||
.documents_fields
|
||||
.document_fields(self.reader, self.document_id)?
|
||||
.filter_map(|result| {
|
||||
let (attr, value) = match result {
|
||||
Ok(value) => value,
|
||||
Err(e) => {
|
||||
error = Some(e);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let is_displayed = self.schema.props(attr).is_displayed();
|
||||
if is_displayed && self.attributes.map_or(true, |f| f.contains(&attr)) {
|
||||
let attribute_name = self.schema.attribute_name(attr);
|
||||
|
||||
let cursor = Cursor::new(value.to_owned());
|
||||
let ioread = SerdeJsonIoRead::new(cursor);
|
||||
let value = Value(SerdeJsonDeserializer::new(ioread));
|
||||
|
||||
Some((attribute_name, value))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
let mut iter = iter.peekable();
|
||||
|
||||
let result = match iter.peek() {
|
||||
Some(_) => {
|
||||
let map_deserializer = de::value::MapDeserializer::new(iter);
|
||||
visitor
|
||||
.visit_some(map_deserializer)
|
||||
.map_err(DeserializerError::from)
|
||||
}
|
||||
None => visitor.visit_none(),
|
||||
};
|
||||
|
||||
match error.take() {
|
||||
Some(error) => Err(error.into()),
|
||||
None => result,
|
||||
}
|
||||
}
|
||||
|
||||
forward_to_deserialize_any! {
|
||||
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
|
||||
bytes byte_buf unit unit_struct newtype_struct seq tuple
|
||||
tuple_struct struct enum identifier ignored_any
|
||||
}
|
||||
}
|
||||
|
||||
struct Value(SerdeJsonDeserializer<SerdeJsonIoRead<Cursor<Vec<u8>>>>);
|
||||
|
||||
impl<'de> de::IntoDeserializer<'de, SerdeJsonError> for Value {
|
||||
type Deserializer = Self;
|
||||
|
||||
fn into_deserializer(self) -> Self::Deserializer {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> de::Deserializer<'de> for Value {
|
||||
type Error = SerdeJsonError;
|
||||
|
||||
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where
|
||||
V: de::Visitor<'de>,
|
||||
{
|
||||
self.0.deserialize_any(visitor)
|
||||
}
|
||||
|
||||
forward_to_deserialize_any! {
|
||||
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
|
||||
bytes byte_buf option unit unit_struct newtype_struct seq tuple
|
||||
tuple_struct map struct enum identifier ignored_any
|
||||
}
|
||||
}
|
295
meilisearch-core/src/serde/extract_document_id.rs
Normal file
295
meilisearch-core/src/serde/extract_document_id.rs
Normal file
|
@ -0,0 +1,295 @@
|
|||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use crate::DocumentId;
|
||||
use serde::{ser, Serialize};
|
||||
use serde_json::Value;
|
||||
use siphasher::sip::SipHasher;
|
||||
|
||||
use super::{ConvertToString, SerializerError};
|
||||
|
||||
pub fn extract_document_id<D>(
|
||||
identifier: &str,
|
||||
document: &D,
|
||||
) -> Result<Option<DocumentId>, SerializerError>
|
||||
where
|
||||
D: serde::Serialize,
|
||||
{
|
||||
let serializer = ExtractDocumentId { identifier };
|
||||
document.serialize(serializer)
|
||||
}
|
||||
|
||||
pub fn value_to_string(value: &Value) -> Option<String> {
|
||||
match value {
|
||||
Value::Null => None,
|
||||
Value::Bool(_) => None,
|
||||
Value::Number(value) => Some(value.to_string()),
|
||||
Value::String(value) => Some(value.to_string()),
|
||||
Value::Array(_) => None,
|
||||
Value::Object(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compute_document_id<H: Hash>(t: H) -> DocumentId {
|
||||
let mut s = SipHasher::new();
|
||||
t.hash(&mut s);
|
||||
let hash = s.finish();
|
||||
DocumentId(hash)
|
||||
}
|
||||
|
||||
struct ExtractDocumentId<'a> {
|
||||
identifier: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
type Ok = Option<DocumentId>;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ExtractDocumentIdMapSerializer<'a>;
|
||||
type SerializeStruct = ExtractDocumentIdStructSerializer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _value: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "sequence",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
let serializer = ExtractDocumentIdMapSerializer {
|
||||
identifier: self.identifier,
|
||||
document_id: None,
|
||||
current_key_name: None,
|
||||
};
|
||||
|
||||
Ok(serializer)
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
let serializer = ExtractDocumentIdStructSerializer {
|
||||
identifier: self.identifier,
|
||||
document_id: None,
|
||||
};
|
||||
|
||||
Ok(serializer)
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExtractDocumentIdMapSerializer<'a> {
|
||||
identifier: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
|
||||
type Ok = Option<DocumentId>;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V,
|
||||
) -> Result<(), Self::Error>
|
||||
where
|
||||
K: Serialize,
|
||||
V: Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
|
||||
if self.identifier == key {
|
||||
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
|
||||
match value_to_string(&value).map(|s| compute_document_id(&s)) {
|
||||
Some(document_id) => self.document_id = Some(document_id),
|
||||
None => return Err(SerializerError::InvalidDocumentIdType),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(self.document_id)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExtractDocumentIdStructSerializer<'a> {
|
||||
identifier: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
|
||||
type Ok = Option<DocumentId>;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
if self.identifier == key {
|
||||
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
|
||||
match value_to_string(&value).map(compute_document_id) {
|
||||
Some(document_id) => self.document_id = Some(document_id),
|
||||
None => return Err(SerializerError::InvalidDocumentIdType),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(self.document_id)
|
||||
}
|
||||
}
|
365
meilisearch-core/src/serde/indexer.rs
Normal file
365
meilisearch-core/src/serde/indexer.rs
Normal file
|
@ -0,0 +1,365 @@
|
|||
use meilisearch_schema::SchemaAttr;
|
||||
use serde::ser;
|
||||
use serde::Serialize;
|
||||
|
||||
use super::{ConvertToString, SerializerError};
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct Indexer<'a> {
|
||||
pub attribute: SchemaAttr,
|
||||
pub indexer: &'a mut RawIndexer,
|
||||
pub document_id: DocumentId,
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for Indexer<'a> {
|
||||
type Ok = Option<usize>;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = SeqIndexer<'a>;
|
||||
type SerializeTuple = TupleIndexer<'a>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapIndexer<'a>;
|
||||
type SerializeStruct = StructIndexer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "boolean",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.serialize_str(&text)
|
||||
}
|
||||
|
||||
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
|
||||
let number_of_words = self
|
||||
.indexer
|
||||
.index_text(self.document_id, self.attribute, text);
|
||||
Ok(Some(number_of_words))
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
let number_of_words = self
|
||||
.indexer
|
||||
.index_text(self.document_id, self.attribute, &text);
|
||||
Ok(Some(number_of_words))
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
let indexer = SeqIndexer {
|
||||
attribute: self.attribute,
|
||||
document_id: self.document_id,
|
||||
indexer: self.indexer,
|
||||
texts: Vec::new(),
|
||||
};
|
||||
|
||||
Ok(indexer)
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
let indexer = TupleIndexer {
|
||||
attribute: self.attribute,
|
||||
document_id: self.document_id,
|
||||
indexer: self.indexer,
|
||||
texts: Vec::new(),
|
||||
};
|
||||
|
||||
Ok(indexer)
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
let indexer = MapIndexer {
|
||||
attribute: self.attribute,
|
||||
document_id: self.document_id,
|
||||
indexer: self.indexer,
|
||||
texts: Vec::new(),
|
||||
};
|
||||
|
||||
Ok(indexer)
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SeqIndexer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
||||
type Ok = Option<usize>;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer
|
||||
.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapIndexer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
||||
type Ok = Option<usize>;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = key.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer
|
||||
.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructIndexer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for StructIndexer<'a> {
|
||||
type Ok = Option<usize>;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let key_text = key.to_owned();
|
||||
let value_text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(key_text);
|
||||
self.texts.push(value_text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer
|
||||
.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TupleIndexer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
||||
type Ok = Option<usize>;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer
|
||||
.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
103
meilisearch-core/src/serde/mod.rs
Normal file
103
meilisearch-core/src/serde/mod.rs
Normal file
|
@ -0,0 +1,103 @@
|
|||
macro_rules! forward_to_unserializable_type {
|
||||
($($ty:ident => $se_method:ident,)*) => {
|
||||
$(
|
||||
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "$ty" })
|
||||
}
|
||||
)*
|
||||
}
|
||||
}
|
||||
|
||||
mod convert_to_number;
|
||||
mod convert_to_string;
|
||||
mod deserializer;
|
||||
mod extract_document_id;
|
||||
mod indexer;
|
||||
mod serializer;
|
||||
|
||||
pub use self::convert_to_number::ConvertToNumber;
|
||||
pub use self::convert_to_string::ConvertToString;
|
||||
pub use self::deserializer::{Deserializer, DeserializerError};
|
||||
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
|
||||
pub use self::indexer::Indexer;
|
||||
pub use self::serializer::{serialize_value, Serializer};
|
||||
|
||||
use std::{error::Error, fmt};
|
||||
|
||||
use serde::ser;
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
|
||||
use crate::ParseNumberError;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SerializerError {
|
||||
DocumentIdNotFound,
|
||||
InvalidDocumentIdType,
|
||||
Zlmdb(heed::Error),
|
||||
SerdeJson(SerdeJsonError),
|
||||
ParseNumber(ParseNumberError),
|
||||
UnserializableType { type_name: &'static str },
|
||||
UnindexableType { type_name: &'static str },
|
||||
UnrankableType { type_name: &'static str },
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl ser::Error for SerializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
SerializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SerializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SerializerError::DocumentIdNotFound => {
|
||||
f.write_str("serialized document does not have an id according to the schema")
|
||||
}
|
||||
SerializerError::InvalidDocumentIdType => {
|
||||
f.write_str("document identifier can only be of type string or number")
|
||||
}
|
||||
SerializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
|
||||
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
|
||||
SerializerError::ParseNumber(e) => {
|
||||
write!(f, "error while trying to parse a number: {}", e)
|
||||
}
|
||||
SerializerError::UnserializableType { type_name } => {
|
||||
write!(f, "{} is not a serializable type", type_name)
|
||||
}
|
||||
SerializerError::UnindexableType { type_name } => {
|
||||
write!(f, "{} is not an indexable type", type_name)
|
||||
}
|
||||
SerializerError::UnrankableType { type_name } => {
|
||||
write!(f, "{} types can not be used for ranking", type_name)
|
||||
}
|
||||
SerializerError::Custom(s) => f.write_str(s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SerializerError {}
|
||||
|
||||
impl From<String> for SerializerError {
|
||||
fn from(value: String) -> SerializerError {
|
||||
SerializerError::Custom(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SerdeJsonError> for SerializerError {
|
||||
fn from(error: SerdeJsonError) -> SerializerError {
|
||||
SerializerError::SerdeJson(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<heed::Error> for SerializerError {
|
||||
fn from(error: heed::Error) -> SerializerError {
|
||||
SerializerError::Zlmdb(error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ParseNumberError> for SerializerError {
|
||||
fn from(error: ParseNumberError) -> SerializerError {
|
||||
SerializerError::ParseNumber(error)
|
||||
}
|
||||
}
|
338
meilisearch-core/src/serde/serializer.rs
Normal file
338
meilisearch-core/src/serde/serializer.rs
Normal file
|
@ -0,0 +1,338 @@
|
|||
use meilisearch_schema::{Schema, SchemaAttr, SchemaProps};
|
||||
use serde::ser;
|
||||
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::store::{DocumentsFields, DocumentsFieldsCounts};
|
||||
use crate::{DocumentId, RankedMap};
|
||||
|
||||
use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
|
||||
|
||||
pub struct Serializer<'a, 'b> {
|
||||
pub txn: &'a mut heed::RwTxn<'b>,
|
||||
pub schema: &'a Schema,
|
||||
pub document_store: DocumentsFields,
|
||||
pub document_fields_counts: DocumentsFieldsCounts,
|
||||
pub indexer: &'a mut RawIndexer,
|
||||
pub ranked_map: &'a mut RankedMap,
|
||||
pub document_id: DocumentId,
|
||||
}
|
||||
|
||||
impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapSerializer<'a, 'b>;
|
||||
type SerializeStruct = StructSerializer<'a, 'b>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "sequence",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(MapSerializer {
|
||||
txn: self.txn,
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
document_store: self.document_store,
|
||||
document_fields_counts: self.document_fields_counts,
|
||||
indexer: self.indexer,
|
||||
ranked_map: self.ranked_map,
|
||||
current_key_name: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Ok(StructSerializer {
|
||||
txn: self.txn,
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
document_store: self.document_store,
|
||||
document_fields_counts: self.document_fields_counts,
|
||||
indexer: self.indexer,
|
||||
ranked_map: self.ranked_map,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapSerializer<'a, 'b> {
|
||||
txn: &'a mut heed::RwTxn<'b>,
|
||||
schema: &'a Schema,
|
||||
document_id: DocumentId,
|
||||
document_store: DocumentsFields,
|
||||
document_fields_counts: DocumentsFieldsCounts,
|
||||
indexer: &'a mut RawIndexer,
|
||||
ranked_map: &'a mut RankedMap,
|
||||
current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V,
|
||||
) -> Result<(), Self::Error>
|
||||
where
|
||||
K: ser::Serialize,
|
||||
V: ser::Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
match self.schema.attribute(&key) {
|
||||
Some(attribute) => serialize_value(
|
||||
self.txn,
|
||||
attribute,
|
||||
self.schema.props(attribute),
|
||||
self.document_id,
|
||||
self.document_store,
|
||||
self.document_fields_counts,
|
||||
self.indexer,
|
||||
self.ranked_map,
|
||||
value,
|
||||
),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a, 'b> {
|
||||
txn: &'a mut heed::RwTxn<'b>,
|
||||
schema: &'a Schema,
|
||||
document_id: DocumentId,
|
||||
document_store: DocumentsFields,
|
||||
document_fields_counts: DocumentsFieldsCounts,
|
||||
indexer: &'a mut RawIndexer,
|
||||
ranked_map: &'a mut RankedMap,
|
||||
}
|
||||
|
||||
impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
match self.schema.attribute(key) {
|
||||
Some(attribute) => serialize_value(
|
||||
self.txn,
|
||||
attribute,
|
||||
self.schema.props(attribute),
|
||||
self.document_id,
|
||||
self.document_store,
|
||||
self.document_fields_counts,
|
||||
self.indexer,
|
||||
self.ranked_map,
|
||||
value,
|
||||
),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn serialize_value<T: ?Sized>(
|
||||
txn: &mut heed::RwTxn,
|
||||
attribute: SchemaAttr,
|
||||
props: SchemaProps,
|
||||
document_id: DocumentId,
|
||||
document_store: DocumentsFields,
|
||||
documents_fields_counts: DocumentsFieldsCounts,
|
||||
indexer: &mut RawIndexer,
|
||||
ranked_map: &mut RankedMap,
|
||||
value: &T,
|
||||
) -> Result<(), SerializerError>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let serialized = serde_json::to_vec(value)?;
|
||||
document_store.put_document_field(txn, document_id, attribute, &serialized)?;
|
||||
|
||||
if props.is_indexed() {
|
||||
let indexer = Indexer {
|
||||
attribute,
|
||||
indexer,
|
||||
document_id,
|
||||
};
|
||||
if let Some(number_of_words) = value.serialize(indexer)? {
|
||||
documents_fields_counts.put_document_field_count(
|
||||
txn,
|
||||
document_id,
|
||||
attribute,
|
||||
number_of_words as u64,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
if props.is_ranked() {
|
||||
let number = value.serialize(ConvertToNumber)?;
|
||||
ranked_map.insert(document_id, attribute, number);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
49
meilisearch-core/src/store/docs_words.rs
Normal file
49
meilisearch-core/src/store/docs_words.rs
Normal file
|
@ -0,0 +1,49 @@
|
|||
use super::BEU64;
|
||||
use crate::DocumentId;
|
||||
use heed::types::{ByteSlice, OwnedType};
|
||||
use heed::Result as ZResult;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocsWords {
|
||||
pub(crate) docs_words: heed::Database<OwnedType<BEU64>, ByteSlice>,
|
||||
}
|
||||
|
||||
impl DocsWords {
|
||||
pub fn put_doc_words(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
document_id: DocumentId,
|
||||
words: &fst::Set,
|
||||
) -> ZResult<()> {
|
||||
let document_id = BEU64::new(document_id.0);
|
||||
let bytes = words.as_fst().as_bytes();
|
||||
self.docs_words.put(writer, &document_id, bytes)
|
||||
}
|
||||
|
||||
pub fn del_doc_words(self, writer: &mut heed::RwTxn, document_id: DocumentId) -> ZResult<bool> {
|
||||
let document_id = BEU64::new(document_id.0);
|
||||
self.docs_words.delete(writer, &document_id)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.docs_words.clear(writer)
|
||||
}
|
||||
|
||||
pub fn doc_words(
|
||||
self,
|
||||
reader: &heed::RoTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<Option<fst::Set>> {
|
||||
let document_id = BEU64::new(document_id.0);
|
||||
match self.docs_words.get(reader, &document_id)? {
|
||||
Some(bytes) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::new(bytes.to_owned());
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
78
meilisearch-core/src/store/documents_fields.rs
Normal file
78
meilisearch-core/src/store/documents_fields.rs
Normal file
|
@ -0,0 +1,78 @@
|
|||
use heed::types::{ByteSlice, OwnedType};
|
||||
use heed::Result as ZResult;
|
||||
use meilisearch_schema::SchemaAttr;
|
||||
|
||||
use super::DocumentAttrKey;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocumentsFields {
|
||||
pub(crate) documents_fields: heed::Database<OwnedType<DocumentAttrKey>, ByteSlice>,
|
||||
}
|
||||
|
||||
impl DocumentsFields {
|
||||
pub fn put_document_field(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
value: &[u8],
|
||||
) -> ZResult<()> {
|
||||
let key = DocumentAttrKey::new(document_id, attribute);
|
||||
self.documents_fields.put(writer, &key, value)
|
||||
}
|
||||
|
||||
pub fn del_all_document_fields(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<usize> {
|
||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||
self.documents_fields.delete_range(writer, &(start..=end))
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.documents_fields.clear(writer)
|
||||
}
|
||||
|
||||
pub fn document_attribute<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
) -> ZResult<Option<&'txn [u8]>> {
|
||||
let key = DocumentAttrKey::new(document_id, attribute);
|
||||
self.documents_fields.get(reader, &key)
|
||||
}
|
||||
|
||||
pub fn document_fields<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<DocumentFieldsIter<'txn>> {
|
||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||
let iter = self.documents_fields.range(reader, &(start..=end))?;
|
||||
Ok(DocumentFieldsIter { iter })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentFieldsIter<'txn> {
|
||||
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, ByteSlice>,
|
||||
}
|
||||
|
||||
impl<'txn> Iterator for DocumentFieldsIter<'txn> {
|
||||
type Item = ZResult<(SchemaAttr, &'txn [u8])>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok((key, bytes))) => {
|
||||
let attr = SchemaAttr(key.attr.get());
|
||||
Some(Ok((attr, bytes)))
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
142
meilisearch-core/src/store/documents_fields_counts.rs
Normal file
142
meilisearch-core/src/store/documents_fields_counts.rs
Normal file
|
@ -0,0 +1,142 @@
|
|||
use super::DocumentAttrKey;
|
||||
use crate::DocumentId;
|
||||
use heed::types::OwnedType;
|
||||
use heed::Result as ZResult;
|
||||
use meilisearch_schema::SchemaAttr;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocumentsFieldsCounts {
|
||||
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
|
||||
}
|
||||
|
||||
impl DocumentsFieldsCounts {
|
||||
pub fn put_document_field_count(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
value: u64,
|
||||
) -> ZResult<()> {
|
||||
let key = DocumentAttrKey::new(document_id, attribute);
|
||||
self.documents_fields_counts.put(writer, &key, &value)
|
||||
}
|
||||
|
||||
pub fn del_all_document_fields_counts(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<usize> {
|
||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||
self.documents_fields_counts
|
||||
.delete_range(writer, &(start..=end))
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.documents_fields_counts.clear(writer)
|
||||
}
|
||||
|
||||
pub fn document_field_count(
|
||||
self,
|
||||
reader: &heed::RoTxn,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
) -> ZResult<Option<u64>> {
|
||||
let key = DocumentAttrKey::new(document_id, attribute);
|
||||
match self.documents_fields_counts.get(reader, &key)? {
|
||||
Some(count) => Ok(Some(count)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn document_fields_counts<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<DocumentFieldsCountsIter<'txn>> {
|
||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||
let iter = self.documents_fields_counts.range(reader, &(start..=end))?;
|
||||
Ok(DocumentFieldsCountsIter { iter })
|
||||
}
|
||||
|
||||
pub fn documents_ids<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<DocumentsIdsIter<'txn>> {
|
||||
let iter = self.documents_fields_counts.iter(reader)?;
|
||||
Ok(DocumentsIdsIter {
|
||||
last_seen_id: None,
|
||||
iter,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn all_documents_fields_counts<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn,
|
||||
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
|
||||
let iter = self.documents_fields_counts.iter(reader)?;
|
||||
Ok(AllDocumentsFieldsCountsIter { iter })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentFieldsCountsIter<'txn> {
|
||||
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
|
||||
}
|
||||
|
||||
impl Iterator for DocumentFieldsCountsIter<'_> {
|
||||
type Item = ZResult<(SchemaAttr, u64)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok((key, count))) => {
|
||||
let attr = SchemaAttr(key.attr.get());
|
||||
Some(Ok((attr, count)))
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentsIdsIter<'txn> {
|
||||
last_seen_id: Option<DocumentId>,
|
||||
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
|
||||
}
|
||||
|
||||
impl Iterator for DocumentsIdsIter<'_> {
|
||||
type Item = ZResult<DocumentId>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
for result in &mut self.iter {
|
||||
match result {
|
||||
Ok((key, _)) => {
|
||||
let document_id = DocumentId(key.docid.get());
|
||||
if Some(document_id) != self.last_seen_id {
|
||||
self.last_seen_id = Some(document_id);
|
||||
return Some(Ok(document_id));
|
||||
}
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AllDocumentsFieldsCountsIter<'txn> {
|
||||
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
|
||||
}
|
||||
|
||||
impl Iterator for AllDocumentsFieldsCountsIter<'_> {
|
||||
type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok((key, count))) => {
|
||||
let docid = DocumentId(key.docid.get());
|
||||
let attr = SchemaAttr(key.attr.get());
|
||||
Some(Ok((docid, attr, count)))
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
183
meilisearch-core/src/store/main.rs
Normal file
183
meilisearch-core/src/store/main.rs
Normal file
|
@ -0,0 +1,183 @@
|
|||
use crate::RankedMap;
|
||||
use chrono::{DateTime, Utc};
|
||||
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str};
|
||||
use heed::Result as ZResult;
|
||||
use meilisearch_schema::Schema;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
const CREATED_AT_KEY: &str = "created-at";
|
||||
const CUSTOMS_KEY: &str = "customs-key";
|
||||
const FIELDS_FREQUENCY_KEY: &str = "fields-frequency";
|
||||
const NAME_KEY: &str = "name";
|
||||
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
|
||||
const RANKED_MAP_KEY: &str = "ranked-map";
|
||||
const SCHEMA_KEY: &str = "schema";
|
||||
const STOP_WORDS_KEY: &str = "stop-words";
|
||||
const SYNONYMS_KEY: &str = "synonyms";
|
||||
const UPDATED_AT_KEY: &str = "updated-at";
|
||||
const WORDS_KEY: &str = "words";
|
||||
|
||||
pub type FreqsMap = HashMap<String, usize>;
|
||||
type SerdeFreqsMap = SerdeBincode<FreqsMap>;
|
||||
type SerdeDatetime = SerdeBincode<DateTime<Utc>>;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Main {
|
||||
pub(crate) main: heed::PolyDatabase,
|
||||
}
|
||||
|
||||
impl Main {
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.main.clear(writer)
|
||||
}
|
||||
|
||||
pub fn put_name(self, writer: &mut heed::RwTxn, name: &str) -> ZResult<()> {
|
||||
self.main.put::<Str, Str>(writer, NAME_KEY, name)
|
||||
}
|
||||
|
||||
pub fn name(self, reader: &heed::RoTxn) -> ZResult<Option<String>> {
|
||||
Ok(self
|
||||
.main
|
||||
.get::<Str, Str>(reader, NAME_KEY)?
|
||||
.map(|name| name.to_owned()))
|
||||
}
|
||||
|
||||
pub fn put_created_at(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.main
|
||||
.put::<Str, SerdeDatetime>(writer, CREATED_AT_KEY, &Utc::now())
|
||||
}
|
||||
|
||||
pub fn created_at(self, reader: &heed::RoTxn) -> ZResult<Option<DateTime<Utc>>> {
|
||||
self.main.get::<Str, SerdeDatetime>(reader, CREATED_AT_KEY)
|
||||
}
|
||||
|
||||
pub fn put_updated_at(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.main
|
||||
.put::<Str, SerdeDatetime>(writer, UPDATED_AT_KEY, &Utc::now())
|
||||
}
|
||||
|
||||
pub fn updated_at(self, reader: &heed::RoTxn) -> ZResult<Option<DateTime<Utc>>> {
|
||||
self.main.get::<Str, SerdeDatetime>(reader, UPDATED_AT_KEY)
|
||||
}
|
||||
|
||||
pub fn put_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
|
||||
let bytes = fst.as_fst().as_bytes();
|
||||
self.main.put::<Str, ByteSlice>(writer, WORDS_KEY, bytes)
|
||||
}
|
||||
|
||||
pub fn words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
|
||||
match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? {
|
||||
Some(bytes) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::new(bytes.to_owned());
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_schema(self, writer: &mut heed::RwTxn, schema: &Schema) -> ZResult<()> {
|
||||
self.main
|
||||
.put::<Str, SerdeBincode<Schema>>(writer, SCHEMA_KEY, schema)
|
||||
}
|
||||
|
||||
pub fn schema(self, reader: &heed::RoTxn) -> ZResult<Option<Schema>> {
|
||||
self.main
|
||||
.get::<Str, SerdeBincode<Schema>>(reader, SCHEMA_KEY)
|
||||
}
|
||||
|
||||
pub fn put_ranked_map(self, writer: &mut heed::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
|
||||
self.main
|
||||
.put::<Str, SerdeBincode<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
|
||||
}
|
||||
|
||||
pub fn ranked_map(self, reader: &heed::RoTxn) -> ZResult<Option<RankedMap>> {
|
||||
self.main
|
||||
.get::<Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)
|
||||
}
|
||||
|
||||
pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
|
||||
let bytes = fst.as_fst().as_bytes();
|
||||
self.main.put::<Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
|
||||
}
|
||||
|
||||
pub fn synonyms_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
|
||||
match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? {
|
||||
Some(bytes) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::new(bytes.to_owned());
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
|
||||
let bytes = fst.as_fst().as_bytes();
|
||||
self.main
|
||||
.put::<Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
|
||||
}
|
||||
|
||||
pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
|
||||
match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
|
||||
Some(bytes) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::new(bytes.to_owned());
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
|
||||
where
|
||||
F: Fn(u64) -> u64,
|
||||
{
|
||||
let new = self.number_of_documents(writer).map(f)?;
|
||||
self.main
|
||||
.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
|
||||
Ok(new)
|
||||
}
|
||||
|
||||
pub fn number_of_documents(self, reader: &heed::RoTxn) -> ZResult<u64> {
|
||||
match self
|
||||
.main
|
||||
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
|
||||
{
|
||||
Some(value) => Ok(value),
|
||||
None => Ok(0),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_fields_frequency(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
fields_frequency: &FreqsMap,
|
||||
) -> ZResult<()> {
|
||||
self.main
|
||||
.put::<Str, SerdeFreqsMap>(writer, FIELDS_FREQUENCY_KEY, fields_frequency)
|
||||
}
|
||||
|
||||
pub fn fields_frequency(&self, reader: &heed::RoTxn) -> ZResult<Option<FreqsMap>> {
|
||||
match self
|
||||
.main
|
||||
.get::<Str, SerdeFreqsMap>(reader, FIELDS_FREQUENCY_KEY)?
|
||||
{
|
||||
Some(freqs) => Ok(Some(freqs)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_customs(self, writer: &mut heed::RwTxn, customs: &[u8]) -> ZResult<()> {
|
||||
self.main
|
||||
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
|
||||
}
|
||||
|
||||
pub fn customs<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<Option<&'txn [u8]>> {
|
||||
self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY)
|
||||
}
|
||||
}
|
394
meilisearch-core/src/store/mod.rs
Normal file
394
meilisearch-core/src/store/mod.rs
Normal file
|
@ -0,0 +1,394 @@
|
|||
mod docs_words;
|
||||
mod documents_fields;
|
||||
mod documents_fields_counts;
|
||||
mod main;
|
||||
mod postings_lists;
|
||||
mod synonyms;
|
||||
mod updates;
|
||||
mod updates_results;
|
||||
|
||||
pub use self::docs_words::DocsWords;
|
||||
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
|
||||
pub use self::documents_fields_counts::{
|
||||
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
|
||||
};
|
||||
pub use self::main::Main;
|
||||
pub use self::postings_lists::PostingsLists;
|
||||
pub use self::synonyms::Synonyms;
|
||||
pub use self::updates::Updates;
|
||||
pub use self::updates_results::UpdatesResults;
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use heed::Result as ZResult;
|
||||
use meilisearch_schema::{Schema, SchemaAttr};
|
||||
use serde::de::{self, Deserialize};
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
|
||||
use crate::criterion::Criteria;
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::serde::Deserializer;
|
||||
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
|
||||
|
||||
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
||||
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
||||
|
||||
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
||||
#[repr(C)]
|
||||
pub struct DocumentAttrKey {
|
||||
docid: BEU64,
|
||||
attr: BEU16,
|
||||
}
|
||||
|
||||
impl DocumentAttrKey {
|
||||
fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey {
|
||||
DocumentAttrKey {
|
||||
docid: BEU64::new(docid.0),
|
||||
attr: BEU16::new(attr.0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main_name(name: &str) -> String {
|
||||
format!("store-{}", name)
|
||||
}
|
||||
|
||||
fn postings_lists_name(name: &str) -> String {
|
||||
format!("store-{}-postings-lists", name)
|
||||
}
|
||||
|
||||
fn documents_fields_name(name: &str) -> String {
|
||||
format!("store-{}-documents-fields", name)
|
||||
}
|
||||
|
||||
fn documents_fields_counts_name(name: &str) -> String {
|
||||
format!("store-{}-documents-fields-counts", name)
|
||||
}
|
||||
|
||||
fn synonyms_name(name: &str) -> String {
|
||||
format!("store-{}-synonyms", name)
|
||||
}
|
||||
|
||||
fn docs_words_name(name: &str) -> String {
|
||||
format!("store-{}-docs-words", name)
|
||||
}
|
||||
|
||||
fn updates_name(name: &str) -> String {
|
||||
format!("store-{}-updates", name)
|
||||
}
|
||||
|
||||
fn updates_results_name(name: &str) -> String {
|
||||
format!("store-{}-updates-results", name)
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Index {
|
||||
pub main: Main,
|
||||
pub postings_lists: PostingsLists,
|
||||
pub documents_fields: DocumentsFields,
|
||||
pub documents_fields_counts: DocumentsFieldsCounts,
|
||||
pub synonyms: Synonyms,
|
||||
pub docs_words: DocsWords,
|
||||
|
||||
pub updates: Updates,
|
||||
pub updates_results: UpdatesResults,
|
||||
pub(crate) updates_notifier: UpdateEventsEmitter,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
pub fn document<T: de::DeserializeOwned>(
|
||||
&self,
|
||||
reader: &heed::RoTxn,
|
||||
attributes: Option<&HashSet<&str>>,
|
||||
document_id: DocumentId,
|
||||
) -> MResult<Option<T>> {
|
||||
let schema = self.main.schema(reader)?;
|
||||
let schema = schema.ok_or(Error::SchemaMissing)?;
|
||||
|
||||
let attributes = match attributes {
|
||||
Some(attributes) => attributes
|
||||
.iter()
|
||||
.map(|name| schema.attribute(name))
|
||||
.collect(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let mut deserializer = Deserializer {
|
||||
document_id,
|
||||
reader,
|
||||
documents_fields: self.documents_fields,
|
||||
schema: &schema,
|
||||
attributes: attributes.as_ref(),
|
||||
};
|
||||
|
||||
Ok(Option::<T>::deserialize(&mut deserializer)?)
|
||||
}
|
||||
|
||||
pub fn document_attribute<T: de::DeserializeOwned>(
|
||||
&self,
|
||||
reader: &heed::RoTxn,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
) -> MResult<Option<T>> {
|
||||
let bytes = self
|
||||
.documents_fields
|
||||
.document_attribute(reader, document_id, attribute)?;
|
||||
match bytes {
|
||||
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn schema_update(&self, writer: &mut heed::RwTxn, schema: Schema) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
update::push_schema_update(writer, self.updates, self.updates_results, schema)
|
||||
}
|
||||
|
||||
pub fn customs_update(&self, writer: &mut heed::RwTxn, customs: Vec<u8>) -> ZResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
update::push_customs_update(writer, self.updates, self.updates_results, customs)
|
||||
}
|
||||
|
||||
pub fn documents_addition<D>(&self) -> update::DocumentsAddition<D> {
|
||||
update::DocumentsAddition::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn documents_partial_addition<D>(&self) -> update::DocumentsAddition<D> {
|
||||
update::DocumentsAddition::new_partial(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn documents_deletion(&self) -> update::DocumentsDeletion {
|
||||
update::DocumentsDeletion::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn clear_all(&self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
update::push_clear_all(writer, self.updates, self.updates_results)
|
||||
}
|
||||
|
||||
pub fn synonyms_addition(&self) -> update::SynonymsAddition {
|
||||
update::SynonymsAddition::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn synonyms_deletion(&self) -> update::SynonymsDeletion {
|
||||
update::SynonymsDeletion::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stop_words_addition(&self) -> update::StopWordsAddition {
|
||||
update::StopWordsAddition::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stop_words_deletion(&self) -> update::StopWordsDeletion {
|
||||
update::StopWordsDeletion::new(
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
self.updates_notifier.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
|
||||
match self.updates.last_update_id(reader)? {
|
||||
Some((id, _)) => Ok(Some(id)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_status(
|
||||
&self,
|
||||
reader: &heed::RoTxn,
|
||||
update_id: u64,
|
||||
) -> MResult<Option<update::UpdateStatus>> {
|
||||
update::update_status(reader, self.updates, self.updates_results, update_id)
|
||||
}
|
||||
|
||||
pub fn all_updates_status(&self, reader: &heed::RoTxn) -> MResult<Vec<update::UpdateStatus>> {
|
||||
let mut updates = Vec::new();
|
||||
let mut last_update_result_id = 0;
|
||||
|
||||
// retrieve all updates results
|
||||
if let Some((last_id, _)) = self.updates_results.last_update_id(reader)? {
|
||||
updates.reserve(last_id as usize);
|
||||
|
||||
for id in 0..=last_id {
|
||||
if let Some(update) = self.update_status(reader, id)? {
|
||||
updates.push(update);
|
||||
last_update_result_id = id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// retrieve all enqueued updates
|
||||
if let Some((last_id, _)) = self.updates.last_update_id(reader)? {
|
||||
for id in last_update_result_id + 1..=last_id {
|
||||
if let Some(update) = self.update_status(reader, id)? {
|
||||
updates.push(update);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(updates)
|
||||
}
|
||||
|
||||
pub fn query_builder(&self) -> QueryBuilder {
|
||||
QueryBuilder::new(
|
||||
self.main,
|
||||
self.postings_lists,
|
||||
self.documents_fields_counts,
|
||||
self.synonyms,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn query_builder_with_criteria<'c, 'f, 'd>(
|
||||
&self,
|
||||
criteria: Criteria<'c>,
|
||||
) -> QueryBuilder<'c, 'f, 'd> {
|
||||
QueryBuilder::with_criteria(
|
||||
self.main,
|
||||
self.postings_lists,
|
||||
self.documents_fields_counts,
|
||||
self.synonyms,
|
||||
criteria,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create(
|
||||
env: &heed::Env,
|
||||
name: &str,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> MResult<Index> {
|
||||
// create all the store names
|
||||
let main_name = main_name(name);
|
||||
let postings_lists_name = postings_lists_name(name);
|
||||
let documents_fields_name = documents_fields_name(name);
|
||||
let documents_fields_counts_name = documents_fields_counts_name(name);
|
||||
let synonyms_name = synonyms_name(name);
|
||||
let docs_words_name = docs_words_name(name);
|
||||
let updates_name = updates_name(name);
|
||||
let updates_results_name = updates_results_name(name);
|
||||
|
||||
// open all the stores
|
||||
let main = env.create_poly_database(Some(&main_name))?;
|
||||
let postings_lists = env.create_database(Some(&postings_lists_name))?;
|
||||
let documents_fields = env.create_database(Some(&documents_fields_name))?;
|
||||
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
|
||||
let synonyms = env.create_database(Some(&synonyms_name))?;
|
||||
let docs_words = env.create_database(Some(&docs_words_name))?;
|
||||
let updates = env.create_database(Some(&updates_name))?;
|
||||
let updates_results = env.create_database(Some(&updates_results_name))?;
|
||||
|
||||
Ok(Index {
|
||||
main: Main { main },
|
||||
postings_lists: PostingsLists { postings_lists },
|
||||
documents_fields: DocumentsFields { documents_fields },
|
||||
documents_fields_counts: DocumentsFieldsCounts {
|
||||
documents_fields_counts,
|
||||
},
|
||||
synonyms: Synonyms { synonyms },
|
||||
docs_words: DocsWords { docs_words },
|
||||
updates: Updates { updates },
|
||||
updates_results: UpdatesResults { updates_results },
|
||||
updates_notifier,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open(
|
||||
env: &heed::Env,
|
||||
name: &str,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> MResult<Option<Index>> {
|
||||
// create all the store names
|
||||
let main_name = main_name(name);
|
||||
let postings_lists_name = postings_lists_name(name);
|
||||
let documents_fields_name = documents_fields_name(name);
|
||||
let documents_fields_counts_name = documents_fields_counts_name(name);
|
||||
let synonyms_name = synonyms_name(name);
|
||||
let docs_words_name = docs_words_name(name);
|
||||
let updates_name = updates_name(name);
|
||||
let updates_results_name = updates_results_name(name);
|
||||
|
||||
// open all the stores
|
||||
let main = match env.open_poly_database(Some(&main_name))? {
|
||||
Some(main) => main,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let postings_lists = match env.open_database(Some(&postings_lists_name))? {
|
||||
Some(postings_lists) => postings_lists,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let documents_fields = match env.open_database(Some(&documents_fields_name))? {
|
||||
Some(documents_fields) => documents_fields,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let documents_fields_counts = match env.open_database(Some(&documents_fields_counts_name))? {
|
||||
Some(documents_fields_counts) => documents_fields_counts,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let synonyms = match env.open_database(Some(&synonyms_name))? {
|
||||
Some(synonyms) => synonyms,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let docs_words = match env.open_database(Some(&docs_words_name))? {
|
||||
Some(docs_words) => docs_words,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let updates = match env.open_database(Some(&updates_name))? {
|
||||
Some(updates) => updates,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let updates_results = match env.open_database(Some(&updates_results_name))? {
|
||||
Some(updates_results) => updates_results,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
Ok(Some(Index {
|
||||
main: Main { main },
|
||||
postings_lists: PostingsLists { postings_lists },
|
||||
documents_fields: DocumentsFields { documents_fields },
|
||||
documents_fields_counts: DocumentsFieldsCounts {
|
||||
documents_fields_counts,
|
||||
},
|
||||
synonyms: Synonyms { synonyms },
|
||||
docs_words: DocsWords { docs_words },
|
||||
updates: Updates { updates },
|
||||
updates_results: UpdatesResults { updates_results },
|
||||
updates_notifier,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn clear(writer: &mut heed::RwTxn, index: &Index) -> MResult<()> {
|
||||
// clear all the stores
|
||||
index.main.clear(writer)?;
|
||||
index.postings_lists.clear(writer)?;
|
||||
index.documents_fields.clear(writer)?;
|
||||
index.documents_fields_counts.clear(writer)?;
|
||||
index.synonyms.clear(writer)?;
|
||||
index.docs_words.clear(writer)?;
|
||||
index.updates.clear(writer)?;
|
||||
index.updates_results.clear(writer)?;
|
||||
Ok(())
|
||||
}
|
41
meilisearch-core/src/store/postings_lists.rs
Normal file
41
meilisearch-core/src/store/postings_lists.rs
Normal file
|
@ -0,0 +1,41 @@
|
|||
use crate::DocIndex;
|
||||
use heed::types::{ByteSlice, CowSlice};
|
||||
use heed::Result as ZResult;
|
||||
use sdset::{Set, SetBuf};
|
||||
use std::borrow::Cow;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct PostingsLists {
|
||||
pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
|
||||
}
|
||||
|
||||
impl PostingsLists {
|
||||
pub fn put_postings_list(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
word: &[u8],
|
||||
words_indexes: &Set<DocIndex>,
|
||||
) -> ZResult<()> {
|
||||
self.postings_lists.put(writer, word, words_indexes)
|
||||
}
|
||||
|
||||
pub fn del_postings_list(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
|
||||
self.postings_lists.delete(writer, word)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.postings_lists.clear(writer)
|
||||
}
|
||||
|
||||
pub fn postings_list<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn,
|
||||
word: &[u8],
|
||||
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
|
||||
match self.postings_lists.get(reader, word)? {
|
||||
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
|
||||
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
40
meilisearch-core/src/store/synonyms.rs
Normal file
40
meilisearch-core/src/store/synonyms.rs
Normal file
|
@ -0,0 +1,40 @@
|
|||
use heed::types::ByteSlice;
|
||||
use heed::Result as ZResult;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Synonyms {
|
||||
pub(crate) synonyms: heed::Database<ByteSlice, ByteSlice>,
|
||||
}
|
||||
|
||||
impl Synonyms {
|
||||
pub fn put_synonyms(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
word: &[u8],
|
||||
synonyms: &fst::Set,
|
||||
) -> ZResult<()> {
|
||||
let bytes = synonyms.as_fst().as_bytes();
|
||||
self.synonyms.put(writer, word, bytes)
|
||||
}
|
||||
|
||||
pub fn del_synonyms(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
|
||||
self.synonyms.delete(writer, word)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.synonyms.clear(writer)
|
||||
}
|
||||
|
||||
pub fn synonyms(self, reader: &heed::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
|
||||
match self.synonyms.get(reader, word)? {
|
||||
Some(bytes) => {
|
||||
let len = bytes.len();
|
||||
let bytes = Arc::new(bytes.to_owned());
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
59
meilisearch-core/src/store/updates.rs
Normal file
59
meilisearch-core/src/store/updates.rs
Normal file
|
@ -0,0 +1,59 @@
|
|||
use super::BEU64;
|
||||
use crate::update::Update;
|
||||
use heed::types::{OwnedType, SerdeJson};
|
||||
use heed::Result as ZResult;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Updates {
|
||||
pub(crate) updates: heed::Database<OwnedType<BEU64>, SerdeJson<Update>>,
|
||||
}
|
||||
|
||||
impl Updates {
|
||||
// TODO do not trigger deserialize if possible
|
||||
pub fn last_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
|
||||
match self.updates.last(reader)? {
|
||||
Some((key, data)) => Ok(Some((key.get(), data))),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO do not trigger deserialize if possible
|
||||
fn first_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
|
||||
match self.updates.first(reader)? {
|
||||
Some((key, data)) => Ok(Some((key.get(), data))),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO do not trigger deserialize if possible
|
||||
pub fn get(self, reader: &heed::RoTxn, update_id: u64) -> ZResult<Option<Update>> {
|
||||
let update_id = BEU64::new(update_id);
|
||||
self.updates.get(reader, &update_id)
|
||||
}
|
||||
|
||||
pub fn put_update(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
update_id: u64,
|
||||
update: &Update,
|
||||
) -> ZResult<()> {
|
||||
// TODO prefer using serde_json?
|
||||
let update_id = BEU64::new(update_id);
|
||||
self.updates.put(writer, &update_id, update)
|
||||
}
|
||||
|
||||
pub fn pop_front(self, writer: &mut heed::RwTxn) -> ZResult<Option<(u64, Update)>> {
|
||||
match self.first_update_id(writer)? {
|
||||
Some((update_id, update)) => {
|
||||
let key = BEU64::new(update_id);
|
||||
self.updates.delete(writer, &key)?;
|
||||
Ok(Some((update_id, update)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.updates.clear(writer)
|
||||
}
|
||||
}
|
44
meilisearch-core/src/store/updates_results.rs
Normal file
44
meilisearch-core/src/store/updates_results.rs
Normal file
|
@ -0,0 +1,44 @@
|
|||
use super::BEU64;
|
||||
use crate::update::ProcessedUpdateResult;
|
||||
use heed::types::{OwnedType, SerdeJson};
|
||||
use heed::Result as ZResult;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct UpdatesResults {
|
||||
pub(crate) updates_results: heed::Database<OwnedType<BEU64>, SerdeJson<ProcessedUpdateResult>>,
|
||||
}
|
||||
|
||||
impl UpdatesResults {
|
||||
pub fn last_update_id(
|
||||
self,
|
||||
reader: &heed::RoTxn,
|
||||
) -> ZResult<Option<(u64, ProcessedUpdateResult)>> {
|
||||
match self.updates_results.last(reader)? {
|
||||
Some((key, data)) => Ok(Some((key.get(), data))),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_update_result(
|
||||
self,
|
||||
writer: &mut heed::RwTxn,
|
||||
update_id: u64,
|
||||
update_result: &ProcessedUpdateResult,
|
||||
) -> ZResult<()> {
|
||||
let update_id = BEU64::new(update_id);
|
||||
self.updates_results.put(writer, &update_id, update_result)
|
||||
}
|
||||
|
||||
pub fn update_result(
|
||||
self,
|
||||
reader: &heed::RoTxn,
|
||||
update_id: u64,
|
||||
) -> ZResult<Option<ProcessedUpdateResult>> {
|
||||
let update_id = BEU64::new(update_id);
|
||||
self.updates_results.get(reader, &update_id)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.updates_results.clear(writer)
|
||||
}
|
||||
}
|
33
meilisearch-core/src/update/clear_all.rs
Normal file
33
meilisearch-core/src/update/clear_all.rs
Normal file
|
@ -0,0 +1,33 @@
|
|||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult, RankedMap};
|
||||
|
||||
pub fn apply_clear_all(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
) -> MResult<()> {
|
||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
||||
main_store.put_ranked_map(writer, &RankedMap::default())?;
|
||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
||||
documents_fields_store.clear(writer)?;
|
||||
documents_fields_counts_store.clear(writer)?;
|
||||
postings_lists_store.clear(writer)?;
|
||||
docs_words_store.clear(writer)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn push_clear_all(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
let update = Update::clear_all();
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
25
meilisearch-core/src/update/customs_update.rs
Normal file
25
meilisearch-core/src/update/customs_update.rs
Normal file
|
@ -0,0 +1,25 @@
|
|||
use crate::store;
|
||||
use crate::update::{next_update_id, Update};
|
||||
use heed::Result as ZResult;
|
||||
|
||||
pub fn apply_customs_update(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
customs: &[u8],
|
||||
) -> ZResult<()> {
|
||||
main_store.put_customs(writer, customs)
|
||||
}
|
||||
|
||||
pub fn push_customs_update(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
customs: Vec<u8>,
|
||||
) -> ZResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::customs(customs);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
410
meilisearch-core/src/update/documents_addition.rs
Normal file
410
meilisearch-core/src/update/documents_addition.rs
Normal file
|
@ -0,0 +1,410 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
use sdset::{duo::Union, SetOperation};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::serde::{extract_document_id, serialize_value, Deserializer, Serializer};
|
||||
use crate::store;
|
||||
use crate::update::{apply_documents_deletion, next_update_id, Update};
|
||||
use crate::{Error, MResult, RankedMap};
|
||||
|
||||
pub struct DocumentsAddition<D> {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
documents: Vec<D>,
|
||||
is_partial: bool,
|
||||
}
|
||||
|
||||
impl<D> DocumentsAddition<D> {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> DocumentsAddition<D> {
|
||||
DocumentsAddition {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
documents: Vec::new(),
|
||||
is_partial: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_partial(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> DocumentsAddition<D> {
|
||||
DocumentsAddition {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
documents: Vec::new(),
|
||||
is_partial: true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_document(&mut self, document: D) {
|
||||
self.documents.push(document);
|
||||
}
|
||||
|
||||
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64>
|
||||
where
|
||||
D: serde::Serialize,
|
||||
{
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
let update_id = push_documents_addition(
|
||||
writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.documents,
|
||||
self.is_partial,
|
||||
)?;
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<D> Extend<D> for DocumentsAddition<D> {
|
||||
fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
|
||||
self.documents.extend(iter)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_documents_addition<D: serde::Serialize>(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
addition: Vec<D>,
|
||||
is_partial: bool,
|
||||
) -> MResult<u64> {
|
||||
let mut values = Vec::with_capacity(addition.len());
|
||||
for add in addition {
|
||||
let vec = serde_json::to_vec(&add)?;
|
||||
let add = serde_json::from_slice(&vec)?;
|
||||
values.push(add);
|
||||
}
|
||||
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = if is_partial {
|
||||
Update::documents_partial(values)
|
||||
} else {
|
||||
Update::documents_addition(values)
|
||||
};
|
||||
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_documents_addition<'a, 'b>(
|
||||
writer: &'a mut heed::RwTxn<'b>,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
addition: Vec<HashMap<String, serde_json::Value>>,
|
||||
) -> MResult<()> {
|
||||
let mut documents_additions = HashMap::new();
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
let identifier = schema.identifier_name();
|
||||
|
||||
// 1. store documents ids for future deletion
|
||||
for document in addition {
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
documents_additions.insert(document_id, document);
|
||||
}
|
||||
|
||||
// 2. remove the documents posting lists
|
||||
let number_of_inserted_documents = documents_additions.len();
|
||||
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
||||
apply_documents_deletion(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
documents_ids,
|
||||
)?;
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
||||
Some(stop_words) => stop_words,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
// 3. index the documents fields in the stores
|
||||
let mut indexer = RawIndexer::new(stop_words);
|
||||
|
||||
for (document_id, document) in documents_additions {
|
||||
let serializer = Serializer {
|
||||
txn: writer,
|
||||
schema: &schema,
|
||||
document_store: documents_fields_store,
|
||||
document_fields_counts: documents_fields_counts_store,
|
||||
indexer: &mut indexer,
|
||||
ranked_map: &mut ranked_map,
|
||||
document_id,
|
||||
};
|
||||
|
||||
document.serialize(serializer)?;
|
||||
}
|
||||
|
||||
write_documents_addition_index(
|
||||
writer,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
&ranked_map,
|
||||
number_of_inserted_documents,
|
||||
indexer,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn apply_documents_partial_addition<'a, 'b>(
|
||||
writer: &'a mut heed::RwTxn<'b>,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
addition: Vec<HashMap<String, serde_json::Value>>,
|
||||
) -> MResult<()> {
|
||||
let mut documents_additions = HashMap::new();
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
let identifier = schema.identifier_name();
|
||||
|
||||
// 1. store documents ids for future deletion
|
||||
for mut document in addition {
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
let mut deserializer = Deserializer {
|
||||
document_id,
|
||||
reader: writer,
|
||||
documents_fields: documents_fields_store,
|
||||
schema: &schema,
|
||||
attributes: None,
|
||||
};
|
||||
|
||||
// retrieve the old document and
|
||||
// update the new one with missing keys found in the old one
|
||||
let result = Option::<HashMap<String, serde_json::Value>>::deserialize(&mut deserializer)?;
|
||||
if let Some(old_document) = result {
|
||||
for (key, value) in old_document {
|
||||
document.entry(key).or_insert(value);
|
||||
}
|
||||
}
|
||||
|
||||
documents_additions.insert(document_id, document);
|
||||
}
|
||||
|
||||
// 2. remove the documents posting lists
|
||||
let number_of_inserted_documents = documents_additions.len();
|
||||
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
||||
apply_documents_deletion(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
documents_ids,
|
||||
)?;
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
||||
Some(stop_words) => stop_words,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
// 3. index the documents fields in the stores
|
||||
let mut indexer = RawIndexer::new(stop_words);
|
||||
|
||||
for (document_id, document) in documents_additions {
|
||||
let serializer = Serializer {
|
||||
txn: writer,
|
||||
schema: &schema,
|
||||
document_store: documents_fields_store,
|
||||
document_fields_counts: documents_fields_counts_store,
|
||||
indexer: &mut indexer,
|
||||
ranked_map: &mut ranked_map,
|
||||
document_id,
|
||||
};
|
||||
|
||||
document.serialize(serializer)?;
|
||||
}
|
||||
|
||||
write_documents_addition_index(
|
||||
writer,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
&ranked_map,
|
||||
number_of_inserted_documents,
|
||||
indexer,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn reindex_all_documents(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
) -> MResult<()> {
|
||||
let schema = match main_store.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
let mut ranked_map = RankedMap::default();
|
||||
|
||||
// 1. retrieve all documents ids
|
||||
let mut documents_ids_to_reindex = Vec::new();
|
||||
for result in documents_fields_counts_store.documents_ids(writer)? {
|
||||
let document_id = result?;
|
||||
documents_ids_to_reindex.push(document_id);
|
||||
}
|
||||
|
||||
// 2. remove the documents posting lists
|
||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
||||
postings_lists_store.clear(writer)?;
|
||||
docs_words_store.clear(writer)?;
|
||||
|
||||
// 3. re-index chunks of documents (otherwise we make the borrow checker unhappy)
|
||||
for documents_ids in documents_ids_to_reindex.chunks(100) {
|
||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
||||
Some(stop_words) => stop_words,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
let number_of_inserted_documents = documents_ids.len();
|
||||
let mut indexer = RawIndexer::new(stop_words);
|
||||
let mut ram_store = HashMap::new();
|
||||
|
||||
for document_id in documents_ids {
|
||||
for result in documents_fields_store.document_fields(writer, *document_id)? {
|
||||
let (attr, bytes) = result?;
|
||||
let value: serde_json::Value = serde_json::from_slice(bytes)?;
|
||||
ram_store.insert((document_id, attr), value);
|
||||
}
|
||||
|
||||
for ((docid, attr), value) in ram_store.drain() {
|
||||
serialize_value(
|
||||
writer,
|
||||
attr,
|
||||
schema.props(attr),
|
||||
*docid,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
&mut indexer,
|
||||
&mut ranked_map,
|
||||
&value,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
// 4. write the new index in the main store
|
||||
write_documents_addition_index(
|
||||
writer,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
&ranked_map,
|
||||
number_of_inserted_documents,
|
||||
indexer,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_documents_addition_index(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
ranked_map: &RankedMap,
|
||||
number_of_inserted_documents: usize,
|
||||
indexer: RawIndexer,
|
||||
) -> MResult<()> {
|
||||
let indexed = indexer.build();
|
||||
let mut delta_words_builder = SetBuilder::memory();
|
||||
|
||||
for (word, delta_set) in indexed.words_doc_indexes {
|
||||
delta_words_builder.insert(&word).unwrap();
|
||||
|
||||
let set = match postings_lists_store.postings_list(writer, &word)? {
|
||||
Some(set) => Union::new(&set, &delta_set).into_set_buf(),
|
||||
None => delta_set,
|
||||
};
|
||||
|
||||
postings_lists_store.put_postings_list(writer, &word, &set)?;
|
||||
}
|
||||
|
||||
for (id, words) in indexed.docs_words {
|
||||
docs_words_store.put_doc_words(writer, id, &words)?;
|
||||
}
|
||||
|
||||
let delta_words = delta_words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let words = match main_store.words_fst(writer)? {
|
||||
Some(words) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(words.stream())
|
||||
.add(delta_words.stream())
|
||||
.r#union();
|
||||
|
||||
let mut words_builder = SetBuilder::memory();
|
||||
words_builder.extend_stream(op).unwrap();
|
||||
words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
}
|
||||
None => delta_words,
|
||||
};
|
||||
|
||||
main_store.put_words_fst(writer, &words)?;
|
||||
main_store.put_ranked_map(writer, ranked_map)?;
|
||||
main_store.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
|
||||
|
||||
Ok(())
|
||||
}
|
192
meilisearch-core/src/update/documents_deletion.rs
Normal file
192
meilisearch-core/src/update/documents_deletion.rs
Normal file
|
@ -0,0 +1,192 @@
|
|||
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||
|
||||
use fst::{SetBuilder, Streamer};
|
||||
use meilisearch_schema::Schema;
|
||||
use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
|
||||
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::serde::extract_document_id;
|
||||
use crate::store;
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{DocumentId, Error, MResult, RankedMap};
|
||||
|
||||
pub struct DocumentsDeletion {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
documents: Vec<DocumentId>,
|
||||
}
|
||||
|
||||
impl DocumentsDeletion {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> DocumentsDeletion {
|
||||
DocumentsDeletion {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
documents: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete_document_by_id(&mut self, document_id: DocumentId) {
|
||||
self.documents.push(document_id);
|
||||
}
|
||||
|
||||
pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()>
|
||||
where
|
||||
D: serde::Serialize,
|
||||
{
|
||||
let identifier = schema.identifier_name();
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
self.delete_document_by_id(document_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
let update_id = push_documents_deletion(
|
||||
writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.documents,
|
||||
)?;
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl Extend<DocumentId> for DocumentsDeletion {
|
||||
fn extend<T: IntoIterator<Item = DocumentId>>(&mut self, iter: T) {
|
||||
self.documents.extend(iter)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_documents_deletion(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
deletion: Vec<DocumentId>,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::documents_deletion(deletion);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_documents_deletion(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
deletion: Vec<DocumentId>,
|
||||
) -> MResult<()> {
|
||||
let idset = SetBuf::from_dirty(deletion);
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
// collect the ranked attributes according to the schema
|
||||
let ranked_attrs: Vec<_> = schema
|
||||
.iter()
|
||||
.filter_map(
|
||||
|(_, attr, prop)| {
|
||||
if prop.is_ranked() {
|
||||
Some(attr)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
)
|
||||
.collect();
|
||||
|
||||
let mut words_document_ids = HashMap::new();
|
||||
for id in idset {
|
||||
// remove all the ranked attributes from the ranked_map
|
||||
for ranked_attr in &ranked_attrs {
|
||||
ranked_map.remove(id, *ranked_attr);
|
||||
}
|
||||
|
||||
if let Some(words) = docs_words_store.doc_words(writer, id)? {
|
||||
let mut stream = words.stream();
|
||||
while let Some(word) = stream.next() {
|
||||
let word = word.to_vec();
|
||||
words_document_ids
|
||||
.entry(word)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut deleted_documents = HashSet::new();
|
||||
let mut removed_words = BTreeSet::new();
|
||||
for (word, document_ids) in words_document_ids {
|
||||
let document_ids = SetBuf::from_dirty(document_ids);
|
||||
|
||||
if let Some(doc_indexes) = postings_lists_store.postings_list(writer, &word)? {
|
||||
let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id);
|
||||
let doc_indexes = op.into_set_buf();
|
||||
|
||||
if !doc_indexes.is_empty() {
|
||||
postings_lists_store.put_postings_list(writer, &word, &doc_indexes)?;
|
||||
} else {
|
||||
postings_lists_store.del_postings_list(writer, &word)?;
|
||||
removed_words.insert(word);
|
||||
}
|
||||
}
|
||||
|
||||
for id in document_ids {
|
||||
documents_fields_counts_store.del_all_document_fields_counts(writer, id)?;
|
||||
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
||||
deleted_documents.insert(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let deleted_documents_len = deleted_documents.len() as u64;
|
||||
for id in deleted_documents {
|
||||
docs_words_store.del_doc_words(writer, id)?;
|
||||
}
|
||||
|
||||
let removed_words = fst::Set::from_iter(removed_words).unwrap();
|
||||
let words = match main_store.words_fst(writer)? {
|
||||
Some(words_set) => {
|
||||
let op = fst::set::OpBuilder::new()
|
||||
.add(words_set.stream())
|
||||
.add(removed_words.stream())
|
||||
.difference();
|
||||
|
||||
let mut words_builder = SetBuilder::memory();
|
||||
words_builder.extend_stream(op).unwrap();
|
||||
words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
}
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
main_store.put_words_fst(writer, &words)?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
||||
|
||||
Ok(())
|
||||
}
|
420
meilisearch-core/src/update/mod.rs
Normal file
420
meilisearch-core/src/update/mod.rs
Normal file
|
@ -0,0 +1,420 @@
|
|||
mod clear_all;
|
||||
mod customs_update;
|
||||
mod documents_addition;
|
||||
mod documents_deletion;
|
||||
mod schema_update;
|
||||
mod stop_words_addition;
|
||||
mod stop_words_deletion;
|
||||
mod synonyms_addition;
|
||||
mod synonyms_deletion;
|
||||
|
||||
pub use self::clear_all::{apply_clear_all, push_clear_all};
|
||||
pub use self::customs_update::{apply_customs_update, push_customs_update};
|
||||
pub use self::documents_addition::{
|
||||
apply_documents_addition, apply_documents_partial_addition, DocumentsAddition,
|
||||
};
|
||||
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
||||
pub use self::schema_update::{apply_schema_update, push_schema_update};
|
||||
pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
|
||||
pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion};
|
||||
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
|
||||
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
|
||||
|
||||
use std::cmp;
|
||||
use std::collections::{BTreeMap, BTreeSet, HashMap};
|
||||
use std::time::Instant;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use heed::Result as ZResult;
|
||||
use log::debug;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{store, DocumentId, MResult};
|
||||
use meilisearch_schema::Schema;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Update {
|
||||
data: UpdateData,
|
||||
enqueued_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Update {
|
||||
fn clear_all() -> Update {
|
||||
Update {
|
||||
data: UpdateData::ClearAll,
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn schema(data: Schema) -> Update {
|
||||
Update {
|
||||
data: UpdateData::Schema(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn customs(data: Vec<u8>) -> Update {
|
||||
Update {
|
||||
data: UpdateData::Customs(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn documents_addition(data: Vec<HashMap<String, serde_json::Value>>) -> Update {
|
||||
Update {
|
||||
data: UpdateData::DocumentsAddition(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn documents_partial(data: Vec<HashMap<String, serde_json::Value>>) -> Update {
|
||||
Update {
|
||||
data: UpdateData::DocumentsPartial(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn documents_deletion(data: Vec<DocumentId>) -> Update {
|
||||
Update {
|
||||
data: UpdateData::DocumentsDeletion(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn synonyms_addition(data: BTreeMap<String, Vec<String>>) -> Update {
|
||||
Update {
|
||||
data: UpdateData::SynonymsAddition(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn synonyms_deletion(data: BTreeMap<String, Option<Vec<String>>>) -> Update {
|
||||
Update {
|
||||
data: UpdateData::SynonymsDeletion(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn stop_words_addition(data: BTreeSet<String>) -> Update {
|
||||
Update {
|
||||
data: UpdateData::StopWordsAddition(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn stop_words_deletion(data: BTreeSet<String>) -> Update {
|
||||
Update {
|
||||
data: UpdateData::StopWordsDeletion(data),
|
||||
enqueued_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum UpdateData {
|
||||
ClearAll,
|
||||
Schema(Schema),
|
||||
Customs(Vec<u8>),
|
||||
DocumentsAddition(Vec<HashMap<String, serde_json::Value>>),
|
||||
DocumentsPartial(Vec<HashMap<String, serde_json::Value>>),
|
||||
DocumentsDeletion(Vec<DocumentId>),
|
||||
SynonymsAddition(BTreeMap<String, Vec<String>>),
|
||||
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
|
||||
StopWordsAddition(BTreeSet<String>),
|
||||
StopWordsDeletion(BTreeSet<String>),
|
||||
}
|
||||
|
||||
impl UpdateData {
|
||||
pub fn update_type(&self) -> UpdateType {
|
||||
match self {
|
||||
UpdateData::ClearAll => UpdateType::ClearAll,
|
||||
UpdateData::Schema(_) => UpdateType::Schema,
|
||||
UpdateData::Customs(_) => UpdateType::Customs,
|
||||
UpdateData::DocumentsAddition(addition) => UpdateType::DocumentsAddition {
|
||||
number: addition.len(),
|
||||
},
|
||||
UpdateData::DocumentsPartial(addition) => UpdateType::DocumentsPartial {
|
||||
number: addition.len(),
|
||||
},
|
||||
UpdateData::DocumentsDeletion(deletion) => UpdateType::DocumentsDeletion {
|
||||
number: deletion.len(),
|
||||
},
|
||||
UpdateData::SynonymsAddition(addition) => UpdateType::SynonymsAddition {
|
||||
number: addition.len(),
|
||||
},
|
||||
UpdateData::SynonymsDeletion(deletion) => UpdateType::SynonymsDeletion {
|
||||
number: deletion.len(),
|
||||
},
|
||||
UpdateData::StopWordsAddition(addition) => UpdateType::StopWordsAddition {
|
||||
number: addition.len(),
|
||||
},
|
||||
UpdateData::StopWordsDeletion(deletion) => UpdateType::StopWordsDeletion {
|
||||
number: deletion.len(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "name")]
|
||||
pub enum UpdateType {
|
||||
ClearAll,
|
||||
Schema,
|
||||
Customs,
|
||||
DocumentsAddition { number: usize },
|
||||
DocumentsPartial { number: usize },
|
||||
DocumentsDeletion { number: usize },
|
||||
SynonymsAddition { number: usize },
|
||||
SynonymsDeletion { number: usize },
|
||||
StopWordsAddition { number: usize },
|
||||
StopWordsDeletion { number: usize },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ProcessedUpdateResult {
|
||||
pub update_id: u64,
|
||||
#[serde(rename = "type")]
|
||||
pub update_type: UpdateType,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub error: Option<String>,
|
||||
pub duration: f64, // in seconds
|
||||
pub enqueued_at: DateTime<Utc>,
|
||||
pub processed_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct EnqueuedUpdateResult {
|
||||
pub update_id: u64,
|
||||
pub update_type: UpdateType,
|
||||
pub enqueued_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase", tag = "status")]
|
||||
pub enum UpdateStatus {
|
||||
Enqueued {
|
||||
#[serde(flatten)]
|
||||
content: EnqueuedUpdateResult,
|
||||
},
|
||||
Processed {
|
||||
#[serde(flatten)]
|
||||
content: ProcessedUpdateResult,
|
||||
},
|
||||
}
|
||||
|
||||
pub fn update_status(
|
||||
reader: &heed::RoTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
update_id: u64,
|
||||
) -> MResult<Option<UpdateStatus>> {
|
||||
match updates_results_store.update_result(reader, update_id)? {
|
||||
Some(result) => Ok(Some(UpdateStatus::Processed { content: result })),
|
||||
None => match updates_store.get(reader, update_id)? {
|
||||
Some(update) => Ok(Some(UpdateStatus::Enqueued {
|
||||
content: EnqueuedUpdateResult {
|
||||
update_id,
|
||||
update_type: update.data.update_type(),
|
||||
enqueued_at: update.enqueued_at,
|
||||
},
|
||||
})),
|
||||
None => Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn next_update_id(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
) -> ZResult<u64> {
|
||||
let last_update_id = updates_store.last_update_id(writer)?;
|
||||
let last_update_id = last_update_id.map(|(n, _)| n);
|
||||
|
||||
let last_update_results_id = updates_results_store.last_update_id(writer)?;
|
||||
let last_update_results_id = last_update_results_id.map(|(n, _)| n);
|
||||
|
||||
let max_update_id = cmp::max(last_update_id, last_update_results_id);
|
||||
let new_update_id = max_update_id.map_or(0, |n| n + 1);
|
||||
|
||||
Ok(new_update_id)
|
||||
}
|
||||
|
||||
pub fn update_task<'a, 'b>(
|
||||
writer: &'a mut heed::RwTxn<'b>,
|
||||
index: store::Index,
|
||||
update_id: u64,
|
||||
update: Update,
|
||||
) -> MResult<ProcessedUpdateResult> {
|
||||
debug!("Processing update number {}", update_id);
|
||||
|
||||
let Update { enqueued_at, data } = update;
|
||||
|
||||
let (update_type, result, duration) = match data {
|
||||
UpdateData::ClearAll => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::ClearAll;
|
||||
let result = apply_clear_all(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::Schema(schema) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::Schema;
|
||||
let result = apply_schema_update(
|
||||
writer,
|
||||
&schema,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::Customs(customs) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::Customs;
|
||||
let result = apply_customs_update(writer, index.main, &customs).map_err(Into::into);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::DocumentsAddition(documents) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::DocumentsAddition {
|
||||
number: documents.len(),
|
||||
};
|
||||
|
||||
let result = apply_documents_addition(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
documents,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::DocumentsPartial(documents) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::DocumentsPartial {
|
||||
number: documents.len(),
|
||||
};
|
||||
|
||||
let result = apply_documents_partial_addition(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
documents,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::DocumentsDeletion(documents) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::DocumentsDeletion {
|
||||
number: documents.len(),
|
||||
};
|
||||
|
||||
let result = apply_documents_deletion(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
documents,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::SynonymsAddition(synonyms) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::SynonymsAddition {
|
||||
number: synonyms.len(),
|
||||
};
|
||||
|
||||
let result = apply_synonyms_addition(writer, index.main, index.synonyms, synonyms);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::SynonymsDeletion(synonyms) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::SynonymsDeletion {
|
||||
number: synonyms.len(),
|
||||
};
|
||||
|
||||
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::StopWordsAddition(stop_words) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::StopWordsAddition {
|
||||
number: stop_words.len(),
|
||||
};
|
||||
|
||||
let result =
|
||||
apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
UpdateData::StopWordsDeletion(stop_words) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::StopWordsDeletion {
|
||||
number: stop_words.len(),
|
||||
};
|
||||
|
||||
let result = apply_stop_words_deletion(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
stop_words,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Processed update number {} {:?} {:?}",
|
||||
update_id, update_type, result
|
||||
);
|
||||
|
||||
let status = ProcessedUpdateResult {
|
||||
update_id,
|
||||
update_type,
|
||||
error: result.map_err(|e| e.to_string()).err(),
|
||||
duration: duration.as_secs_f64(),
|
||||
enqueued_at,
|
||||
processed_at: Utc::now(),
|
||||
};
|
||||
|
||||
Ok(status)
|
||||
}
|
75
meilisearch-core/src/update/schema_update.rs
Normal file
75
meilisearch-core/src/update/schema_update.rs
Normal file
|
@ -0,0 +1,75 @@
|
|||
use meilisearch_schema::{Diff, Schema};
|
||||
|
||||
use crate::update::documents_addition::reindex_all_documents;
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{error::UnsupportedOperation, store, MResult};
|
||||
|
||||
pub fn apply_schema_update(
|
||||
writer: &mut heed::RwTxn,
|
||||
new_schema: &Schema,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
) -> MResult<()> {
|
||||
use UnsupportedOperation::{
|
||||
CanOnlyIntroduceNewSchemaAttributesAtEnd, CannotRemoveSchemaAttribute,
|
||||
CannotReorderSchemaAttribute, CannotUpdateSchemaIdentifier,
|
||||
};
|
||||
|
||||
let mut need_full_reindexing = false;
|
||||
|
||||
if let Some(old_schema) = main_store.schema(writer)? {
|
||||
for diff in meilisearch_schema::diff(&old_schema, new_schema) {
|
||||
match diff {
|
||||
Diff::IdentChange { .. } => return Err(CannotUpdateSchemaIdentifier.into()),
|
||||
Diff::AttrMove { .. } => return Err(CannotReorderSchemaAttribute.into()),
|
||||
Diff::AttrPropsChange { old, new, .. } => {
|
||||
if new.indexed != old.indexed {
|
||||
need_full_reindexing = true;
|
||||
}
|
||||
if new.ranked != old.ranked {
|
||||
need_full_reindexing = true;
|
||||
}
|
||||
}
|
||||
Diff::NewAttr { pos, .. } => {
|
||||
// new attribute not at the end of the schema
|
||||
if pos < old_schema.number_of_attributes() {
|
||||
return Err(CanOnlyIntroduceNewSchemaAttributesAtEnd.into());
|
||||
}
|
||||
}
|
||||
Diff::RemovedAttr { .. } => return Err(CannotRemoveSchemaAttribute.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
main_store.put_schema(writer, new_schema)?;
|
||||
|
||||
if need_full_reindexing {
|
||||
reindex_all_documents(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
)?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn push_schema_update(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
schema: Schema,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::schema(schema);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
117
meilisearch-core/src/update/stop_words_addition.rs
Normal file
117
meilisearch-core/src/update/stop_words_addition.rs
Normal file
|
@ -0,0 +1,117 @@
|
|||
use std::collections::BTreeSet;
|
||||
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult};
|
||||
|
||||
pub struct StopWordsAddition {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
stop_words: BTreeSet<String>,
|
||||
}
|
||||
|
||||
impl StopWordsAddition {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> StopWordsAddition {
|
||||
StopWordsAddition {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
stop_words: BTreeSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
|
||||
let stop_word = normalize_str(stop_word.as_ref());
|
||||
self.stop_words.insert(stop_word);
|
||||
}
|
||||
|
||||
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
let update_id = push_stop_words_addition(
|
||||
writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.stop_words,
|
||||
)?;
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_stop_words_addition(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
addition: BTreeSet<String>,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::stop_words_addition(addition);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_stop_words_addition(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
addition: BTreeSet<String>,
|
||||
) -> MResult<()> {
|
||||
let mut stop_words_builder = SetBuilder::memory();
|
||||
|
||||
for word in addition {
|
||||
stop_words_builder.insert(&word).unwrap();
|
||||
// we remove every posting list associated to a new stop word
|
||||
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
|
||||
}
|
||||
|
||||
// create the new delta stop words fst
|
||||
let delta_stop_words = stop_words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
// we also need to remove all the stop words from the main fst
|
||||
if let Some(word_fst) = main_store.words_fst(writer)? {
|
||||
let op = OpBuilder::new()
|
||||
.add(&word_fst)
|
||||
.add(&delta_stop_words)
|
||||
.difference();
|
||||
|
||||
let mut word_fst_builder = SetBuilder::memory();
|
||||
word_fst_builder.extend_stream(op).unwrap();
|
||||
let word_fst = word_fst_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
main_store.put_words_fst(writer, &word_fst)?;
|
||||
}
|
||||
|
||||
// now we add all of these stop words from the main store
|
||||
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
|
||||
|
||||
let op = OpBuilder::new()
|
||||
.add(&stop_words_fst)
|
||||
.add(&delta_stop_words)
|
||||
.r#union();
|
||||
|
||||
let mut stop_words_builder = SetBuilder::memory();
|
||||
stop_words_builder.extend_stream(op).unwrap();
|
||||
let stop_words_fst = stop_words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
|
||||
|
||||
Ok(())
|
||||
}
|
113
meilisearch-core/src/update/stop_words_deletion.rs
Normal file
113
meilisearch-core/src/update/stop_words_deletion.rs
Normal file
|
@ -0,0 +1,113 @@
|
|||
use std::collections::BTreeSet;
|
||||
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::update::documents_addition::reindex_all_documents;
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult};
|
||||
|
||||
pub struct StopWordsDeletion {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
stop_words: BTreeSet<String>,
|
||||
}
|
||||
|
||||
impl StopWordsDeletion {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> StopWordsDeletion {
|
||||
StopWordsDeletion {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
stop_words: BTreeSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
|
||||
let stop_word = normalize_str(stop_word.as_ref());
|
||||
self.stop_words.insert(stop_word);
|
||||
}
|
||||
|
||||
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
let update_id = push_stop_words_deletion(
|
||||
writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.stop_words,
|
||||
)?;
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_stop_words_deletion(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
deletion: BTreeSet<String>,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::stop_words_deletion(deletion);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_stop_words_deletion(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
deletion: BTreeSet<String>,
|
||||
) -> MResult<()> {
|
||||
let mut stop_words_builder = SetBuilder::memory();
|
||||
|
||||
for word in deletion {
|
||||
stop_words_builder.insert(&word).unwrap();
|
||||
}
|
||||
|
||||
// create the new delta stop words fst
|
||||
let delta_stop_words = stop_words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
// now we delete all of these stop words from the main store
|
||||
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
|
||||
|
||||
let op = OpBuilder::new()
|
||||
.add(&stop_words_fst)
|
||||
.add(&delta_stop_words)
|
||||
.difference();
|
||||
|
||||
let mut stop_words_builder = SetBuilder::memory();
|
||||
stop_words_builder.extend_stream(op).unwrap();
|
||||
let stop_words_fst = stop_words_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
|
||||
|
||||
// now that we have setup the stop words
|
||||
// lets reindex everything...
|
||||
reindex_all_documents(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
119
meilisearch-core/src/update/synonyms_addition.rs
Normal file
119
meilisearch-core/src/update/synonyms_addition.rs
Normal file
|
@ -0,0 +1,119 @@
|
|||
use std::collections::BTreeMap;
|
||||
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult};
|
||||
|
||||
pub struct SynonymsAddition {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
synonyms: BTreeMap<String, Vec<String>>,
|
||||
}
|
||||
|
||||
impl SynonymsAddition {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> SynonymsAddition {
|
||||
SynonymsAddition {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
synonyms: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where
|
||||
S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: IntoIterator<Item = T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
|
||||
self.synonyms
|
||||
.entry(synonym)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend(alternatives);
|
||||
}
|
||||
|
||||
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
let update_id = push_synonyms_addition(
|
||||
writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.synonyms,
|
||||
)?;
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_synonyms_addition(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
addition: BTreeMap<String, Vec<String>>,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::synonyms_addition(addition);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_synonyms_addition(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
addition: BTreeMap<String, Vec<String>>,
|
||||
) -> MResult<()> {
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
|
||||
for (word, alternatives) in addition {
|
||||
synonyms_builder.insert(&word).unwrap();
|
||||
|
||||
let alternatives = {
|
||||
let alternatives = SetBuf::from_dirty(alternatives);
|
||||
let mut alternatives_builder = SetBuilder::memory();
|
||||
alternatives_builder.extend_iter(alternatives).unwrap();
|
||||
let bytes = alternatives_builder.into_inner().unwrap();
|
||||
fst::Set::from_bytes(bytes).unwrap()
|
||||
};
|
||||
|
||||
synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
|
||||
}
|
||||
|
||||
let delta_synonyms = synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let synonyms = match main_store.synonyms_fst(writer)? {
|
||||
Some(synonyms) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(synonyms.stream())
|
||||
.add(delta_synonyms.stream())
|
||||
.r#union();
|
||||
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
synonyms_builder.extend_stream(op).unwrap();
|
||||
synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
}
|
||||
None => delta_synonyms,
|
||||
};
|
||||
|
||||
main_store.put_synonyms_fst(writer, &synonyms)?;
|
||||
|
||||
Ok(())
|
||||
}
|
157
meilisearch-core/src/update/synonyms_deletion.rs
Normal file
157
meilisearch-core/src/update/synonyms_deletion.rs
Normal file
|
@ -0,0 +1,157 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult};
|
||||
|
||||
pub struct SynonymsDeletion {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
synonyms: BTreeMap<String, Option<Vec<String>>>,
|
||||
}
|
||||
|
||||
impl SynonymsDeletion {
|
||||
pub fn new(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: UpdateEventsEmitter,
|
||||
) -> SynonymsDeletion {
|
||||
SynonymsDeletion {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
updates_notifier,
|
||||
synonyms: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete_all_alternatives_of<S: AsRef<str>>(&mut self, synonym: S) {
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
self.synonyms.insert(synonym, None);
|
||||
}
|
||||
|
||||
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where
|
||||
S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: Iterator<Item = T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let value = self.synonyms.entry(synonym).or_insert(None);
|
||||
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
|
||||
match value {
|
||||
Some(v) => v.extend(alternatives),
|
||||
None => *value = Some(Vec::from_iter(alternatives)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||
let update_id = push_synonyms_deletion(
|
||||
writer,
|
||||
self.updates_store,
|
||||
self.updates_results_store,
|
||||
self.synonyms,
|
||||
)?;
|
||||
Ok(update_id)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_synonyms_deletion(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
deletion: BTreeMap<String, Option<Vec<String>>>,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::synonyms_deletion(deletion);
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
||||
|
||||
pub fn apply_synonyms_deletion(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
deletion: BTreeMap<String, Option<Vec<String>>>,
|
||||
) -> MResult<()> {
|
||||
let mut delete_whole_synonym_builder = SetBuilder::memory();
|
||||
|
||||
for (synonym, alternatives) in deletion {
|
||||
match alternatives {
|
||||
Some(alternatives) => {
|
||||
let prev_alternatives = synonyms_store.synonyms(writer, synonym.as_bytes())?;
|
||||
let prev_alternatives = match prev_alternatives {
|
||||
Some(alternatives) => alternatives,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let delta_alternatives = {
|
||||
let alternatives = SetBuf::from_dirty(alternatives);
|
||||
let mut builder = SetBuilder::memory();
|
||||
builder.extend_iter(alternatives).unwrap();
|
||||
builder.into_inner().and_then(fst::Set::from_bytes).unwrap()
|
||||
};
|
||||
|
||||
let op = OpBuilder::new()
|
||||
.add(prev_alternatives.stream())
|
||||
.add(delta_alternatives.stream())
|
||||
.difference();
|
||||
|
||||
let (alternatives, empty_alternatives) = {
|
||||
let mut builder = SetBuilder::memory();
|
||||
let len = builder.get_ref().len();
|
||||
builder.extend_stream(op).unwrap();
|
||||
let is_empty = len == builder.get_ref().len();
|
||||
let bytes = builder.into_inner().unwrap();
|
||||
let alternatives = fst::Set::from_bytes(bytes).unwrap();
|
||||
|
||||
(alternatives, is_empty)
|
||||
};
|
||||
|
||||
if empty_alternatives {
|
||||
delete_whole_synonym_builder.insert(synonym.as_bytes())?;
|
||||
} else {
|
||||
synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
delete_whole_synonym_builder.insert(&synonym).unwrap();
|
||||
synonyms_store.del_synonyms(writer, synonym.as_bytes())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let delta_synonyms = delete_whole_synonym_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap();
|
||||
|
||||
let synonyms = match main_store.synonyms_fst(writer)? {
|
||||
Some(synonyms) => {
|
||||
let op = OpBuilder::new()
|
||||
.add(synonyms.stream())
|
||||
.add(delta_synonyms.stream())
|
||||
.difference();
|
||||
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
synonyms_builder.extend_stream(op).unwrap();
|
||||
synonyms_builder
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
}
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
main_store.put_synonyms_fst(writer, &synonyms)?;
|
||||
|
||||
Ok(())
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue