From 39e0d9fc4adfd6379bd91b183e843d111a89ae61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 2 Oct 2019 17:34:32 +0200 Subject: [PATCH] Introduce a basically working rkv based MeiliDB --- .gitignore | 4 + Cargo.toml | 26 ++ src/automaton/dfa.rs | 51 +++ src/automaton/mod.rs | 202 ++++++++++++ src/automaton/query_enhancer.rs | 398 ++++++++++++++++++++++++ src/criterion/document_id.rs | 16 + src/criterion/exact.rs | 65 ++++ src/criterion/mod.rs | 120 +++++++ src/criterion/number_of_words.rs | 31 ++ src/criterion/sum_of_typos.rs | 116 +++++++ src/criterion/sum_of_words_attribute.rs | 64 ++++ src/criterion/sum_of_words_position.rs | 64 ++++ src/criterion/words_proximity.rs | 155 +++++++++ src/lib.rs | 135 ++++++++ src/main.rs | 52 ++++ src/query_builder.rs | 275 ++++++++++++++++ src/raw_document.rs | 141 +++++++++ src/raw_indexer.rs | 208 +++++++++++++ src/reordered_attrs.rs | 24 ++ src/store/mod.rs | 26 ++ src/store/synonyms.rs | 23 ++ src/store/words.rs | 91 ++++++ 22 files changed, 2287 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 src/automaton/dfa.rs create mode 100644 src/automaton/mod.rs create mode 100644 src/automaton/query_enhancer.rs create mode 100644 src/criterion/document_id.rs create mode 100644 src/criterion/exact.rs create mode 100644 src/criterion/mod.rs create mode 100644 src/criterion/number_of_words.rs create mode 100644 src/criterion/sum_of_typos.rs create mode 100644 src/criterion/sum_of_words_attribute.rs create mode 100644 src/criterion/sum_of_words_position.rs create mode 100644 src/criterion/words_proximity.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/query_builder.rs create mode 100644 src/raw_document.rs create mode 100644 src/raw_indexer.rs create mode 100644 src/reordered_attrs.rs create mode 100644 src/store/mod.rs create mode 100644 src/store/synonyms.rs create mode 100644 src/store/words.rs diff --git a/.gitignore 
b/.gitignore new file mode 100644 index 000000000..6e03cb642 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/target +**/*.rs.bk +Cargo.lock +/*.rkv diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 000000000..c04eb5170 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "new-meilidb" +version = "0.1.0" +authors = ["Clément Renault "] +edition = "2018" + +[dependencies] +byteorder = "1.3.2" +deunicode = "1.0.0" +once_cell = "1.2.0" +rkv = "0.10.2" +sdset = "0.3.2" +slice-group-by = "0.2.6" +zerocopy = "0.2.8" + +meilidb-schema = { path = "../MeiliDB/meilidb-schema", version = "0.1.0" } +meilidb-tokenizer = { path = "../MeiliDB/meilidb-tokenizer", version = "0.1.0" } + +[dependencies.levenshtein_automata] +git = "https://github.com/Kerollmops/levenshtein-automata.git" +branch = "arc-byte-slice" +features = ["fst_automaton"] + +[dependencies.fst] +git = "https://github.com/Kerollmops/fst.git" +branch = "arc-byte-slice" diff --git a/src/automaton/dfa.rs b/src/automaton/dfa.rs new file mode 100644 index 000000000..015fdd877 --- /dev/null +++ b/src/automaton/dfa.rs @@ -0,0 +1,51 @@ +use once_cell::sync::OnceCell; +use levenshtein_automata::{ + LevenshteinAutomatonBuilder as LevBuilder, + DFA, +}; + +static LEVDIST0: OnceCell = OnceCell::new(); +static LEVDIST1: OnceCell = OnceCell::new(); +static LEVDIST2: OnceCell = OnceCell::new(); + +#[derive(Copy, Clone)] +enum PrefixSetting { + Prefix, + NoPrefix, +} + +fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA { + use PrefixSetting::{Prefix, NoPrefix}; + + match query.len() { + 0 ..= 4 => { + let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + 5 ..= 8 => { + let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + _ => { + let 
builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + } +} + +pub fn build_prefix_dfa(query: &str) -> DFA { + build_dfa_with_setting(query, PrefixSetting::Prefix) +} + +pub fn build_dfa(query: &str) -> DFA { + build_dfa_with_setting(query, PrefixSetting::NoPrefix) +} diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs new file mode 100644 index 000000000..8c5c68d98 --- /dev/null +++ b/src/automaton/mod.rs @@ -0,0 +1,202 @@ +mod dfa; +mod query_enhancer; + +use std::cmp::Reverse; +use std::vec; + +use fst::{IntoStreamer, Streamer}; +use levenshtein_automata::DFA; +use meilidb_tokenizer::{split_query_string, is_cjk}; + +use crate::store; + +use self::dfa::{build_dfa, build_prefix_dfa}; +use self::query_enhancer::QueryEnhancerBuilder; +pub use self::query_enhancer::QueryEnhancer; + +const NGRAMS: usize = 3; + +pub struct AutomatonProducer { + automatons: Vec>, +} + +impl AutomatonProducer { + pub fn new( + reader: &rkv::Reader, + query: &str, + synonyms_store: store::Synonyms, + ) -> (AutomatonProducer, QueryEnhancer) + { + let (automatons, query_enhancer) = generate_automatons(reader, query, synonyms_store).unwrap(); + (AutomatonProducer { automatons }, query_enhancer) + } + + pub fn into_iter(self) -> vec::IntoIter> { + self.automatons.into_iter() + } +} + +pub struct Automaton { + pub index: usize, + pub ngram: usize, + pub query_len: usize, + pub is_exact: bool, + pub is_prefix: bool, + pub query: String, +} + +impl Automaton { + pub fn dfa(&self) -> DFA { + if self.is_prefix { + build_prefix_dfa(&self.query) + } else { + build_dfa(&self.query) + } + } + + fn exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + index, + ngram, + query_len: query.len(), + is_exact: true, + is_prefix: false, + query: query.to_string(), + } + } + + fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + 
index, + ngram, + query_len: query.len(), + is_exact: true, + is_prefix: true, + query: query.to_string(), + } + } + + fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + index, + ngram, + query_len: query.len(), + is_exact: false, + is_prefix: false, + query: query.to_string(), + } + } +} + +pub fn normalize_str(string: &str) -> String { + let mut string = string.to_lowercase(); + + if !string.contains(is_cjk) { + string = deunicode::deunicode_with_tofu(&string, ""); + } + + string +} + +fn generate_automatons( + reader: &rkv::Reader, + query: &str, + synonym_store: store::Synonyms, +) -> Result<(Vec>, QueryEnhancer), rkv::StoreError> +{ + let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); + let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); + let synonyms = synonym_store.synonyms_fst(reader)?; + + let mut automatons = Vec::new(); + let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); + + // We must not declare the original words to the query enhancer + // *but* we need to push them in the automatons list first + let mut original_automatons = Vec::new(); + let mut original_words = query_words.iter().peekable(); + while let Some(word) = original_words.next() { + + let has_following_word = original_words.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); + + let automaton = if not_prefix_dfa { + Automaton::exact(automatons.len(), 1, word) + } else { + Automaton::prefix_exact(automatons.len(), 1, word) + }; + original_automatons.push(automaton); + } + + automatons.push(original_automatons); + + for n in 1..=NGRAMS { + let mut ngrams = query_words.windows(n).enumerate().peekable(); + while let Some((query_index, ngram_slice)) = ngrams.next() { + + let query_range = query_index..query_index + n; + let ngram_nb_words = ngram_slice.len(); + let ngram = ngram_slice.join(" "); + + let 
has_following_word = ngrams.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); + + // automaton of synonyms of the ngrams + let normalized = normalize_str(&ngram); + let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }; + + let mut stream = synonyms.search(&lev).into_stream(); + while let Some(base) = stream.next() { + + // only trigger alternatives when the last word has been typed + // i.e. "new " do not but "new yo" triggers alternatives to "new york" + let base = std::str::from_utf8(base).unwrap(); + let base_nb_words = split_query_string(base).count(); + if ngram_nb_words != base_nb_words { continue } + + if let Some(synonyms) = synonym_store.alternatives_to(reader, base.as_bytes())? { + + let mut stream = synonyms.into_stream(); + while let Some(synonyms) = stream.next() { + let synonyms = std::str::from_utf8(synonyms).unwrap(); + let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); + let nb_synonym_words = synonyms_words.len(); + + let real_query_index = automatons.len(); + enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words); + + for synonym in synonyms_words { + let automaton = if nb_synonym_words == 1 { + Automaton::exact(automatons.len(), n, synonym) + } else { + Automaton::non_exact(automatons.len(), n, synonym) + }; + automatons.push(vec![automaton]); + } + } + } + } + + if n != 1 { + // automaton of concatenation of query words + let concat = ngram_slice.concat(); + let normalized = normalize_str(&concat); + + let real_query_index = automatons.len(); + enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); + + let automaton = Automaton::exact(automatons.len(), n, &normalized); + automatons.push(vec![automaton]); + } + } + } + + // order automatons, the most important first, + // we keep the original automatons at the front. 
+ automatons[1..].sort_unstable_by_key(|a| { + let a = a.first().unwrap(); + (Reverse(a.is_exact), Reverse(a.ngram)) + }); + + Ok((automatons, enhancer_builder.build())) +} diff --git a/src/automaton/query_enhancer.rs b/src/automaton/query_enhancer.rs new file mode 100644 index 000000000..165c1b094 --- /dev/null +++ b/src/automaton/query_enhancer.rs @@ -0,0 +1,398 @@ +use std::ops::Range; +use std::cmp::Ordering::{Less, Greater, Equal}; + +/// Return `true` if the specified range can accept the given replacements words. +/// Returns `false` if the replacements words are already present in the original query +/// or if there is fewer replacement words than the range to replace. +// +// +// ## Ignored because already present in original +// +// new york city subway +// -------- ^^^^ +// / \ +// [new york city] +// +// +// ## Ignored because smaller than the original +// +// new york city subway +// ------------- +// \ / +// [new york] +// +// +// ## Accepted because bigger than the original +// +// NYC subway +// --- +// / \ +// / \ +// / \ +// / \ +// / \ +// [new york city] +// +fn rewrite_range_with(query: &[S], range: Range, words: &[T]) -> bool +where S: AsRef, + T: AsRef, +{ + if words.len() <= range.len() { + // there is fewer or equal replacement words + // than there is already in the replaced range + return false + } + + // retrieve the part to rewrite but with the length + // of the replacement part + let original = query.iter().skip(range.start).take(words.len()); + + // check if the original query doesn't already contain + // the replacement words + !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref)) +} + +type Origin = usize; +type RealLength = usize; + +struct FakeIntervalTree { + intervals: Vec<(Range, (Origin, RealLength))>, +} + +impl FakeIntervalTree { + fn new(mut intervals: Vec<(Range, (Origin, RealLength))>) -> FakeIntervalTree { + intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end)); + FakeIntervalTree { intervals } + } + + fn 
query(&self, point: usize) -> Option<(Range, (Origin, RealLength))> { + let element = self.intervals.binary_search_by(|(r, _)| { + if point >= r.start { + if point < r.end { Equal } else { Less } + } else { Greater } + }); + + let n = match element { Ok(n) => n, Err(n) => n }; + + match self.intervals.get(n) { + Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)), + _otherwise => None, + } + } +} + +pub struct QueryEnhancerBuilder<'a, S> { + query: &'a [S], + origins: Vec, + real_to_origin: Vec<(Range, (Origin, RealLength))>, +} + +impl> QueryEnhancerBuilder<'_, S> { + pub fn new(query: &[S]) -> QueryEnhancerBuilder { + // we initialize origins query indices based on their positions + let origins: Vec<_> = (0..query.len() + 1).collect(); + let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect(); + + QueryEnhancerBuilder { query, origins, real_to_origin } + } + + /// Update the final real to origin query indices mapping. + /// + /// `range` is the original words range that this `replacement` words replace + /// and `real` is the first real query index of these replacement words. + pub fn declare(&mut self, range: Range, real: usize, replacement: &[T]) + where T: AsRef, + { + // check if the range of original words + // can be rewritten with the replacement words + if rewrite_range_with(self.query, range.clone(), replacement) { + + // this range can be replaced so we need to + // modify the origins accordingly + let offset = replacement.len() - range.len(); + + let previous_padding = self.origins[range.end - 1]; + let current_offset = (self.origins[range.end] - 1) - previous_padding; + let diff = offset.saturating_sub(current_offset); + self.origins[range.end] += diff; + + for r in &mut self.origins[range.end + 1..] 
{ + *r += diff; + } + } + + // we need to store the real number and origins relations + // this way it will be possible to know by how many + // we need to pad real query indices + let real_range = real..real + replacement.len().max(range.len()); + let real_length = replacement.len(); + self.real_to_origin.push((real_range, (range.start, real_length))); + } + + pub fn build(self) -> QueryEnhancer { + QueryEnhancer { + origins: self.origins, + real_to_origin: FakeIntervalTree::new(self.real_to_origin), + } + } +} + +pub struct QueryEnhancer { + origins: Vec, + real_to_origin: FakeIntervalTree, +} + +impl QueryEnhancer { + /// Returns the query indices to use to replace this real query index. + pub fn replacement(&self, real: u32) -> Range { + let real = real as usize; + + // query the fake interval tree with the real query index + let (range, (origin, real_length)) = + self.real_to_origin + .query(real) + .expect("real has never been declared"); + + // if `real` is the end bound of the range + if (range.start + real_length - 1) == real { + let mut count = range.len(); + let mut new_origin = origin; + for (i, slice) in self.origins[new_origin..].windows(2).enumerate() { + let len = slice[1] - slice[0]; + count = count.saturating_sub(len); + if count == 0 { new_origin = origin + i; break } + } + + let n = real - range.start; + let start = self.origins[origin]; + let end = self.origins[new_origin + 1]; + let remaining = (end - start) - n; + + Range { start: (start + n) as u32, end: (start + n + remaining) as u32 } + + } else { + // just return the origin along with + // the real position of the word + let n = real as usize - range.start; + let origin = self.origins[origin]; + + Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn original_unmodified() { + let query = ["new", "york", "city", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // new york 
= new york city + builder.declare(0..2, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // new + assert_eq!(enhancer.replacement(1), 1..2); // york + assert_eq!(enhancer.replacement(2), 2..3); // city + assert_eq!(enhancer.replacement(3), 3..4); // subway + assert_eq!(enhancer.replacement(4), 0..1); // new + assert_eq!(enhancer.replacement(5), 1..2); // york + assert_eq!(enhancer.replacement(6), 2..3); // city + } + + #[test] + fn simple_growing() { + let query = ["new", "york", "subway"]; + // 0 1 2 + let mut builder = QueryEnhancerBuilder::new(&query); + + // new york = new york city + builder.declare(0..2, 3, &["new", "york", "city"]); + // ^ 3 4 5 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // new + assert_eq!(enhancer.replacement(1), 1..3); // york + assert_eq!(enhancer.replacement(2), 3..4); // subway + assert_eq!(enhancer.replacement(3), 0..1); // new + assert_eq!(enhancer.replacement(4), 1..2); // york + assert_eq!(enhancer.replacement(5), 2..3); // city + } + + #[test] + fn same_place_growings() { + let query = ["NY", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NY = new york + builder.declare(0..1, 2, &["new", "york"]); + // ^ 2 3 + + // NY = new york city + builder.declare(0..1, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // NY = NYC + builder.declare(0..1, 7, &["NYC"]); + // ^ 7 + + // NY = new york city + builder.declare(0..1, 8, &["new", "york", "city"]); + // ^ 8 9 10 + + // subway = underground train + builder.declare(1..2, 11, &["underground", "train"]); + // ^ 11 12 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..3); // NY + assert_eq!(enhancer.replacement(1), 3..5); // subway + assert_eq!(enhancer.replacement(2), 0..1); // new + assert_eq!(enhancer.replacement(3), 1..3); // york + assert_eq!(enhancer.replacement(4), 0..1); // new + 
assert_eq!(enhancer.replacement(5), 1..2); // york + assert_eq!(enhancer.replacement(6), 2..3); // city + assert_eq!(enhancer.replacement(7), 0..3); // NYC + assert_eq!(enhancer.replacement(8), 0..1); // new + assert_eq!(enhancer.replacement(9), 1..2); // york + assert_eq!(enhancer.replacement(10), 2..3); // city + assert_eq!(enhancer.replacement(11), 3..4); // underground + assert_eq!(enhancer.replacement(12), 4..5); // train + } + + #[test] + fn bigger_growing() { + let query = ["NYC", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(0..1, 2, &["new", "york", "city"]); + // ^ 2 3 4 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..3); // NYC + assert_eq!(enhancer.replacement(1), 3..4); // subway + assert_eq!(enhancer.replacement(2), 0..1); // new + assert_eq!(enhancer.replacement(3), 1..2); // york + assert_eq!(enhancer.replacement(4), 2..3); // city + } + + #[test] + fn middle_query_growing() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..6); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + assert_eq!(enhancer.replacement(6), 4..5); // city + } + + #[test] + fn end_query_growing() { + let query = ["NYC", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(1..2, 2, &["underground", "train"]); + // ^ 2 3 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // NYC + assert_eq!(enhancer.replacement(1), 
1..3); // subway + assert_eq!(enhancer.replacement(2), 1..2); // underground + assert_eq!(enhancer.replacement(3), 2..3); // train + } + + #[test] + fn multiple_growings() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // subway = underground train + builder.declare(3..4, 7, &["underground", "train"]); + // ^ 7 8 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..7); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + assert_eq!(enhancer.replacement(6), 4..5); // city + assert_eq!(enhancer.replacement(7), 5..6); // underground + assert_eq!(enhancer.replacement(8), 6..7); // train + } + + #[test] + fn multiple_probable_growings() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // subway = underground train + builder.declare(3..4, 7, &["underground", "train"]); + // ^ 7 8 + + // great awesome = good + builder.declare(0..2, 9, &["good"]); + // ^ 9 + + // awesome NYC = NY + builder.declare(1..3, 10, &["NY"]); + // ^^ 10 + + // NYC subway = metro + builder.declare(2..4, 11, &["metro"]); + // ^^ 11 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..7); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + 
assert_eq!(enhancer.replacement(6), 4..5); // city + assert_eq!(enhancer.replacement(7), 5..6); // underground + assert_eq!(enhancer.replacement(8), 6..7); // train + assert_eq!(enhancer.replacement(9), 0..2); // good + assert_eq!(enhancer.replacement(10), 1..5); // NY + assert_eq!(enhancer.replacement(11), 2..5); // metro + } +} diff --git a/src/criterion/document_id.rs b/src/criterion/document_id.rs new file mode 100644 index 000000000..34d0bd7f5 --- /dev/null +++ b/src/criterion/document_id.rs @@ -0,0 +1,16 @@ +use std::cmp::Ordering; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[derive(Debug, Clone, Copy)] +pub struct DocumentId; + +impl Criterion for DocumentId { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + lhs.id.cmp(&rhs.id) + } + + fn name(&self) -> &'static str { + "DocumentId" + } +} diff --git a/src/criterion/exact.rs b/src/criterion/exact.rs new file mode 100644 index 000000000..bde3ca733 --- /dev/null +++ b/src/criterion/exact.rs @@ -0,0 +1,65 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[inline] +fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { + let mut count = 0; + let mut index = 0; + + for group in query_index.linear_group() { + let len = group.len(); + count += is_exact[index..index + len].contains(&true) as usize; + index += len; + } + + count +} + +#[derive(Debug, Clone, Copy)] +pub struct Exact; + +impl Criterion for Exact { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let is_exact = lhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; + + let rhs = { + let query_index = rhs.query_index(); + let is_exact = rhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; + + lhs.cmp(&rhs).reverse() + } + + fn name(&self) -> &'static str { + "Exact" + } +} + +#[cfg(test)] +mod tests { + use super::*; 
+ + // typing: "soulier" + // + // doc0: "Soulier bleu" + // doc1: "souliereres rouge" + #[test] + fn easy_case() { + let query_index0 = &[0]; + let is_exact0 = &[true]; + + let query_index1 = &[0]; + let is_exact1 = &[false]; + + let doc0 = number_exact_matches(query_index0, is_exact0); + let doc1 = number_exact_matches(query_index1, is_exact1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } +} diff --git a/src/criterion/mod.rs b/src/criterion/mod.rs new file mode 100644 index 000000000..6ce42007c --- /dev/null +++ b/src/criterion/mod.rs @@ -0,0 +1,120 @@ +mod sum_of_typos; +mod number_of_words; +mod words_proximity; +mod sum_of_words_attribute; +mod sum_of_words_position; +mod exact; +mod document_id; + +use std::cmp::Ordering; +use crate::RawDocument; + +pub use self::{ + sum_of_typos::SumOfTypos, + number_of_words::NumberOfWords, + words_proximity::WordsProximity, + sum_of_words_attribute::SumOfWordsAttribute, + sum_of_words_position::SumOfWordsPosition, + exact::Exact, + document_id::DocumentId, +}; + +pub trait Criterion: Send + Sync { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; + + fn name(&self) -> &'static str; + + #[inline] + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + self.evaluate(lhs, rhs) == Ordering::Equal + } +} + +impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + (**self).evaluate(lhs, rhs) + } + + fn name(&self) -> &'static str { + (**self).name() + } + + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + (**self).eq(lhs, rhs) + } +} + +impl Criterion for Box { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + (**self).evaluate(lhs, rhs) + } + + fn name(&self) -> &'static str { + (**self).name() + } + + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + (**self).eq(lhs, rhs) + } +} + +#[derive(Default)] +pub struct CriteriaBuilder<'a> { + inner: 
Vec> +} + +impl<'a> CriteriaBuilder<'a> +{ + pub fn new() -> CriteriaBuilder<'a> { + CriteriaBuilder { inner: Vec::new() } + } + + pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> { + CriteriaBuilder { inner: Vec::with_capacity(capacity) } + } + + pub fn reserve(&mut self, additional: usize) { + self.inner.reserve(additional) + } + + pub fn add(mut self, criterion: C) -> CriteriaBuilder<'a> + where C: Criterion, + { + self.push(criterion); + self + } + + pub fn push(&mut self, criterion: C) + where C: Criterion, + { + self.inner.push(Box::new(criterion)); + } + + pub fn build(self) -> Criteria<'a> { + Criteria { inner: self.inner } + } +} + +pub struct Criteria<'a> { + inner: Vec>, +} + +impl<'a> Default for Criteria<'a> { + fn default() -> Self { + CriteriaBuilder::with_capacity(7) + .add(SumOfTypos) + .add(NumberOfWords) + .add(WordsProximity) + .add(SumOfWordsAttribute) + .add(SumOfWordsPosition) + .add(Exact) + .add(DocumentId) + .build() + } +} + +impl<'a> AsRef<[Box]> for Criteria<'a> { + fn as_ref(&self) -> &[Box] { + &self.inner + } +} diff --git a/src/criterion/number_of_words.rs b/src/criterion/number_of_words.rs new file mode 100644 index 000000000..43095a066 --- /dev/null +++ b/src/criterion/number_of_words.rs @@ -0,0 +1,31 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[inline] +fn number_of_query_words(query_index: &[u32]) -> usize { + query_index.linear_group().count() +} + +#[derive(Debug, Clone, Copy)] +pub struct NumberOfWords; + +impl Criterion for NumberOfWords { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + number_of_query_words(query_index) + }; + let rhs = { + let query_index = rhs.query_index(); + number_of_query_words(query_index) + }; + + lhs.cmp(&rhs).reverse() + } + + fn name(&self) -> &'static str { + "NumberOfWords" + } +} diff --git a/src/criterion/sum_of_typos.rs 
b/src/criterion/sum_of_typos.rs new file mode 100644 index 000000000..6736e6caa --- /dev/null +++ b/src/criterion/sum_of_typos.rs @@ -0,0 +1,116 @@ +use std::cmp::Ordering; + +use slice_group_by::GroupBy; + +use crate::criterion::Criterion; +use crate::RawDocument; + +// This function is a wrong logarithmic 10 function. +// It is safe to panic on input number higher than 3, +// the number of typos is never bigger than that. +#[inline] +fn custom_log10(n: u8) -> f32 { + match n { + 0 => 0.0, // log(1) + 1 => 0.30102, // log(2) + 2 => 0.47712, // log(3) + 3 => 0.60205, // log(4) + _ => panic!("invalid number"), + } +} + +#[inline] +fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { + let mut number_words: usize = 0; + let mut sum_typos = 0.0; + let mut index = 0; + + for group in query_index.linear_group() { + sum_typos += custom_log10(distance[index]); + number_words += 1; + index += group.len(); + } + + (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize +} + +#[derive(Debug, Clone, Copy)] +pub struct SumOfTypos; + +impl Criterion for SumOfTypos { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let distance = lhs.distance(); + sum_matches_typos(query_index, distance) + }; + + let rhs = { + let query_index = rhs.query_index(); + let distance = rhs.distance(); + sum_matches_typos(query_index, distance) + }; + + lhs.cmp(&rhs).reverse() + } + + fn name(&self) -> &'static str { + "SumOfTypos" + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // typing: "Geox CEO" + // + // doc0: "Geox SpA: CEO and Executive" + // doc1: "Mt. 
Gox CEO Resigns From Bitcoin Foundation" + #[test] + fn one_typo_reference() { + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; + + let query_index1 = &[0, 1]; + let distance1 = &[1, 0]; + + let doc0 = sum_matches_typos(query_index0, distance0); + let doc1 = sum_matches_typos(query_index1, distance1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } + + // typing: "bouton manchette" + // + // doc0: "bouton manchette" + // doc1: "bouton" + #[test] + fn no_typo() { + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; + + let query_index1 = &[0]; + let distance1 = &[0]; + + let doc0 = sum_matches_typos(query_index0, distance0); + let doc1 = sum_matches_typos(query_index1, distance1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } + + // typing: "bouton manchztte" + // + // doc0: "bouton manchette" + // doc1: "bouton" + #[test] + fn one_typo() { + let query_index0 = &[0, 1]; + let distance0 = &[0, 1]; + + let query_index1 = &[0]; + let distance1 = &[0]; + + let doc0 = sum_matches_typos(query_index0, distance0); + let doc1 = sum_matches_typos(query_index1, distance1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } +} diff --git a/src/criterion/sum_of_words_attribute.rs b/src/criterion/sum_of_words_attribute.rs new file mode 100644 index 000000000..d5787ef3a --- /dev/null +++ b/src/criterion/sum_of_words_attribute.rs @@ -0,0 +1,64 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[inline] +fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { + let mut sum_attributes = 0; + let mut index = 0; + + for group in query_index.linear_group() { + sum_attributes += attribute[index] as usize; + index += group.len(); + } + + sum_attributes +} + +#[derive(Debug, Clone, Copy)] +pub struct SumOfWordsAttribute; + +impl Criterion for SumOfWordsAttribute { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + 
let query_index = lhs.query_index(); + let attribute = lhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; + + let rhs = { + let query_index = rhs.query_index(); + let attribute = rhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; + + lhs.cmp(&rhs) + } + + fn name(&self) -> &'static str { + "SumOfWordsAttribute" + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // typing: "soulier" + // + // doc0: { 0. "Soulier bleu", 1. "bla bla bla" } + // doc1: { 0. "Botte rouge", 1. "Soulier en cuir" } + #[test] + fn title_vs_description() { + let query_index0 = &[0]; + let attribute0 = &[0]; + + let query_index1 = &[0]; + let attribute1 = &[1]; + + let doc0 = sum_matches_attributes(query_index0, attribute0); + let doc1 = sum_matches_attributes(query_index1, attribute1); + assert_eq!(doc0.cmp(&doc1), Ordering::Less); + } +} diff --git a/src/criterion/sum_of_words_position.rs b/src/criterion/sum_of_words_position.rs new file mode 100644 index 000000000..13f26774c --- /dev/null +++ b/src/criterion/sum_of_words_position.rs @@ -0,0 +1,64 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[inline] +fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { + let mut sum_word_index = 0; + let mut index = 0; + + for group in query_index.linear_group() { + sum_word_index += word_index[index] as usize; + index += group.len(); + } + + sum_word_index +} + +#[derive(Debug, Clone, Copy)] +pub struct SumOfWordsPosition; + +impl Criterion for SumOfWordsPosition { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let word_index = lhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let word_index = rhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; + + lhs.cmp(&rhs) + } + + 
const MAX_DISTANCE: u16 = 8;

/// Clones both members of a tuple of references.
///
/// NOTE(review): the generic parameters were lost in this extraction;
/// reconstructed as `T: Clone, U: Clone` from the `.clone()` calls below.
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
    (a.clone(), b.clone())
}

/// Proximity between two word positions inside the same attribute.
///
/// A forward distance (`lhs < rhs`) is cheaper than the same backward
/// distance, which is penalized by one; a backward distance can therefore
/// reach `MAX_DISTANCE + 1`.
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    } else {
        cmp::min(lhs - rhs, MAX_DISTANCE) + 1
    }
}

/// Proximity between two `(attribute, word_index)` pairs; words living in
/// different attributes are considered as far apart as possible.
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
    if lattr != rattr { return MAX_DISTANCE }
    index_proximity(lwi, rwi)
}

/// Smallest proximity over the cartesian product of the positions of the
/// left word and the positions of the right word.
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
    let mut min_prox = u16::max_value();

    for a in lattr.iter().zip(lwi) {
        for b in rattr.iter().zip(rwi) {
            let a = clone_tuple(a);
            let b = clone_tuple(b);
            min_prox = cmp::min(min_prox, attribute_proximity(a, b));
        }
    }

    min_prox
}
= distance[index..index + group_len].linear_group().next().unwrap().len(); + + let rattr = &attribute[index..index + len]; + let rwi = &word_index[index..index + len]; + + (rattr, rwi) + }; + + let mut last = query_index_groups.next().map(|group| { + let attr_wi = get_attr_wi(index, group.len()); + index += group.len(); + attr_wi + }); + + // iter by windows of size 2 + while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) { + let attr_wi = get_attr_wi(index, rhs.len()); + proximity += min_proximity(lhs, attr_wi); + last = Some(attr_wi); + index += rhs.len(); + } + + proximity +} + +#[derive(Debug, Clone, Copy)] +pub struct WordsProximity; + +impl Criterion for WordsProximity { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let distance = lhs.distance(); + let attribute = lhs.attribute(); + let word_index = lhs.word_index(); + matches_proximity(query_index, distance, attribute, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let distance = rhs.distance(); + let attribute = rhs.attribute(); + let word_index = rhs.word_index(); + matches_proximity(query_index, distance, attribute, word_index) + }; + + lhs.cmp(&rhs) + } + + fn name(&self) -> &'static str { + "WordsProximity" + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn three_different_attributes() { + + // "soup" "of the" "the day" + // + // { id: 0, attr: 0, attr_index: 0 } + // { id: 1, attr: 1, attr_index: 0 } + // { id: 2, attr: 1, attr_index: 1 } + // { id: 2, attr: 2, attr_index: 0 } + // { id: 3, attr: 3, attr_index: 1 } + + let query_index = &[0, 1, 2, 2, 3]; + let distance = &[0, 0, 0, 0, 0]; + let attribute = &[0, 1, 1, 2, 3]; + let word_index = &[0, 0, 1, 0, 1]; + + // soup -> of = 8 + // + of -> the = 1 + // + the -> day = 8 (not 1) + assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17); + } + + #[test] + fn two_different_attributes() { 
/// Represents an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
///
/// The `AsBytes`/`FromBytes` derives together with `#[repr(C)]` allow this
/// identifier to be written to and read from the key-value store as raw bytes.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentId(pub i64);
+ pub attribute: u16, + pub word_index: u16, + + /// The position in bytes where the word was found + /// along with the length of it. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. + pub char_index: u16, + pub char_length: u16, +} + +/// This structure represent a matching word with informations +/// on the location of the word in the document. +/// +/// The order of the field is important because it defines +/// the way these structures are ordered between themselves. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Highlight { + /// The attribute in the document where the word was found + /// along with the index in it. + pub attribute: u16, + + /// The position in bytes where the word was found. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. + pub char_index: u16, + + /// The length in bytes of the found word. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. 
+ pub char_length: u16, +} + +#[doc(hidden)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct TmpMatch { + pub query_index: u32, + pub distance: u8, + pub attribute: u16, + pub word_index: u16, + pub is_exact: bool, +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Document { + pub id: DocumentId, + pub highlights: Vec, + + #[cfg(test)] + pub matches: Vec, +} + +impl Document { + #[cfg(not(test))] + fn from_raw(raw: RawDocument) -> Document { + Document { id: raw.id, highlights: raw.highlights } + } + + #[cfg(test)] + fn from_raw(raw: RawDocument) -> Document { + let len = raw.query_index().len(); + let mut matches = Vec::with_capacity(len); + + let query_index = raw.query_index(); + let distance = raw.distance(); + let attribute = raw.attribute(); + let word_index = raw.word_index(); + let is_exact = raw.is_exact(); + + for i in 0..len { + let match_ = TmpMatch { + query_index: query_index[i], + distance: distance[i], + attribute: attribute[i], + word_index: word_index[i], + is_exact: is_exact[i], + }; + matches.push(match_); + } + + Document { id: raw.id, matches, highlights: raw.highlights } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::mem; + + #[test] + fn docindex_mem_size() { + assert_eq!(mem::size_of::(), 16); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 000000000..65cfce3ca --- /dev/null +++ b/src/main.rs @@ -0,0 +1,52 @@ +use rkv::{Manager, Rkv, SingleStore, Value, StoreOptions}; +use std::{fs, path::Path}; + +use meilidb_schema::SchemaAttr; +use new_meilidb::{store, QueryBuilder, DocumentId}; +use new_meilidb::raw_indexer::{RawIndexer, Indexed}; + +fn main() { + let path = Path::new("test.rkv"); + fs::create_dir_all(path).unwrap(); + + // The Manager enforces that each process opens the same environment + // at most once by caching a handle to each environment that it opens. 
+ // Use it to retrieve the handle to an opened environment—or create one + // if it hasn't already been opened: + let created_arc = Manager::singleton().write().unwrap().get_or_create(path, Rkv::new).unwrap(); + let env = created_arc.read().unwrap(); + + let (words, synonyms) = store::create(&env, "test").unwrap(); + + { + let mut writer = env.write().unwrap(); + let mut raw_indexer = RawIndexer::new(); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !"; + raw_indexer.index_text(docid, attr, text); + + let Indexed { words_doc_indexes, .. } = raw_indexer.build(); + + let mut fst_builder = fst::SetBuilder::memory(); + fst_builder.extend_iter(words_doc_indexes.keys()); + let bytes = fst_builder.into_inner().unwrap(); + let fst = fst::raw::Fst::from_bytes(bytes).unwrap(); + let fst = fst::Set::from(fst); + + words.put_words_fst(&mut writer, &fst).unwrap(); + + for (word, indexes) in words_doc_indexes { + words.put_words_indexes(&mut writer, &word, &indexes).unwrap(); + } + + writer.commit().unwrap(); + } + + let reader = env.read().unwrap(); + let builder = QueryBuilder::new(words, synonyms); + let documents = builder.query(&reader, "oubli", 0..20).unwrap(); + + println!("{:?}", documents); +} diff --git a/src/query_builder.rs b/src/query_builder.rs new file mode 100644 index 000000000..9cb91b755 --- /dev/null +++ b/src/query_builder.rs @@ -0,0 +1,275 @@ +use std::time::{Instant, Duration}; +use std::ops::Range; +use std::{cmp, mem}; + +use fst::{IntoStreamer, Streamer}; +use sdset::SetBuf; +use slice_group_by::{GroupBy, GroupByMut}; + +use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer}; +use crate::raw_document::{RawDocument, raw_documents_from}; +use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria}; +use crate::{store, reordered_attrs::ReorderedAttrs}; + +pub struct Automatons { + // TODO better use Vec of SmallVec + automatons: Vec>, +} + +pub struct 
QueryBuilder<'a> { + criteria: Criteria<'a>, + searchables_attrs: Option, + timeout: Duration, + words_store: store::Words, + synonyms_store: store::Synonyms, +} + +fn multiword_rewrite_matches( + mut matches: Vec<(DocumentId, TmpMatch)>, + query_enhancer: &QueryEnhancer, +) -> SetBuf<(DocumentId, TmpMatch)> +{ + let mut padded_matches = Vec::with_capacity(matches.len()); + + // we sort the matches by word index to make them rewritable + matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); + + let start = Instant::now(); + // for each attribute of each document + for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { + + // padding will only be applied + // to word indices in the same attribute + let mut padding = 0; + let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); + + // for each match at the same position + // in this document attribute + while let Some(same_word_index) = iter.next() { + + // find the biggest padding + let mut biggest = 0; + for (id, match_) in same_word_index { + + let mut replacement = query_enhancer.replacement(match_.query_index); + let replacement_len = replacement.len(); + let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); + + if let Some(query_index) = replacement.next() { + let word_index = match_.word_index + padding as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + } + + let mut found = false; + + // look ahead and if there already is a match + // corresponding to this padding word, abort the padding + 'padding: for (x, next_group) in nexts.enumerate() { + + for (i, query_index) in replacement.clone().enumerate().skip(x) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let padmatch = TmpMatch { query_index, word_index, ..match_.clone() }; + + for (_, nmatch_) in next_group { + let mut rep = 
query_enhancer.replacement(nmatch_.query_index); + let query_index = rep.next().unwrap(); + if query_index == padmatch.query_index { + + if !found { + // if we find a corresponding padding for the + // first time we must push preceding paddings + for (i, query_index) in replacement.clone().enumerate().take(i) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + biggest = biggest.max(i + 1); + } + } + + padded_matches.push((*id, padmatch)); + found = true; + continue 'padding; + } + } + } + + // if we do not find a corresponding padding in the + // next groups so stop here and pad what was found + break + } + + if !found { + // if no padding was found in the following matches + // we must insert the entire padding + for (i, query_index) in replacement.enumerate() { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + } + + biggest = biggest.max(replacement_len - 1); + } + } + + padding += biggest; + } + } + + for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { + document_matches.sort_unstable(); + } + + SetBuf::new_unchecked(padded_matches) +} + +fn fetch_raw_documents( + reader: &rkv::Reader, + automatons: &[Automaton], + query_enhancer: &QueryEnhancer, + searchables: Option<&ReorderedAttrs>, + words_store: &store::Words, +) -> Result, rkv::StoreError> +{ + let mut matches = Vec::new(); + let mut highlights = Vec::new(); + + for automaton in automatons { + let Automaton { index, is_exact, query_len, .. 
} = automaton; + let dfa = automaton.dfa(); + + let words = words_store.words_fst(reader)?; + + let mut stream = words.search(&dfa).into_stream(); + while let Some(input) = stream.next() { + let distance = dfa.eval(input).to_u8(); + let is_exact = *is_exact && distance == 0 && input.len() == *query_len; + + let doc_indexes = match words_store.word_indexes(reader, input)? { + Some(doc_indexes) => doc_indexes, + None => continue, + }; + + matches.reserve(doc_indexes.len()); + highlights.reserve(doc_indexes.len()); + + for di in doc_indexes.as_ref() { + let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); + if let Some(attribute) = attribute { + let match_ = TmpMatch { + query_index: *index as u32, + distance, + attribute, + word_index: di.word_index, + is_exact, + }; + + let highlight = Highlight { + attribute: di.attribute, + char_index: di.char_index, + char_length: di.char_length, + }; + + matches.push((di.document_id, match_)); + highlights.push((di.document_id, highlight)); + } + } + } + } + + let matches = multiword_rewrite_matches(matches, &query_enhancer); + let highlights = { + highlights.sort_unstable_by_key(|(id, _)| *id); + SetBuf::new_unchecked(highlights) + }; + + Ok(raw_documents_from(matches, highlights)) +} + +impl<'a> QueryBuilder<'a> { + pub fn new(words: store::Words, synonyms: store::Synonyms) -> QueryBuilder<'a> { + QueryBuilder { + criteria: Criteria::default(), + searchables_attrs: None, + timeout: Duration::from_secs(1), + words_store: words, + synonyms_store: synonyms, + } + } + + pub fn query( + self, + reader: &rkv::Reader, + query: &str, + range: Range, + ) -> Result, rkv::StoreError> + { + let start_processing = Instant::now(); + let mut raw_documents_processed = Vec::new(); + + let (automaton_producer, query_enhancer) = AutomatonProducer::new(reader, query, self.synonyms_store); + let mut automaton_producer = automaton_producer.into_iter(); + let mut automatons = Vec::new(); + + // aggregate automatons groups 
by groups after time + while let Some(auts) = automaton_producer.next() { + automatons.extend(auts); + + // we must retrieve the documents associated + // with the current automatons + let mut raw_documents = fetch_raw_documents( + reader, + &automatons, + &query_enhancer, + self.searchables_attrs.as_ref(), + &self.words_store, + )?; + + let mut groups = vec![raw_documents.as_mut_slice()]; + + 'criteria: for criterion in self.criteria.as_ref() { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut documents_seen = 0; + + for group in tmp_groups { + // if this group does not overlap with the requested range, + // push it without sorting and splitting it + if documents_seen + group.len() < range.start { + documents_seen += group.len(); + groups.push(group); + continue; + } + + group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); + + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { + documents_seen += group.len(); + groups.push(group); + + // we have sort enough documents if the last document sorted is after + // the end of the requested range, we can continue to the next criterion + if documents_seen >= range.end { continue 'criteria } + } + } + } + + // once we classified the documents related to the current + // automatons we save that as the next valid result + let iter = raw_documents.into_iter().skip(range.start).take(range.len()); + raw_documents_processed.clear(); + raw_documents_processed.extend(iter); + + // stop processing after there is no time + if start_processing.elapsed() > self.timeout { break } + } + + // make real documents now that we know + // those must be returned + let documents = raw_documents_processed + .into_iter() + .map(|d| Document::from_raw(d)) + .collect(); + + Ok(documents) + } +} diff --git a/src/raw_document.rs b/src/raw_document.rs new file mode 100644 index 000000000..3567c3fd1 --- /dev/null +++ b/src/raw_document.rs @@ -0,0 +1,141 @@ +use std::sync::Arc; +use std::fmt; +use sdset::SetBuf; 
+use slice_group_by::GroupBy; +use crate::{TmpMatch, DocumentId, Highlight}; + +#[derive(Clone)] +pub struct RawDocument { + pub id: DocumentId, + pub matches: SharedMatches, + pub highlights: Vec, +} + +impl RawDocument { + fn new(id: DocumentId, matches: SharedMatches, highlights: Vec) -> RawDocument { + RawDocument { id, matches, highlights } + } + + pub fn query_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) } + } + + pub fn distance(&self) -> &[u8] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } + } + + pub fn attribute(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } + } + + pub fn word_index(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) } + } + + pub fn is_exact(&self) -> &[bool] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } + } +} + +impl fmt::Debug for RawDocument { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("RawDocument {\r\n")?; + f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?; + 
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?; + f.write_str("}")?; + Ok(()) + } +} + +pub fn raw_documents_from( + matches: SetBuf<(DocumentId, TmpMatch)>, + highlights: SetBuf<(DocumentId, Highlight)>, +) -> Vec +{ + let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); + let mut matches2 = Matches::with_capacity(matches.len()); + + let matches = matches.linear_group_by_key(|(id, _)| *id); + let highlights = highlights.linear_group_by_key(|(id, _)| *id); + + for (mgroup, hgroup) in matches.zip(highlights) { + debug_assert_eq!(mgroup[0].0, hgroup[0].0); + + let document_id = mgroup[0].0; + let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); + let end = start + mgroup.len(); + + let highlights = hgroup.iter().map(|(_, h)| *h).collect(); + docs_ranges.push((document_id, Range { start, end }, highlights)); + + matches2.extend_from_slice(mgroup); + } + + let matches = Arc::new(matches2); + docs_ranges.into_iter().map(|(id, range, highlights)| { + let matches = SharedMatches { range, matches: matches.clone() }; + RawDocument::new(id, matches, highlights) + }).collect() +} + +#[derive(Debug, Copy, Clone)] +struct Range { + start: usize, + end: usize, +} + +#[derive(Clone)] +pub struct SharedMatches { + range: Range, + matches: Arc, +} + +#[derive(Clone)] +struct Matches { + query_index: Vec, + distance: Vec, + attribute: Vec, + word_index: Vec, + is_exact: Vec, +} + +impl Matches { + fn with_capacity(cap: usize) -> Matches { + Matches { + query_index: Vec::with_capacity(cap), + distance: Vec::with_capacity(cap), + attribute: Vec::with_capacity(cap), + word_index: Vec::with_capacity(cap), + is_exact: Vec::with_capacity(cap), + } + } + + fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { + for (_, match_) in matches { + self.query_index.push(match_.query_index); + self.distance.push(match_.distance); + 
self.attribute.push(match_.attribute); + self.word_index.push(match_.word_index); + self.is_exact.push(match_.is_exact); + } + } +} diff --git a/src/raw_indexer.rs b/src/raw_indexer.rs new file mode 100644 index 000000000..9c0399be5 --- /dev/null +++ b/src/raw_indexer.rs @@ -0,0 +1,208 @@ +use std::collections::{BTreeMap, HashMap}; +use std::convert::TryFrom; + +use deunicode::deunicode_with_tofu; +use crate::{DocumentId, DocIndex}; +use meilidb_schema::SchemaAttr; +use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token}; +use sdset::SetBuf; + +type Word = Vec; // TODO make it be a SmallVec + +pub struct RawIndexer { + word_limit: usize, // the maximum number of indexed words + words_doc_indexes: BTreeMap>, + docs_words: HashMap>, +} + +pub struct Indexed { + pub words_doc_indexes: BTreeMap>, + pub docs_words: HashMap, +} + +impl RawIndexer { + pub fn new() -> RawIndexer { + RawIndexer::with_word_limit(1000) + } + + pub fn with_word_limit(limit: usize) -> RawIndexer { + RawIndexer { + word_limit: limit, + words_doc_indexes: BTreeMap::new(), + docs_words: HashMap::new(), + } + } + + pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) { + let lowercase_text = text.to_lowercase(); + let deunicoded = deunicode_with_tofu(&lowercase_text, ""); + + // TODO compute the deunicoded version after the cjk check + let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded { + Some(deunicoded) + } else { + None + }; + let iter = Some(lowercase_text).into_iter().chain(next); + + for text in iter { + for token in Tokenizer::new(&text) { + let must_continue = index_token( + token, + id, + attr, + self.word_limit, + &mut self.words_doc_indexes, + &mut self.docs_words, + ); + + if !must_continue { break } + } + } + } + + pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) + where I: IntoIterator, + IT: Iterator + Clone, + { + // TODO serialize this to one call to the SeqTokenizer loop + + let 
lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect(); + let iter = lowercased.iter().map(|t| t.as_str()); + + for token in SeqTokenizer::new(iter) { + let must_continue = index_token( + token, + id, + attr, + self.word_limit, + &mut self.words_doc_indexes, + &mut self.docs_words, + ); + + if !must_continue { break } + } + + let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| { + if lowercase_text.contains(is_cjk) { return lowercase_text } + let deunicoded = deunicode_with_tofu(&lowercase_text, ""); + if lowercase_text != deunicoded { deunicoded } else { lowercase_text } + }).collect(); + let iter = deunicoded.iter().map(|t| t.as_str()); + + for token in SeqTokenizer::new(iter) { + let must_continue = index_token( + token, + id, + attr, + self.word_limit, + &mut self.words_doc_indexes, + &mut self.docs_words, + ); + + if !must_continue { break } + } + } + + pub fn build(self) -> Indexed { + let words_doc_indexes = self.words_doc_indexes + .into_iter() + .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes))) + .collect(); + + let docs_words = self.docs_words + .into_iter() + .map(|(id, mut words)| { + words.sort_unstable(); + words.dedup(); + (id, fst::Set::from_iter(words).unwrap()) + }) + .collect(); + + Indexed { words_doc_indexes, docs_words } + } +} + +fn index_token( + token: Token, + id: DocumentId, + attr: SchemaAttr, + word_limit: usize, + words_doc_indexes: &mut BTreeMap>, + docs_words: &mut HashMap>, +) -> bool +{ + if token.word_index >= word_limit { return false } + + match token_to_docindex(id, attr, token) { + Some(docindex) => { + let word = Vec::from(token.word); + words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); + docs_words.entry(id).or_insert_with(Vec::new).push(word); + }, + None => return false, + } + + true +} + +fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option { + let word_index = u16::try_from(token.word_index).ok()?; + let char_index = 
u16::try_from(token.char_index).ok()?; + let char_length = u16::try_from(token.word.chars().count()).ok()?; + + let docindex = DocIndex { + document_id: id, + attribute: attr.0, + word_index, + char_index, + char_length, + }; + + Some(docindex) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn strange_apostrophe() { + let mut indexer = RawIndexer::new(); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !"; + indexer.index_text(docid, attr, text); + + let Indexed { words_doc_indexes, .. } = indexer.build(); + + assert!(words_doc_indexes.get(&b"l"[..]).is_some()); + assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); + assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); + assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); + + // with the ugly apostrophe... + assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some()); + } + + #[test] + fn strange_apostrophe_in_sequence() { + let mut indexer = RawIndexer::new(); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"]; + indexer.index_text_seq(docid, attr, text); + + let Indexed { words_doc_indexes, .. } = indexer.build(); + + assert!(words_doc_indexes.get(&b"l"[..]).is_some()); + assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); + assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); + assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); + + // with the ugly apostrophe... 
/// Maps original attribute numbers to their new ranking positions.
///
/// Attributes receive a rank in the order `insert_attribute` is called;
/// attributes that were never inserted have no rank.
#[derive(Default, Clone)]
pub struct ReorderedAttrs {
    count: usize,
    reorders: Vec<Option<u16>>,
}

impl ReorderedAttrs {
    pub fn new() -> ReorderedAttrs {
        ReorderedAttrs { count: 0, reorders: Vec::new() }
    }

    /// Assigns the next rank to `attribute`.
    ///
    /// Fix: `Vec::resize` to a *smaller* length truncates the vector, so
    /// inserting attributes out of ascending order used to erase the ranks
    /// of previously inserted higher-numbered attributes. The vector is now
    /// only ever grown.
    pub fn insert_attribute(&mut self, attribute: u16) {
        let required_len = attribute as usize + 1;
        if self.reorders.len() < required_len {
            self.reorders.resize(required_len, None);
        }
        self.reorders[attribute as usize] = Some(self.count as u16);
        self.count += 1;
    }

    /// Returns the rank of `attribute`, or `None` if it was never inserted.
    pub fn get(&self, attribute: u16) -> Option<u16> {
        match self.reorders.get(attribute as usize) {
            Some(Some(attribute)) => Some(*attribute),
            _ => None,
        }
    }
}
000000000..4cf1186cc --- /dev/null +++ b/src/store/synonyms.rs @@ -0,0 +1,23 @@ +pub struct Synonyms { + pub(crate) main: rkv::SingleStore, + pub(crate) synonyms: rkv::SingleStore, +} + +impl Synonyms { + pub fn synonyms_fst( + &self, + reader: &T, + ) -> Result + { + Ok(fst::Set::default()) + } + + pub fn alternatives_to( + &self, + reader: &T, + word: &[u8], + ) -> Result, rkv::StoreError> + { + unimplemented!() + } +} diff --git a/src/store/words.rs b/src/store/words.rs new file mode 100644 index 000000000..face8a979 --- /dev/null +++ b/src/store/words.rs @@ -0,0 +1,91 @@ +use std::borrow::Cow; +use std::sync::Arc; +use std::{mem, ptr}; +use zerocopy::{AsBytes, LayoutVerified}; + +use crate::DocIndex; +use crate::store::aligned_to; +use crate::store::WORDS_KEY; + +pub struct Words { + pub(crate) main: rkv::SingleStore, + pub(crate) words_indexes: rkv::SingleStore, +} + +impl Words { + pub fn put_words_fst( + &self, + writer: &mut rkv::Writer, + fst: &fst::Set, + ) -> Result<(), rkv::StoreError> + { + let blob = rkv::Value::Blob(fst.as_fst().as_bytes()); + self.main.put(writer, WORDS_KEY, &blob) + } + + pub fn words_fst( + &self, + reader: &T, + ) -> Result + { + match self.main.get(reader, WORDS_KEY)? { + Some(rkv::Value::Blob(bytes)) => { + let len = bytes.len(); + let bytes = Arc::from(bytes); + let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); + Ok(fst::Set::from(fst)) + }, + Some(value) => panic!("invalid type {:?}", value), + None => panic!("could not find word index"), + } + } + + pub fn put_words_indexes( + &self, + writer: &mut rkv::Writer, + word: &[u8], + words_indexes: &[DocIndex], + ) -> Result<(), rkv::StoreError> + { + let blob = rkv::Value::Blob(words_indexes.as_bytes()); + self.main.put(writer, word, &blob) + } + + pub fn word_indexes<'a, T: rkv::Readable>( + &self, + reader: &'a T, + word: &[u8], + ) -> Result>, rkv::StoreError> + { + let bytes = match self.main.get(reader, word)? 
{ + Some(rkv::Value::Blob(bytes)) => bytes, + Some(value) => panic!("invalid type {:?}", value), + None => return Ok(None), + }; + + match LayoutVerified::new_slice(bytes) { + Some(layout) => Ok(Some(Cow::Borrowed(layout.into_slice()))), + None => { + let len = bytes.len(); + let elem_size = mem::size_of::(); + + // ensure that it is the alignment that is wrong + // and the length is valid + if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::()) { + let elems = len / elem_size; + let mut vec = Vec::::with_capacity(elems); + + unsafe { + let dst = vec.as_mut_ptr() as *mut u8; + ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len); + vec.set_len(elems); + } + + return Ok(Some(Cow::Owned(vec))) + } + + Ok(None) + }, + } + } +}