Cargo fmt pass

This commit is contained in:
Clément Renault 2019-10-18 13:05:28 +02:00
parent 47d777c8f7
commit ca26a0f2e4
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
48 changed files with 1599 additions and 979 deletions

View File

@ -4,15 +4,15 @@ use std::error::Error;
use std::io::Write;
use std::iter::FromIterator;
use std::path::{Path, PathBuf};
use std::time::{Instant, Duration};
use std::time::{Duration, Instant};
use std::{fs, io, sync::mpsc};
use rustyline::{Editor, Config};
use serde::{Serialize, Deserialize};
use rustyline::{Config, Editor};
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilidb_core::{Highlight, Database, UpdateResult};
use meilidb_core::{Database, Highlight, UpdateResult};
use meilidb_schema::SchemaAttr;
const INDEX_NAME: &str = "default";
@ -91,7 +91,7 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap();
let index = match database.open_index(INDEX_NAME) {
Some(index) => index,
None => database.create_index(INDEX_NAME).unwrap()
None => database.create_index(INDEX_NAME).unwrap(),
};
let done = database.set_update_callback(INDEX_NAME, Box::new(update_fn));
@ -108,14 +108,14 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
match index.main.schema(&writer)? {
Some(current_schema) => {
if current_schema != schema {
return Err(meilidb_core::Error::SchemaDiffer.into())
return Err(meilidb_core::Error::SchemaDiffer.into());
}
writer.abort();
},
}
None => {
index.schema_update(&mut writer, schema)?;
writer.commit().unwrap();
},
}
}
let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
@ -131,7 +131,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
if end_of_file {
break;
}
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
@ -147,7 +149,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
i += 1;
if let Some(group_size) = command.update_group_size {
if i % group_size == 0 { break }
if i % group_size == 0 {
break;
}
}
}
@ -163,15 +167,25 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
println!("Waiting for update {}", max_update_id);
for id in receiver {
if id == max_update_id { break }
if id == max_update_id {
break;
}
}
println!("database created in {:.2?} at: {:?}", start.elapsed(), command.database_path);
println!(
"database created in {:.2?} at: {:?}",
start.elapsed(),
command.database_path
);
if let Some(path) = command.compact_to_path {
let start = Instant::now();
let _file = database.copy_and_compact_to_path(&path)?;
println!("database compacted in {:.2?} at: {:?}", start.elapsed(), path);
println!(
"database compacted in {:.2?} at: {:?}",
start.elapsed(),
path
);
}
Ok(())
@ -182,7 +196,10 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
let [start, end] = match range {
[start, end] => [*start, *end],
_ => unreachable!(),
};
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
@ -221,12 +238,14 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Vacant(entry) => {
entry.insert(byte_length);
}
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
}
@ -252,22 +271,23 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
/// ```
fn crop_text(
text: &str,
highlights: impl IntoIterator<Item=Highlight>,
highlights: impl IntoIterator<Item = Highlight>,
context: usize,
) -> (String, Vec<Highlight>)
{
) -> (String, Vec<Highlight>) {
let mut highlights = highlights.into_iter().peekable();
let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
let char_index = highlights
.peek()
.map(|m| m.char_index as usize)
.unwrap_or(0);
let start = char_index.saturating_sub(context);
let text = text.chars().skip(start).take(context * 2).collect();
let highlights = highlights
.take_while(|m| {
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
})
.map(|highlight| {
Highlight { char_index: highlight.char_index - start as u16, ..highlight }
.take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2))
.map(|highlight| Highlight {
char_index: highlight.char_index - start as u16,
..highlight
})
.collect();
@ -276,7 +296,9 @@ fn crop_text(
fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
let env = &database.env;
let index = database.open_index(INDEX_NAME).expect("Could not find index");
let index = database
.open_index(INDEX_NAME)
.expect("Could not find index");
let reader = env.read_txn().unwrap();
let schema = index.main.schema(&reader)?;
@ -312,10 +334,15 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
(true, filter)
};
let attr = schema.attribute(&filter).expect("Could not find filtered attribute");
let attr = schema
.attribute(&filter)
.expect("Could not find filtered attribute");
builder.with_filter(move |document_id| {
let string: String = ref_index.document_attribute(ref_reader, document_id, attr).unwrap().unwrap();
let string: String = ref_index
.document_attribute(ref_reader, document_id, attr)
.unwrap()
.unwrap();
(string == "true") == positive
});
}
@ -326,8 +353,8 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
let number_of_documents = documents.len();
for mut doc in documents {
doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
doc.highlights
.sort_unstable_by_key(|m| (m.char_index, m.char_length));
let start_retrieve = Instant::now();
let result = index.document::<Document>(&reader, Some(&fields), doc.id);
@ -340,15 +367,18 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
print!("{}: ", name);
let attr = schema.attribute(&name).unwrap();
let highlights = doc.highlights.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attr)
.cloned();
let (text, highlights) = crop_text(&text, highlights, command.char_context);
let highlights = doc
.highlights
.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attr)
.cloned();
let (text, highlights) =
crop_text(&text, highlights, command.char_context);
let areas = create_highlight_areas(&text, &highlights);
display_highlights(&text, &areas)?;
println!();
}
},
}
Ok(None) => eprintln!("missing document"),
Err(e) => eprintln!("{}", e),
}
@ -366,12 +396,19 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
println!();
}
eprintln!("whole documents fields retrieve took {:.2?}", retrieve_duration);
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
},
eprintln!(
"whole documents fields retrieve took {:.2?}",
retrieve_duration
);
eprintln!(
"===== Found {} results in {:.2?} =====",
number_of_documents,
start_total.elapsed()
);
}
Err(err) => {
println!("Error: {:?}", err);
break
break;
}
}
}

View File

@ -1,8 +1,5 @@
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use once_cell::sync::OnceCell;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA,
};
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
@ -15,30 +12,30 @@ enum PrefixSetting {
}
/// Builds a Levenshtein `DFA` for `query`, picking the automaton builder from
/// the byte length of the query (`query.len()`):
///
/// - `0..=4` bytes: the `LEVDIST0` builder, created with `LevBuilder::new(0, true)`
/// - `5..=8` bytes: the `LEVDIST1` builder, created with `LevBuilder::new(1, true)`
/// - longer queries: the `LEVDIST2` builder, created with `LevBuilder::new(2, true)`
///
/// Each builder is lazily initialized on first use through `get_or_init`.
/// `setting` selects between `build_prefix_dfa` (`Prefix`) and `build_dfa`
/// (`NoPrefix`) on the chosen builder.
// NOTE(review): the first argument of `LevBuilder::new` is presumably the
// maximum edit distance -- confirm against the levenshtein_automata docs.
// NOTE(review): this span comes from a formatting diff, so the pre- and
// post-rustfmt spelling of the same statement appear one after the other.
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
use PrefixSetting::{Prefix, NoPrefix};
use PrefixSetting::{NoPrefix, Prefix};
match query.len() {
0 ..= 4 => {
0..=4 => {
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
match setting {
Prefix => builder.build_prefix_dfa(query),
Prefix => builder.build_prefix_dfa(query),
NoPrefix => builder.build_dfa(query),
}
},
5 ..= 8 => {
}
5..=8 => {
let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
match setting {
Prefix => builder.build_prefix_dfa(query),
Prefix => builder.build_prefix_dfa(query),
NoPrefix => builder.build_dfa(query),
}
},
}
_ => {
let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
match setting {
Prefix => builder.build_prefix_dfa(query),
Prefix => builder.build_prefix_dfa(query),
NoPrefix => builder.build_dfa(query),
}
},
}
}
}

View File

@ -6,14 +6,14 @@ use std::vec;
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
use meilidb_tokenizer::{split_query_string, is_cjk};
use meilidb_tokenizer::{is_cjk, split_query_string};
use crate::store;
use crate::error::MResult;
use crate::store;
use self::dfa::{build_dfa, build_prefix_dfa};
use self::query_enhancer::QueryEnhancerBuilder;
pub use self::query_enhancer::QueryEnhancer;
use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3;
@ -27,14 +27,9 @@ impl AutomatonProducer {
query: &str,
main_store: store::Main,
synonyms_store: store::Synonyms,
) -> MResult<(AutomatonProducer, QueryEnhancer)>
{
let (automatons, query_enhancer) = generate_automatons(
reader,
query,
main_store,
synonyms_store,
)?;
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
let (automatons, query_enhancer) =
generate_automatons(reader, query, main_store, synonyms_store)?;
Ok((AutomatonProducer { automatons }, query_enhancer))
}
@ -112,8 +107,7 @@ fn generate_automatons(
query: &str,
main_store: store::Main,
synonym_store: store::Synonyms,
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)>
{
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
@ -130,7 +124,6 @@ fn generate_automatons(
let mut original_automatons = Vec::new();
let mut original_words = query_words.iter().peekable();
while let Some(word) = original_words.next() {
let has_following_word = original_words.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
@ -148,29 +141,33 @@ fn generate_automatons(
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
while let Some((query_index, ngram_slice)) = ngrams.next() {
let query_range = query_index..query_index + n;
let ngram_nb_words = ngram_slice.len();
let ngram = ngram_slice.join(" ");
let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
let not_prefix_dfa =
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
// automaton of synonyms of the ngrams
let normalized = normalize_str(&ngram);
let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) };
let lev = if not_prefix_dfa {
build_dfa(&normalized)
} else {
build_prefix_dfa(&normalized)
};
let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() {
// only trigger alternatives when the last word has been typed
// i.e. "new " does not, but "new yo" triggers alternatives to "new york"
let base = std::str::from_utf8(base).unwrap();
let base_nb_words = split_query_string(base).count();
if ngram_nb_words != base_nb_words { continue }
if ngram_nb_words != base_nb_words {
continue;
}
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
let mut stream = synonyms.into_stream();
while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap();
@ -178,7 +175,11 @@ fn generate_automatons(
let nb_synonym_words = synonyms_words.len();
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
enhancer_builder.declare(
query_range.clone(),
real_query_index,
&synonyms_words,
);
for synonym in synonyms_words {
let automaton = if nb_synonym_words == 1 {

View File

@ -1,5 +1,5 @@
use std::cmp::Ordering::{Equal, Greater, Less};
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};
/// Returns `true` if the specified range can accept the given replacement words.
/// Returns `false` if the replacement words are already present in the original query
@ -34,13 +34,14 @@ use std::cmp::Ordering::{Less, Greater, Equal};
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
T: AsRef<str>,
where
S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there are fewer or equally many replacement words
// than there are already in the replaced range
return false
return false;
}
// retrieve the part to rewrite but with the length
@ -49,7 +50,9 @@ where S: AsRef<str>,
// check if the original query doesn't already contain
// the replacement words
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
!original
.map(AsRef::as_ref)
.eq(words.iter().map(AsRef::as_ref))
}
type Origin = usize;
@ -68,11 +71,20 @@ impl FakeIntervalTree {
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end { Equal } else { Less }
} else { Greater }
if point < r.end {
Equal
} else {
Less
}
} else {
Greater
}
});
let n = match element { Ok(n) => n, Err(n) => n };
let n = match element {
Ok(n) => n,
Err(n) => n,
};
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
@ -91,9 +103,13 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
/// Creates a builder for the given original `query` words.
///
/// `origins` starts as the identity sequence `0..=query.len()` (one entry
/// more than there are words), and `real_to_origin` starts with one entry per
/// origin `o`, mapping the range `o..o + 1` to `(o, 1)`.
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origins query indices based on their positions
let origins: Vec<_> = (0..query.len() + 1).collect();
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
QueryEnhancerBuilder { query, origins, real_to_origin }
QueryEnhancerBuilder {
query,
origins,
real_to_origin,
}
}
/// Update the final real to origin query indices mapping.
@ -101,12 +117,12 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
/// `range` is the original words range that this `replacement` words replace
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where T: AsRef<str>,
where
T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
@ -126,7 +142,8 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin.push((real_range, (range.start, real_length)));
self.real_to_origin
.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
@ -148,10 +165,10 @@ impl QueryEnhancer {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) =
self.real_to_origin
.query(real)
.expect("real has never been declared");
let (range, (origin, real_length)) = self
.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
@ -160,7 +177,10 @@ impl QueryEnhancer {
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 { new_origin = origin + i; break }
if count == 0 {
new_origin = origin + i;
break;
}
}
let n = real - range.start;
@ -168,15 +188,20 @@ impl QueryEnhancer {
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
Range {
start: (start + n) as u32,
end: (start + n + remaining) as u32,
}
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
Range {
start: (origin + n) as u32,
end: (origin + n + 1) as u32,
}
}
}
}
@ -382,16 +407,16 @@ mod tests {
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}

View File

@ -1,6 +1,6 @@
use std::cmp::Ordering;
use crate::criterion::Criterion;
use crate::RawDocument;
use std::cmp::Ordering;
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;

View File

@ -1,8 +1,8 @@
use std::cmp::Ordering;
use meilidb_schema::SchemaAttr;
use sdset::Set;
use slice_group_by::GroupBy;
use meilidb_schema::SchemaAttr;
use crate::criterion::Criterion;
use crate::RawDocument;
@ -13,8 +13,7 @@ fn number_exact_matches(
attribute: &[u16],
is_exact: &[bool],
fields_counts: &Set<(SchemaAttr, u64)>,
) -> usize
{
) -> usize {
let mut count = 0;
let mut index = 0;
@ -22,12 +21,16 @@ fn number_exact_matches(
let len = group.len();
let mut found_exact = false;
for (pos, _) in is_exact[index..index + len].iter().filter(|x| **x).enumerate() {
for (pos, _) in is_exact[index..index + len]
.iter()
.filter(|x| **x)
.enumerate()
{
found_exact = true;
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
let (_, count) = fields_counts[pos];
if count == 1 {
return usize::max_value()
return usize::max_value();
}
}
}
@ -81,18 +84,18 @@ mod tests {
#[test]
fn easy_case() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[false];
let query_index = &[0];
let attribute = &[0];
let is_exact = &[false];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
@ -108,18 +111,18 @@ mod tests {
#[test]
fn basic() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)

View File

@ -1,24 +1,20 @@
mod sum_of_typos;
mod document_id;
mod exact;
mod number_of_words;
mod words_proximity;
mod sort_by_attr;
mod sum_of_typos;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
mod sort_by_attr;
mod document_id;
mod words_proximity;
use std::cmp::Ordering;
use crate::RawDocument;
use std::cmp::Ordering;
pub use self::{
sum_of_typos::SumOfTypos,
number_of_words::NumberOfWords,
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
words_proximity::WordsProximity,
sum_of_words_attribute::SumOfWordsAttribute,
sum_of_words_position::SumOfWordsPosition,
exact::Exact,
sort_by_attr::SortByAttr,
document_id::DocumentId,
};
pub trait Criterion: Send + Sync {
@ -62,17 +58,18 @@ impl<T: Criterion + ?Sized> Criterion for Box<T> {
#[derive(Default)]
pub struct CriteriaBuilder<'a> {
inner: Vec<Box<dyn Criterion + 'a>>
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> CriteriaBuilder<'a>
{
impl<'a> CriteriaBuilder<'a> {
/// Creates a builder with an empty list of criteria.
pub fn new() -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::new() }
}
/// Creates an empty builder whose inner vector is pre-allocated with
/// `Vec::with_capacity(capacity)`.
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
CriteriaBuilder {
inner: Vec::with_capacity(capacity),
}
}
pub fn reserve(&mut self, additional: usize) {
@ -80,14 +77,16 @@ impl<'a> CriteriaBuilder<'a>
}
/// Appends `criterion` and returns the builder by value, so calls can be
/// chained. Delegates to `push`.
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
where C: Criterion,
where
C: Criterion,
{
self.push(criterion);
self
}
/// Boxes `criterion` and appends it to the end of the inner list, in place.
pub fn push<C: 'a>(&mut self, criterion: C)
where C: Criterion,
where
C: Criterion,
{
self.inner.push(Box::new(criterion));
}

View File

@ -1,7 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {

View File

@ -2,9 +2,9 @@ use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use meilidb_schema::{Schema, SchemaAttr};
use crate::criterion::Criterion;
use crate::{RawDocument, RankedMap};
use crate::{RankedMap, RawDocument};
use meilidb_schema::{Schema, SchemaAttr};
/// A helper struct that permits sorting documents by
/// some of their stored attributes.
@ -51,8 +51,7 @@ impl<'a> SortByAttr<'a> {
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
) -> Result<SortByAttr<'a>, SortByAttrError> {
SortByAttr::new(ranked_map, schema, attr_name, false)
}
@ -60,8 +59,7 @@ impl<'a> SortByAttr<'a> {
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
) -> Result<SortByAttr<'a>, SortByAttrError> {
SortByAttr::new(ranked_map, schema, attr_name, true)
}
@ -70,8 +68,7 @@ impl<'a> SortByAttr<'a> {
schema: &Schema,
attr_name: &str,
reversed: bool,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
) -> Result<SortByAttr<'a>, SortByAttrError> {
let attr = match schema.attribute(attr_name) {
Some(attr) => attr,
None => return Err(SortByAttrError::AttributeNotFound),
@ -81,7 +78,11 @@ impl<'a> SortByAttr<'a> {
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
}
Ok(SortByAttr { ranked_map, attr, reversed })
Ok(SortByAttr {
ranked_map,
attr,
reversed,
})
}
}
@ -93,11 +94,15 @@ impl<'a> Criterion for SortByAttr<'a> {
match (lhs, rhs) {
(Some(lhs), Some(rhs)) => {
let order = lhs.cmp(&rhs);
if self.reversed { order.reverse() } else { order }
},
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
if self.reversed {
order.reverse()
} else {
order
}
}
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
}
}
@ -122,4 +127,4 @@ impl fmt::Display for SortByAttrError {
}
}
impl Error for SortByAttrError { }
impl Error for SortByAttrError {}

View File

@ -11,10 +11,10 @@ use crate::RawDocument;
/// Table lookup returning a hard-coded approximation of `log10(n + 1)` for
/// `n` in `0..=3`.
///
/// # Panics
///
/// Panics with `"invalid number"` for any `n` greater than 3.
// NOTE(review): this span comes from a formatting diff, so each match arm is
// listed twice (before and after rustfmt realigned the trailing comments).
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}

View File

@ -1,7 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {

View File

@ -1,7 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {

View File

@ -1,7 +1,7 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::{self, Ordering};
const MAX_DISTANCE: u16 = 8;
@ -19,7 +19,9 @@ fn index_proximity(lhs: u16, rhs: u16) -> u16 {
}
/// Proximity between two matches, each given as an `(attribute, word_index)` pair.
///
/// Returns `MAX_DISTANCE` when the two matches are in different attributes;
/// otherwise defers to `index_proximity` on the two word indexes.
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
if lattr != rattr { return MAX_DISTANCE }
if lattr != rattr {
return MAX_DISTANCE;
}
index_proximity(lwi, rwi)
}
@ -42,15 +44,18 @@ fn matches_proximity(
distance: &[u8],
attribute: &[u16],
word_index: &[u16],
) -> u16
{
) -> u16 {
let mut query_index_groups = query_index.linear_group();
let mut proximity = 0;
let mut index = 0;
let get_attr_wi = |index: usize, group_len: usize| {
// retrieve the first distance group (with the lowest values)
let len = distance[index..index + group_len].linear_group().next().unwrap().len();
let len = distance[index..index + group_len]
.linear_group()
.next()
.unwrap()
.len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
@ -110,7 +115,6 @@ mod tests {
#[test]
fn three_different_attributes() {
// "soup" "of the" "the day"
//
// { id: 0, attr: 0, attr_index: 0 }
@ -120,19 +124,21 @@ mod tests {
// { id: 3, attr: 3, attr_index: 1 }
let query_index = &[0, 1, 2, 2, 3];
let distance = &[0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 2, 3];
let word_index = &[0, 0, 1, 0, 1];
let distance = &[0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 2, 3];
let word_index = &[0, 0, 1, 0, 1];
// soup -> of = 8
// + of -> the = 1
// + the -> day = 8 (not 1)
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
assert_eq!(
matches_proximity(query_index, distance, attribute, word_index),
17
);
}
#[test]
fn two_different_attributes() {
// "soup day" "soup of the day"
//
// { id: 0, attr: 0, attr_index: 0 }
@ -143,13 +149,16 @@ mod tests {
// { id: 3, attr: 1, attr_index: 3 }
let query_index = &[0, 0, 1, 2, 3, 3];
let distance = &[0, 0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 1, 0, 1];
let word_index = &[0, 0, 1, 2, 1, 3];
let distance = &[0, 0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 1, 0, 1];
let word_index = &[0, 0, 1, 2, 1, 3];
// soup -> of = 1
// + of -> the = 1
// + the -> day = 1
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
assert_eq!(
matches_proximity(query_index, distance, attribute, word_index),
3
);
}
}

View File

@ -1,13 +1,13 @@
use std::collections::hash_map::{HashMap, Entry};
use std::collections::hash_map::{Entry, HashMap};
use std::fs::File;
use std::path::Path;
use std::sync::{Arc, RwLock};
use std::{fs, thread};
use zlmdb::{Result as ZResult, CompactionOption};
use zlmdb::types::{Str, Unit};
use crossbeam_channel::Receiver;
use log::{debug, error};
use zlmdb::types::{Str, Unit};
use zlmdb::{CompactionOption, Result as ZResult};
use crate::{store, update, Index, MResult};
@ -32,20 +32,32 @@ fn update_awaiter(
loop {
let mut writer = match env.write_txn() {
Ok(writer) => writer,
Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break }
Err(e) => {
error!("LMDB writer transaction begin failed: {}", e);
break;
}
};
match update::update_task(&mut writer, index.clone()) {
Ok(Some(status)) => {
if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) }
if let Err(e) = writer.commit() {
error!("update transaction failed: {}", e)
}
if let Some(ref callback) = *update_fn.load() {
(callback)(status);
}
},
}
// no more updates to handle for now
Ok(None) => { debug!("no more updates"); writer.abort(); break },
Err(e) => { error!("update task failed: {}", e); writer.abort() },
Ok(None) => {
debug!("no more updates");
writer.abort();
break;
}
Err(e) => {
error!("update task failed: {}", e);
writer.abort()
}
}
}
}
@ -76,14 +88,16 @@ impl Database {
// open the previously aggregated indexes
let mut indexes = HashMap::new();
for index_name in must_open {
let (sender, receiver) = crossbeam_channel::bounded(100);
let index = match store::open(&env, &index_name, sender.clone())? {
Some(index) => index,
None => {
log::warn!("the index {} doesn't exist or has not all the databases", index_name);
log::warn!(
"the index {} doesn't exist or has not all the databases",
index_name
);
continue;
},
}
};
let update_fn = Arc::new(ArcSwapFn::empty());
@ -100,10 +114,18 @@ impl Database {
sender.send(()).unwrap();
let result = indexes.insert(index_name, (index, update_fn, handle));
assert!(result.is_none(), "The index should not have been already open");
assert!(
result.is_none(),
"The index should not have been already open"
);
}
Ok(Database { env, common_store, indexes_store, indexes: RwLock::new(indexes) })
Ok(Database {
env,
common_store,
indexes_store,
indexes: RwLock::new(indexes),
})
}
pub fn open_index(&self, name: impl AsRef<str>) -> Option<Index> {
@ -152,7 +174,7 @@ impl Database {
let update_fn = Some(Arc::new(update_fn));
current_update_fn.swap(update_fn);
true
},
}
None => false,
}
}
@ -160,7 +182,10 @@ impl Database {
/// Clears the update callback registered for the index called `name`.
///
/// Takes a read lock on `self.indexes`, looks the index up by name and swaps
/// its callback slot to `None`. Returns `true` when the index was found and
/// `false` otherwise.
///
/// # Panics
///
/// Panics if the `indexes` lock is poisoned (`read().unwrap()`).
pub fn unset_update_callback(&self, name: impl AsRef<str>) -> bool {
let indexes_lock = self.indexes.read().unwrap();
match indexes_lock.get(name.as_ref()) {
Some((_, current_update_fn, _)) => { current_update_fn.swap(None); true },
Some((_, current_update_fn, _)) => {
current_update_fn.swap(None);
true
}
None => false,
}
}

View File

@ -1,5 +1,5 @@
use std::hash::Hash;
use hashbrown::HashMap;
use std::hash::Hash;
pub struct DistinctMap<K> {
inner: HashMap<K, usize>,

View File

@ -1,6 +1,6 @@
use std::{error, fmt, io};
use crate::serde::{DeserializerError, SerializerError};
use serde_json::Error as SerdeJsonError;
use crate::serde::{SerializerError, DeserializerError};
use std::{error, fmt, io};
pub type MResult<T> = Result<T, Error>;
@ -90,7 +90,7 @@ impl fmt::Display for Error {
}
}
impl error::Error for Error { }
impl error::Error for Error {}
#[derive(Debug)]
pub enum UnsupportedOperation {

View File

@ -1,7 +1,9 @@
#[cfg(test)]
#[macro_use] extern crate assert_matches;
#[macro_use]
extern crate assert_matches;
mod automaton;
pub mod criterion;
mod database;
mod distinct_map;
mod error;
@ -9,31 +11,41 @@ mod number;
mod query_builder;
mod ranked_map;
mod raw_document;
mod reordered_attrs;
mod update;
pub mod criterion;
pub mod raw_indexer;
mod reordered_attrs;
pub mod serde;
pub mod store;
mod update;
pub use self::database::{Database, BoxUpdateFn};
pub use self::database::{BoxUpdateFn, Database};
pub use self::error::{Error, MResult};
pub use self::number::{Number, ParseNumberError};
pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{UpdateStatus, UpdateResult, UpdateType};
pub use self::update::{UpdateResult, UpdateStatus, UpdateType};
use ::serde::{Deserialize, Serialize};
use zerocopy::{AsBytes, FromBytes};
use ::serde::{Serialize, Deserialize};
/// Represents an internally generated, unique document identifier.
///
/// It tells the database which document you want to deserialize.
/// Helpful for custom ranking.
///
/// This is a `#[repr(C)]` newtype over a public `u64`.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Serialize, Deserialize)]
#[derive(AsBytes, FromBytes)]
#[derive(
Debug,
Copy,
Clone,
Eq,
PartialEq,
PartialOrd,
Ord,
Hash,
Serialize,
Deserialize,
AsBytes,
FromBytes,
)]
#[repr(C)]
pub struct DocumentId(pub u64);
@ -42,8 +54,7 @@ pub struct DocumentId(pub u64);
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(AsBytes, FromBytes)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
@ -109,7 +120,10 @@ pub struct Document {
impl Document {
/// Builds a `Document` from a `RawDocument` by moving out its `id` and
/// `highlights`.
///
/// This variant is compiled only outside of tests (`#[cfg(not(test))]`);
/// the `#[cfg(test)]` counterpart additionally fills a `matches` field.
#[cfg(not(test))]
fn from_raw(raw: RawDocument) -> Document {
Document { id: raw.id, highlights: raw.highlights }
Document {
id: raw.id,
highlights: raw.highlights,
}
}
#[cfg(test)]
@ -134,7 +148,11 @@ impl Document {
matches.push(match_);
}
Document { id: raw.id, matches, highlights: raw.highlights }
Document {
id: raw.id,
matches,
highlights: raw.highlights,
}
}
}

View File

@ -1,12 +1,11 @@
use std::num::{ParseIntError, ParseFloatError};
use std::str::FromStr;
use std::fmt;
use std::num::{ParseFloatError, ParseIntError};
use std::str::FromStr;
use ordered_float::OrderedFloat;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Number {
Unsigned(u64),
Signed(i64),
@ -32,7 +31,11 @@ impl FromStr for Number {
Err(error) => error,
};
Err(ParseNumberError { uint_error, int_error, float_error })
Err(ParseNumberError {
uint_error,
int_error,
float_error,
})
}
}
@ -46,10 +49,17 @@ pub struct ParseNumberError {
/// Human-readable rendering of a failed number parse.
///
/// The message always starts with `can not parse number: ` followed by the
/// collected errors, separated by `, `.
impl fmt::Display for ParseNumberError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// When the unsigned and signed attempts failed with the same error,
// print that error once instead of repeating it.
if self.uint_error == self.int_error {
write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
write!(
f,
"can not parse number: {}, {}",
self.uint_error, self.float_error
)
} else {
// Otherwise report all three errors: unsigned, signed, then float.
write!(f, "can not parse number: {}, {}, {}",
self.uint_error, self.int_error, self.float_error)
write!(
f,
"can not parse number: {}, {}, {}",
self.uint_error, self.int_error, self.float_error
)
}
}
}

View File

@ -2,17 +2,17 @@ use hashbrown::HashMap;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Instant, Duration};
use std::time::{Duration, Instant};
use fst::{IntoStreamer, Streamer};
use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::raw_document::{RawDocument, raw_documents_from};
use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria};
use crate::{store, MResult, reordered_attrs::ReorderedAttrs};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
pub struct QueryBuilder<'c, 'f, 'd> {
criteria: Criteria<'c>,
@ -29,8 +29,7 @@ pub struct QueryBuilder<'c, 'f, 'd> {
fn multiword_rewrite_matches(
mut matches: Vec<(DocumentId, TmpMatch)>,
query_enhancer: &QueryEnhancer,
) -> SetBuf<(DocumentId, TmpMatch)>
{
) -> SetBuf<(DocumentId, TmpMatch)> {
let mut padded_matches = Vec::with_capacity(matches.len());
// we sort the matches by word index to make them rewritable
@ -38,7 +37,6 @@ fn multiword_rewrite_matches(
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
@ -47,18 +45,20 @@ fn multiword_rewrite_matches(
// for each match at the same position
// in this document attribute
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for (id, match_) in same_word_index {
let mut replacement = query_enhancer.replacement(match_.query_index);
let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
let match_ = TmpMatch {
query_index,
word_index,
..match_.clone()
};
padded_matches.push((*id, match_));
}
@ -67,22 +67,30 @@ fn multiword_rewrite_matches(
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let padmatch = TmpMatch { query_index, word_index, ..match_.clone() };
let padmatch = TmpMatch {
query_index,
word_index,
..match_.clone()
};
for (_, nmatch_) in next_group {
let mut rep = query_enhancer.replacement(nmatch_.query_index);
let query_index = rep.next().unwrap();
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
for (i, query_index) in replacement.clone().enumerate().take(i)
{
let word_index =
match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch {
query_index,
word_index,
..match_.clone()
};
padded_matches.push((*id, match_));
biggest = biggest.max(i + 1);
}
@ -97,7 +105,7 @@ fn multiword_rewrite_matches(
// if we do not find a corresponding padding in the
// next groups, stop here and pad what was found
break
break;
}
if !found {
@ -105,7 +113,11 @@ fn multiword_rewrite_matches(
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
let match_ = TmpMatch {
query_index,
word_index,
..match_.clone()
};
padded_matches.push((*id, match_));
}
@ -132,13 +144,17 @@ fn fetch_raw_documents(
main_store: &store::Main,
postings_lists_store: &store::PostingsLists,
documents_fields_counts_store: &store::DocumentsFieldsCounts,
) -> MResult<Vec<RawDocument>>
{
) -> MResult<Vec<RawDocument>> {
let mut matches = Vec::new();
let mut highlights = Vec::new();
for automaton in automatons {
let Automaton { index, is_exact, query_len, .. } = automaton;
let Automaton {
index,
is_exact,
query_len,
..
} = automaton;
let dfa = automaton.dfa();
let words = match main_store.words_fst(reader)? {
@ -210,8 +226,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
postings_lists: store::PostingsLists,
documents_fields_counts: store::DocumentsFieldsCounts,
synonyms: store::Synonyms,
) -> QueryBuilder<'c, 'f, 'd>
{
) -> QueryBuilder<'c, 'f, 'd> {
QueryBuilder::with_criteria(
main,
postings_lists,
@ -227,8 +242,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
documents_fields_counts: store::DocumentsFieldsCounts,
synonyms: store::Synonyms,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, 'f, 'd>
{
) -> QueryBuilder<'c, 'f, 'd> {
QueryBuilder {
criteria,
searchable_attrs: None,
@ -245,7 +259,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
/// Sets the document filter of this builder to `function`.
///
/// The closure is boxed and stored in `self.filter`, replacing any filter
/// set previously. It receives a `DocumentId` and returns a `bool`
/// (presumably `true` means "keep this document" — confirm against the
/// query code that evaluates the filter).
///
/// The closure must be valid for the builder's `'f` lifetime.
pub fn with_filter<F>(&mut self, function: F)
where F: Fn(DocumentId) -> bool + 'f,
where
F: Fn(DocumentId) -> bool + 'f,
{
self.filter = Some(Box::new(function))
}
@ -255,13 +270,16 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
}
/// Sets the distinct rule of this builder.
///
/// `function` maps a `DocumentId` to an optional `u64` key; it is boxed and
/// stored in `self.distinct` together with `size`, replacing any rule set
/// previously. (`size` is presumably the number of documents allowed per
/// distinct key — confirm against the distinct query code.)
///
/// The closure must be valid for the builder's `'d` lifetime.
///
/// NOTE(review): the type parameter `K` does not appear in the parameters,
/// the bounds or the body, so it cannot be inferred and callers likely have
/// to name it explicitly. Consider removing it in an API-breaking release.
pub fn with_distinct<F, K>(&mut self, function: F, size: usize)
where F: Fn(DocumentId) -> Option<u64> + 'd,
where
F: Fn(DocumentId) -> Option<u64> + 'd,
{
self.distinct = Some((Box::new(function), size))
}
/// Registers `attribute` as a searchable attribute.
///
/// `self.searchable_attrs` starts out unset; the first call creates it with
/// `ReorderedAttrs::new` and every call inserts `attribute` into it.
pub fn add_searchable_attribute(&mut self, attribute: u16) {
let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new);
let reorders = self
.searchable_attrs
.get_or_insert_with(ReorderedAttrs::new);
reorders.insert_attribute(attribute);
}
@ -270,41 +288,36 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
reader: &zlmdb::RoTxn,
query: &str,
range: Range<usize>,
) -> MResult<Vec<Document>>
{
) -> MResult<Vec<Document>> {
match self.distinct {
Some((distinct, distinct_size)) => {
raw_query_with_distinct(
reader,
query,
range,
self.filter,
distinct,
distinct_size,
self.timeout,
self.criteria,
self.searchable_attrs,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,
self.synonyms_store,
)
},
None => {
raw_query(
reader,
query,
range,
self.filter,
self.timeout,
self.criteria,
self.searchable_attrs,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,
self.synonyms_store,
)
}
Some((distinct, distinct_size)) => raw_query_with_distinct(
reader,
query,
range,
self.filter,
distinct,
distinct_size,
self.timeout,
self.criteria,
self.searchable_attrs,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,
self.synonyms_store,
),
None => raw_query(
reader,
query,
range,
self.filter,
self.timeout,
self.criteria,
self.searchable_attrs,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,
self.synonyms_store,
),
}
}
}
@ -326,7 +339,8 @@ fn raw_query<'c, FI>(
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where FI: Fn(DocumentId) -> bool,
where
FI: Fn(DocumentId) -> bool,
{
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
@ -347,18 +361,14 @@ where FI: Fn(DocumentId) -> bool,
postings_lists_store,
documents_fields_counts_store,
synonyms_store,
)
);
}
let start_processing = Instant::now();
let mut raw_documents_processed = Vec::with_capacity(range.len());
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
synonyms_store,
)?;
let (automaton_producer, query_enhancer) =
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
let mut automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
@ -382,7 +392,7 @@ where FI: Fn(DocumentId) -> bool,
// stop processing when time is running out
if let Some(timeout) = timeout {
if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
break
break;
}
}
@ -409,20 +419,27 @@ where FI: Fn(DocumentId) -> bool,
// we have sorted enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if documents_seen >= range.end { continue 'criteria }
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
// once we classified the documents related to the current
// automatons we save that as the next valid result
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
let iter = raw_documents
.into_iter()
.skip(range.start)
.take(range.len());
raw_documents_processed.clear();
raw_documents_processed.extend(iter);
// stop processing when time is running out
if let Some(timeout) = timeout {
if start_processing.elapsed() > timeout { break }
if start_processing.elapsed() > timeout {
break;
}
}
}
@ -456,18 +473,15 @@ fn raw_query_with_distinct<'c, FI, FD>(
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<u64>,
where
FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<u64>,
{
let start_processing = Instant::now();
let mut raw_documents_processed = Vec::new();
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
synonyms_store,
)?;
let (automaton_producer, query_enhancer) =
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
let mut automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
@ -491,7 +505,7 @@ where FI: Fn(DocumentId) -> bool,
// stop processing when time is running out
if let Some(timeout) = timeout {
if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
break
break;
}
}
@ -528,7 +542,7 @@ where FI: Fn(DocumentId) -> bool,
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
},
}
None => true,
};
@ -543,7 +557,9 @@ where FI: Fn(DocumentId) -> bool,
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end { break }
if buf_distinct.len() >= range.end {
break;
}
}
documents_seen += group.len();
@ -558,7 +574,9 @@ where FI: Fn(DocumentId) -> bool,
// we have sorted enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if buf_distinct.len() >= range.end { continue 'criteria }
if buf_distinct.len() >= range.end {
continue 'criteria;
}
}
}
}
@ -583,14 +601,18 @@ where FI: Fn(DocumentId) -> bool,
if distinct_accepted && seen.len() > range.start {
raw_documents_processed.push(document);
if raw_documents_processed.len() == range.len() { break }
if raw_documents_processed.len() == range.len() {
break;
}
}
}
}
// stop processing when time is running out
if let Some(timeout) = timeout {
if start_processing.elapsed() > timeout { break }
if start_processing.elapsed() > timeout {
break;
}
}
}
@ -611,20 +633,20 @@ mod tests {
use std::collections::{BTreeSet, HashMap};
use std::iter::FromIterator;
use fst::{Set, IntoStreamer};
use fst::{IntoStreamer, Set};
use meilidb_schema::SchemaAttr;
use sdset::SetBuf;
use tempfile::TempDir;
use meilidb_schema::SchemaAttr;
use crate::automaton::normalize_str;
use crate::database::Database;
use crate::DocIndex;
use crate::store::Index;
use crate::DocIndex;
fn set_from_stream<'f, I, S>(stream: I) -> Set
where
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=&'a [u8]>,
S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>,
I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>,
S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>,
{
let mut builder = fst::SetBuilder::memory();
builder.extend_stream(stream).unwrap();
@ -687,14 +709,23 @@ mod tests {
let word = word.to_lowercase();
let alternatives = match self.index.synonyms.synonyms(&writer, word.as_bytes()).unwrap() {
let alternatives = match self
.index
.synonyms
.synonyms(&writer, word.as_bytes())
.unwrap()
{
Some(alternatives) => alternatives,
None => fst::Set::default(),
};
let new = sdset_into_fstset(&new);
let new_alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union());
self.index.synonyms.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives).unwrap();
let new_alternatives =
set_from_stream(alternatives.op().add(new.into_stream()).r#union());
self.index
.synonyms
.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives)
.unwrap();
let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() {
Some(synonyms) => synonyms,
@ -702,14 +733,17 @@ mod tests {
};
let synonyms_fst = insert_key(&synonyms, word.as_bytes());
self.index.main.put_synonyms_fst(&mut writer, &synonyms_fst).unwrap();
self.index
.main
.put_synonyms_fst(&mut writer, &synonyms_fst)
.unwrap();
writer.commit().unwrap();
}
}
impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase {
fn from_iter<I: IntoIterator<Item=(&'a str, &'a [DocIndex])>>(iter: I) -> Self {
fn from_iter<I: IntoIterator<Item = (&'a str, &'a [DocIndex])>>(iter: I) -> Self {
let tempdir = TempDir::new().unwrap();
let database = Database::open_or_create(&tempdir).unwrap();
let index = database.create_index("default").unwrap();
@ -724,7 +758,10 @@ mod tests {
for (word, indexes) in iter {
let word = word.to_lowercase().into_bytes();
words_fst.insert(word.clone());
postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
postings_lists
.entry(word)
.or_insert_with(Vec::new)
.extend_from_slice(indexes);
for idx in indexes {
fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
}
@ -736,31 +773,33 @@ mod tests {
for (word, postings_list) in postings_lists {
let postings_list = SetBuf::from_dirty(postings_list);
index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
index
.postings_lists
.put_postings_list(&mut writer, &word, &postings_list)
.unwrap();
}
for ((docid, attr, _), count) in fields_counts {
let prev = index.documents_fields_counts
.document_field_count(
&mut writer,
docid,
SchemaAttr(attr),
).unwrap();
let prev = index
.documents_fields_counts
.document_field_count(&mut writer, docid, SchemaAttr(attr))
.unwrap();
let prev = prev.unwrap_or(0);
index.documents_fields_counts
.put_document_field_count(
&mut writer,
docid,
SchemaAttr(attr),
prev + count,
).unwrap();
index
.documents_fields_counts
.put_document_field_count(&mut writer, docid, SchemaAttr(attr), prev + count)
.unwrap();
}
writer.commit().unwrap();
TempDatabase { database, index, _tempdir: tempdir }
TempDatabase {
database,
index,
_tempdir: tempdir,
}
}
}
@ -768,8 +807,8 @@ mod tests {
fn simple() {
let store = TempDatabase::from_iter(vec![
("iphone", &[doc_char_index(0, 0, 0)][..]),
("from", &[doc_char_index(0, 1, 1)][..]),
("apple", &[doc_char_index(0, 2, 2)][..]),
("from", &[doc_char_index(0, 1, 1)][..]),
("apple", &[doc_char_index(0, 2, 2)][..]),
]);
let env = &store.database.env;
@ -791,9 +830,7 @@ mod tests {
#[test]
fn simple_synonyms() {
let mut store = TempDatabase::from_iter(vec![
("hello", &[doc_index(0, 0)][..]),
]);
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
@ -825,9 +862,7 @@ mod tests {
#[test]
fn prefix_synonyms() {
let mut store = TempDatabase::from_iter(vec![
("hello", &[doc_index(0, 0)][..]),
]);
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
@ -872,9 +907,7 @@ mod tests {
#[test]
fn levenshtein_synonyms() {
let mut store = TempDatabase::from_iter(vec![
("hello", &[doc_index(0, 0)][..]),
]);
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));
@ -907,9 +940,9 @@ mod tests {
#[test]
fn harder_synonyms() {
let mut store = TempDatabase::from_iter(vec![
("hello", &[doc_index(0, 0)][..]),
("hello", &[doc_index(0, 0)][..]),
("bonjour", &[doc_index(1, 3)]),
("salut", &[doc_index(2, 5)]),
("salut", &[doc_index(2, 5)]),
]);
store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"]));
@ -987,17 +1020,22 @@ mod tests {
/// Unique word has multi-word synonyms
fn unique_to_multiword_synonyms() {
let mut store = TempDatabase::from_iter(vec![
("new", &[doc_char_index(0, 0, 0)][..]),
("york", &[doc_char_index(0, 1, 1)][..]),
("city", &[doc_char_index(0, 2, 2)][..]),
("new", &[doc_char_index(0, 0, 0)][..]),
("york", &[doc_char_index(0, 1, 1)][..]),
("city", &[doc_char_index(0, 2, 2)][..]),
("subway", &[doc_char_index(0, 3, 3)][..]),
("NY", &[doc_char_index(1, 0, 0)][..]),
("NY", &[doc_char_index(1, 0, 0)][..]),
("subway", &[doc_char_index(1, 1, 1)][..]),
]);
store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
store.add_synonym(
"NY",
SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
);
store.add_synonym(
"NYC",
SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
@ -1056,20 +1094,18 @@ mod tests {
#[test]
fn unique_to_multiword_synonyms_words_proximity() {
let mut store = TempDatabase::from_iter(vec![
("new", &[doc_char_index(0, 0, 0)][..]),
("york", &[doc_char_index(0, 1, 1)][..]),
("city", &[doc_char_index(0, 2, 2)][..]),
("new", &[doc_char_index(0, 0, 0)][..]),
("york", &[doc_char_index(0, 1, 1)][..]),
("city", &[doc_char_index(0, 2, 2)][..]),
("subway", &[doc_char_index(0, 3, 3)][..]),
("york", &[doc_char_index(1, 0, 0)][..]),
("new", &[doc_char_index(1, 1, 1)][..]),
("york", &[doc_char_index(1, 0, 0)][..]),
("new", &[doc_char_index(1, 1, 1)][..]),
("subway", &[doc_char_index(1, 2, 2)][..]),
("NY", &[doc_char_index(2, 0, 0)][..]),
("NY", &[doc_char_index(2, 0, 0)][..]),
("subway", &[doc_char_index(2, 1, 1)][..]),
]);
store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"]));
store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"]));
let env = &store.database.env;
let reader = env.read_txn().unwrap();
@ -1120,11 +1156,10 @@ mod tests {
#[test]
fn unique_to_multiword_synonyms_cumulative_word_index() {
let mut store = TempDatabase::from_iter(vec![
("NY", &[doc_char_index(0, 0, 0)][..]),
("NY", &[doc_char_index(0, 0, 0)][..]),
("subway", &[doc_char_index(0, 1, 1)][..]),
("new", &[doc_char_index(1, 0, 0)][..]),
("york", &[doc_char_index(1, 1, 1)][..]),
("new", &[doc_char_index(1, 0, 0)][..]),
("york", &[doc_char_index(1, 1, 1)][..]),
("subway", &[doc_char_index(1, 2, 2)][..]),
]);
@ -1175,20 +1210,25 @@ mod tests {
/// Unique word has multi-word synonyms
fn harder_unique_to_multiword_synonyms_one() {
let mut store = TempDatabase::from_iter(vec![
("new", &[doc_char_index(0, 0, 0)][..]),
("york", &[doc_char_index(0, 1, 1)][..]),
("city", &[doc_char_index(0, 2, 2)][..]),
("yellow", &[doc_char_index(0, 3, 3)][..]),
("subway", &[doc_char_index(0, 4, 4)][..]),
("broken", &[doc_char_index(0, 5, 5)][..]),
("NY", &[doc_char_index(1, 0, 0)][..]),
("blue", &[doc_char_index(1, 1, 1)][..]),
("subway", &[doc_char_index(1, 2, 2)][..]),
("new", &[doc_char_index(0, 0, 0)][..]),
("york", &[doc_char_index(0, 1, 1)][..]),
("city", &[doc_char_index(0, 2, 2)][..]),
("yellow", &[doc_char_index(0, 3, 3)][..]),
("subway", &[doc_char_index(0, 4, 4)][..]),
("broken", &[doc_char_index(0, 5, 5)][..]),
("NY", &[doc_char_index(1, 0, 0)][..]),
("blue", &[doc_char_index(1, 1, 1)][..]),
("subway", &[doc_char_index(1, 2, 2)][..]),
]);
store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
store.add_synonym(
"NY",
SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
);
store.add_synonym(
"NYC",
SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
@ -1249,21 +1289,26 @@ mod tests {
/// Unique word has multi-word synonyms
fn even_harder_unique_to_multiword_synonyms() {
let mut store = TempDatabase::from_iter(vec![
("new", &[doc_char_index(0, 0, 0)][..]),
("york", &[doc_char_index(0, 1, 1)][..]),
("city", &[doc_char_index(0, 2, 2)][..]),
("yellow", &[doc_char_index(0, 3, 3)][..]),
("new", &[doc_char_index(0, 0, 0)][..]),
("york", &[doc_char_index(0, 1, 1)][..]),
("city", &[doc_char_index(0, 2, 2)][..]),
("yellow", &[doc_char_index(0, 3, 3)][..]),
("underground", &[doc_char_index(0, 4, 4)][..]),
("train", &[doc_char_index(0, 5, 5)][..]),
("broken", &[doc_char_index(0, 6, 6)][..]),
("NY", &[doc_char_index(1, 0, 0)][..]),
("blue", &[doc_char_index(1, 1, 1)][..]),
("subway", &[doc_char_index(1, 2, 2)][..]),
("train", &[doc_char_index(0, 5, 5)][..]),
("broken", &[doc_char_index(0, 6, 6)][..]),
("NY", &[doc_char_index(1, 0, 0)][..]),
("blue", &[doc_char_index(1, 1, 1)][..]),
("subway", &[doc_char_index(1, 2, 2)][..]),
]);
store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
store.add_synonym(
"NY",
SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
);
store.add_synonym(
"NYC",
SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
);
store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
let env = &store.database.env;
@ -1330,30 +1375,36 @@ mod tests {
/// Multi-word has multi-word synonyms
fn multiword_to_multiword_synonyms() {
let mut store = TempDatabase::from_iter(vec![
("NY", &[doc_char_index(0, 0, 0)][..]),
("subway", &[doc_char_index(0, 1, 1)][..]),
("NYC", &[doc_char_index(1, 0, 0)][..]),
("blue", &[doc_char_index(1, 1, 1)][..]),
("subway", &[doc_char_index(1, 2, 2)][..]),
("broken", &[doc_char_index(1, 3, 3)][..]),
("new", &[doc_char_index(2, 0, 0)][..]),
("york", &[doc_char_index(2, 1, 1)][..]),
("NY", &[doc_char_index(0, 0, 0)][..]),
("subway", &[doc_char_index(0, 1, 1)][..]),
("NYC", &[doc_char_index(1, 0, 0)][..]),
("blue", &[doc_char_index(1, 1, 1)][..]),
("subway", &[doc_char_index(1, 2, 2)][..]),
("broken", &[doc_char_index(1, 3, 3)][..]),
("new", &[doc_char_index(2, 0, 0)][..]),
("york", &[doc_char_index(2, 1, 1)][..]),
("underground", &[doc_char_index(2, 2, 2)][..]),
("train", &[doc_char_index(2, 3, 3)][..]),
("broken", &[doc_char_index(2, 4, 4)][..]),
("train", &[doc_char_index(2, 3, 3)][..]),
("broken", &[doc_char_index(2, 4, 4)][..]),
]);
store.add_synonym("new york", SetBuf::from_dirty(vec![ "NYC", "NY", "new york city" ]));
store.add_synonym("new york city", SetBuf::from_dirty(vec![ "NYC", "NY", "new york" ]));
store.add_synonym("underground train", SetBuf::from_dirty(vec![ "subway" ]));
store.add_synonym(
"new york",
SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]),
);
store.add_synonym(
"new york city",
SetBuf::from_dirty(vec!["NYC", "NY", "new york"]),
);
store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"]));
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "new york underground train broken", 0..20).unwrap();
let results = builder
.query(&reader, "new york underground train broken", 0..20)
.unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
@ -1390,7 +1441,9 @@ mod tests {
assert_matches!(iter.next(), None);
let builder = store.query_builder();
let results = builder.query(&reader, "new york city underground train broken", 0..20).unwrap();
let results = builder
.query(&reader, "new york city underground train broken", 0..20)
.unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
@ -1436,14 +1489,14 @@ mod tests {
#[test]
fn intercrossed_multiword_synonyms() {
let mut store = TempDatabase::from_iter(vec![
("new", &[doc_index(0, 0)][..]),
("york", &[doc_index(0, 1)][..]),
("big", &[doc_index(0, 2)][..]),
("city", &[doc_index(0, 3)][..]),
("new", &[doc_index(0, 0)][..]),
("york", &[doc_index(0, 1)][..]),
("big", &[doc_index(0, 2)][..]),
("city", &[doc_index(0, 3)][..]),
]);
store.add_synonym("new york", SetBuf::from_dirty(vec![ "new york city" ]));
store.add_synonym("new york city", SetBuf::from_dirty(vec![ "new york" ]));
store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"]));
store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"]));
let env = &store.database.env;
let reader = env.read_txn().unwrap();
@ -1469,16 +1522,14 @@ mod tests {
assert_matches!(iter.next(), None);
let mut store = TempDatabase::from_iter(vec![
("NY", &[doc_index(0, 0)][..]),
("city", &[doc_index(0, 1)][..]),
("NY", &[doc_index(0, 0)][..]),
("city", &[doc_index(0, 1)][..]),
("subway", &[doc_index(0, 2)][..]),
("NY", &[doc_index(1, 0)][..]),
("NY", &[doc_index(1, 0)][..]),
("subway", &[doc_index(1, 1)][..]),
("NY", &[doc_index(2, 0)][..]),
("york", &[doc_index(2, 1)][..]),
("city", &[doc_index(2, 2)][..]),
("NY", &[doc_index(2, 0)][..]),
("york", &[doc_index(2, 1)][..]),
("city", &[doc_index(2, 2)][..]),
("subway", &[doc_index(2, 3)][..]),
]);
@ -1525,20 +1576,22 @@ mod tests {
#[test]
fn cumulative_word_indices() {
let mut store = TempDatabase::from_iter(vec![
("NYC", &[doc_index(0, 0)][..]),
("long", &[doc_index(0, 1)][..]),
("NYC", &[doc_index(0, 0)][..]),
("long", &[doc_index(0, 1)][..]),
("subway", &[doc_index(0, 2)][..]),
("cool", &[doc_index(0, 3)][..]),
("cool", &[doc_index(0, 3)][..]),
]);
store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"]));
store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "new york city long subway cool ", 0..20).unwrap();
let results = builder
.query(&reader, "new york city long subway cool ", 0..20)
.unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
@ -1560,8 +1613,7 @@ mod tests {
let mut store = TempDatabase::from_iter(vec![
("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex
("iphone", &[doc_index(1, 0)][..]),
("iphone", &[doc_index(1, 0)][..]),
]);
store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"]));
@ -1624,8 +1676,8 @@ mod tests {
#[test]
fn simple_concatenation() {
let store = TempDatabase::from_iter(vec![
("iphone", &[doc_index(0, 0)][..]),
("case", &[doc_index(0, 1)][..]),
("iphone", &[doc_index(0, 0)][..]),
("case", &[doc_index(0, 1)][..]),
]);
let env = &store.database.env;

View File

@ -2,12 +2,11 @@ use std::io::{Read, Write};
use hashbrown::HashMap;
use meilidb_schema::SchemaAttr;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use crate::{DocumentId, Number};
/// Map from a `(DocumentId, SchemaAttr)` pair to the `Number` stored for it.
///
/// Thin newtype around a `HashMap`; with `#[serde(transparent)]` it is
/// serialized and deserialized exactly like the inner map.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[derive(Serialize, Deserialize)]
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(transparent)]
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);

View File

@ -1,11 +1,11 @@
use std::sync::Arc;
use std::fmt;
use std::sync::Arc;
use meilidb_schema::SchemaAttr;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{TmpMatch, DocumentId, Highlight};
use crate::{DocumentId, Highlight, TmpMatch};
#[derive(Clone)]
pub struct RawDocument {
@ -20,7 +20,13 @@ impl RawDocument {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
unsafe {
&self
.matches
.matches
.query_index
.get_unchecked(r.start..r.end)
}
}
pub fn distance(&self) -> &[u8] {
@ -41,7 +47,13 @@ impl RawDocument {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
unsafe {
&self
.matches
.matches
.word_index
.get_unchecked(r.start..r.end)
}
}
pub fn is_exact(&self) -> &[bool] {
@ -55,12 +67,32 @@ impl RawDocument {
/// Hand-written `Debug` output for `RawDocument`.
///
/// Prints `RawDocument {`, then one line per field (`id`, `query_index`,
/// `distance`, `attribute`, `word_index`, `is_exact`) and a closing `}`.
/// Field labels are right-aligned in a 15-character column (`{:>15}`) and
/// every line ends with `\r\n`.
impl fmt::Debug for RawDocument {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("RawDocument {\r\n")?;
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
// The slice-valued fields below are formatted with `{:^5?}`
// (centered, width 5) rather than plain `{:?}`.
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"query_index",
self.query_index()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"distance",
self.distance()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"attribute",
self.attribute()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"word_index",
self.word_index()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"is_exact",
self.is_exact()
))?;
f.write_str("}")?;
Ok(())
}
}
@ -70,8 +102,7 @@ pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>,
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
) -> Vec<RawDocument>
{
) -> Vec<RawDocument> {
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
@ -94,10 +125,21 @@ pub fn raw_documents_from(
}
let matches = Arc::new(matches2);
docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
let matches = SharedMatches { range, matches: matches.clone() };
RawDocument { id, matches, highlights, fields_counts }
}).collect()
docs_ranges
.into_iter()
.map(|(id, range, highlights, fields_counts)| {
let matches = SharedMatches {
range,
matches: matches.clone(),
};
RawDocument {
id,
matches,
highlights,
fields_counts,
}
})
.collect()
}
#[derive(Debug, Copy, Clone)]

View File

@ -1,10 +1,10 @@
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use crate::{DocIndex, DocumentId};
use deunicode::deunicode_with_tofu;
use crate::{DocumentId, DocIndex};
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;
type Word = Vec<u8>; // TODO make it be a SmallVec
@ -60,7 +60,9 @@ impl RawIndexer {
&mut self.docs_words,
);
if !must_continue { break }
if !must_continue {
break;
}
number_of_words += 1;
}
@ -70,8 +72,9 @@ impl RawIndexer {
}
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where I: IntoIterator<Item=&'a str, IntoIter=IT>,
IT: Iterator<Item = &'a str> + Clone,
where
I: IntoIterator<Item = &'a str, IntoIter = IT>,
IT: Iterator<Item = &'a str> + Clone,
{
// TODO serialize this to one call to the SeqTokenizer loop
@ -88,14 +91,25 @@ impl RawIndexer {
&mut self.docs_words,
);
if !must_continue { break }
if !must_continue {
break;
}
}
let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
if lowercase_text.contains(is_cjk) { return lowercase_text }
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
}).collect();
let deunicoded: Vec<_> = lowercased
.into_iter()
.map(|lowercase_text| {
if lowercase_text.contains(is_cjk) {
return lowercase_text;
}
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
if lowercase_text != deunicoded {
deunicoded
} else {
lowercase_text
}
})
.collect();
let iter = deunicoded.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
@ -108,17 +122,21 @@ impl RawIndexer {
&mut self.docs_words,
);
if !must_continue { break }
if !must_continue {
break;
}
}
}
pub fn build(self) -> Indexed {
let words_doc_indexes = self.words_doc_indexes
let words_doc_indexes = self
.words_doc_indexes
.into_iter()
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
.collect();
let docs_words = self.docs_words
let docs_words = self
.docs_words
.into_iter()
.map(|(id, mut words)| {
words.sort_unstable();
@ -127,7 +145,10 @@ impl RawIndexer {
})
.collect();
Indexed { words_doc_indexes, docs_words }
Indexed {
words_doc_indexes,
docs_words,
}
}
}
@ -138,16 +159,20 @@ fn index_token(
word_limit: usize,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
{
if token.word_index >= word_limit { return false }
) -> bool {
if token.word_index >= word_limit {
return false;
}
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
},
}
None => return false,
}
@ -183,7 +208,9 @@ mod tests {
let text = "Zut, laspirateur, jai oublié de léteindre !";
indexer.index_text(docid, attr, text);
let Indexed { words_doc_indexes, .. } = indexer.build();
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
@ -191,7 +218,9 @@ mod tests {
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes.get(&"léteindre".to_owned().into_bytes()).is_some());
assert!(words_doc_indexes
.get(&"léteindre".to_owned().into_bytes())
.is_some());
}
#[test]
@ -203,7 +232,9 @@ mod tests {
let text = vec!["Zut, laspirateur, jai oublié de léteindre !"];
indexer.index_text_seq(docid, attr, text);
let Indexed { words_doc_indexes, .. } = indexer.build();
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
@ -211,6 +242,8 @@ mod tests {
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes.get(&"léteindre".to_owned().into_bytes()).is_some());
assert!(words_doc_indexes
.get(&"léteindre".to_owned().into_bytes())
.is_some());
}
}

View File

@ -6,7 +6,10 @@ pub struct ReorderedAttrs {
impl ReorderedAttrs {
pub fn new() -> ReorderedAttrs {
ReorderedAttrs { count: 0, reorders: Vec::new() }
ReorderedAttrs {
count: 0,
reorders: Vec::new(),
}
}
pub fn insert_attribute(&mut self, attribute: u16) {

View File

@ -77,13 +77,18 @@ impl ser::Serializer for ConvertToNumber {
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "Option" })
Err(SerializerError::UnrankableType {
type_name: "Option",
})
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
Err(SerializerError::UnrankableType { type_name: "Option" })
Err(SerializerError::UnrankableType {
type_name: "Option",
})
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@ -91,25 +96,29 @@ impl ser::Serializer for ConvertToNumber {
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "unit struct" })
Err(SerializerError::UnrankableType {
type_name: "unit struct",
})
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "unit variant" })
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType {
type_name: "unit variant",
})
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
value: &T,
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
value.serialize(self)
}
@ -119,15 +128,20 @@ impl ser::Serializer for ConvertToNumber {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
_value: &T,
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
Err(SerializerError::UnrankableType { type_name: "newtype variant" })
Err(SerializerError::UnrankableType {
type_name: "newtype variant",
})
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "sequence" })
Err(SerializerError::UnrankableType {
type_name: "sequence",
})
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
@ -137,10 +151,11 @@ impl ser::Serializer for ConvertToNumber {
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "tuple struct" })
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Err(SerializerError::UnrankableType {
type_name: "tuple struct",
})
}
fn serialize_tuple_variant(
@ -148,10 +163,11 @@ impl ser::Serializer for ConvertToNumber {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "tuple variant" })
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Err(SerializerError::UnrankableType {
type_name: "tuple variant",
})
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@ -161,10 +177,11 @@ impl ser::Serializer for ConvertToNumber {
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "struct" })
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(SerializerError::UnrankableType {
type_name: "struct",
})
}
fn serialize_struct_variant(
@ -172,9 +189,10 @@ impl ser::Serializer for ConvertToNumber {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "struct variant" })
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Err(SerializerError::UnrankableType {
type_name: "struct variant",
})
}
}

View File

@ -1,5 +1,5 @@
use serde::Serialize;
use serde::ser;
use serde::Serialize;
use super::SerializerError;
@ -17,7 +17,9 @@ impl ser::Serializer for ConvertToString {
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "boolean" })
Err(SerializerError::UnserializableType {
type_name: "boolean",
})
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
@ -73,13 +75,18 @@ impl ser::Serializer for ConvertToString {
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
Err(SerializerError::UnserializableType {
type_name: "Option",
})
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
Err(SerializerError::UnserializableType {
type_name: "Option",
})
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@ -87,25 +94,29 @@ impl ser::Serializer for ConvertToString {
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
Err(SerializerError::UnserializableType {
type_name: "unit struct",
})
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "unit variant",
})
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
value: &T,
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
value.serialize(self)
}
@ -115,15 +126,20 @@ impl ser::Serializer for ConvertToString {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
_value: &T,
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
Err(SerializerError::UnserializableType {
type_name: "newtype variant",
})
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
Err(SerializerError::UnserializableType {
type_name: "sequence",
})
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
@ -133,10 +149,11 @@ impl ser::Serializer for ConvertToString {
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "tuple struct",
})
}
fn serialize_tuple_variant(
@ -144,10 +161,11 @@ impl ser::Serializer for ConvertToString {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "tuple variant",
})
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@ -157,10 +175,11 @@ impl ser::Serializer for ConvertToString {
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct" })
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "struct",
})
}
fn serialize_struct_variant(
@ -168,9 +187,10 @@ impl ser::Serializer for ConvertToString {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "struct variant",
})
}
}

View File

@ -1,12 +1,12 @@
use std::collections::HashSet;
use std::io::Cursor;
use std::{fmt, error::Error};
use std::{error::Error, fmt};
use meilidb_schema::{Schema, SchemaAttr};
use serde_json::Error as SerdeJsonError;
use serde_json::Deserializer as SerdeJsonDeserializer;
use serde_json::de::IoRead as SerdeJsonIoRead;
use serde::{de, forward_to_deserialize_any};
use serde_json::de::IoRead as SerdeJsonIoRead;
use serde_json::Deserializer as SerdeJsonDeserializer;
use serde_json::Error as SerdeJsonError;
use crate::store::DocumentsFields;
use crate::DocumentId;
@ -60,7 +60,8 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
where
V: de::Visitor<'de>,
{
self.deserialize_map(visitor)
}
@ -72,16 +73,21 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
where
V: de::Visitor<'de>,
{
let mut error = None;
let iter = self.documents_fields
let iter = self
.documents_fields
.document_fields(self.reader, self.document_id)?
.filter_map(|result| {
let (attr, value) = match result {
Ok(value) => value,
Err(e) => { error = Some(e); return None },
Err(e) => {
error = Some(e);
return None;
}
};
let is_displayed = self.schema.props(attr).is_displayed();
@ -99,7 +105,9 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
});
let map_deserializer = de::value::MapDeserializer::new(iter);
let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from);
let result = visitor
.visit_map(map_deserializer)
.map_err(DeserializerError::from);
match error.take() {
Some(error) => Err(error.into()),
@ -122,7 +130,8 @@ impl<'de> de::Deserializer<'de> for Value {
type Error = SerdeJsonError;
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
where
V: de::Visitor<'de>,
{
self.0.deserialize_any(visitor)
}

View File

@ -5,13 +5,14 @@ use serde::{ser, Serialize};
use serde_json::Value;
use siphasher::sip::SipHasher;
use super::{SerializerError, ConvertToString};
use super::{ConvertToString, SerializerError};
pub fn extract_document_id<D>(
identifier: &str,
document: &D,
) -> Result<Option<DocumentId>, SerializerError>
where D: serde::Serialize,
where
D: serde::Serialize,
{
let serializer = ExtractDocumentId { identifier };
document.serialize(serializer)
@ -77,13 +78,18 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
Err(SerializerError::UnserializableType {
type_name: "Option",
})
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
Err(SerializerError::UnserializableType {
type_name: "Option",
})
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@ -91,25 +97,29 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
Err(SerializerError::UnserializableType {
type_name: "unit struct",
})
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "unit variant",
})
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
value: &T,
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
value.serialize(self)
}
@ -119,15 +129,20 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
_value: &T,
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
where
T: Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
Err(SerializerError::UnserializableType {
type_name: "newtype variant",
})
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
Err(SerializerError::UnserializableType {
type_name: "sequence",
})
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
@ -137,10 +152,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "tuple struct",
})
}
fn serialize_tuple_variant(
@ -148,10 +164,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "tuple variant",
})
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@ -167,9 +184,8 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
let serializer = ExtractDocumentIdStructSerializer {
identifier: self.identifier,
document_id: None,
@ -183,10 +199,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "struct variant",
})
}
}
@ -201,7 +218,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
where
T: Serialize,
{
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
@ -209,7 +227,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
where
T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
@ -218,9 +237,11 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V
value: &V,
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
where
K: Serialize,
V: Serialize,
{
let key = key.serialize(ConvertToString)?;
@ -252,9 +273,10 @@ impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
value: &T,
) -> Result<(), Self::Error>
where T: Serialize,
where
T: Serialize,
{
if self.identifier == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;

View File

@ -2,9 +2,9 @@ use meilidb_schema::SchemaAttr;
use serde::ser;
use serde::Serialize;
use crate::DocumentId;
use super::{ConvertToString, SerializerError};
use crate::raw_indexer::RawIndexer;
use super::{SerializerError, ConvertToString};
use crate::DocumentId;
pub struct Indexer<'a> {
pub attribute: SchemaAttr,
@ -24,7 +24,9 @@ impl<'a> ser::Serializer for Indexer<'a> {
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "boolean" })
Err(SerializerError::UnindexableType {
type_name: "boolean",
})
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
@ -83,7 +85,9 @@ impl<'a> ser::Serializer for Indexer<'a> {
}
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text);
let number_of_words = self
.indexer
.index_text(self.document_id, self.attribute, text);
Ok(Some(number_of_words))
}
@ -92,14 +96,19 @@ impl<'a> ser::Serializer for Indexer<'a> {
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "Option" })
Err(SerializerError::UnindexableType {
type_name: "Option",
})
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text);
let number_of_words = self
.indexer
.index_text(self.document_id, self.attribute, &text);
Ok(Some(number_of_words))
}
@ -108,25 +117,29 @@ impl<'a> ser::Serializer for Indexer<'a> {
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "unit struct" })
Err(SerializerError::UnindexableType {
type_name: "unit struct",
})
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "unit variant" })
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "unit variant",
})
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
value: &T,
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
value.serialize(self)
}
@ -136,11 +149,14 @@ impl<'a> ser::Serializer for Indexer<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
_value: &T,
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
Err(SerializerError::UnindexableType { type_name: "newtype variant" })
Err(SerializerError::UnindexableType {
type_name: "newtype variant",
})
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
@ -168,10 +184,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple struct" })
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "tuple struct",
})
}
fn serialize_tuple_variant(
@ -179,10 +196,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple variant" })
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "tuple variant",
})
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@ -199,10 +217,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct" })
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "struct",
})
}
fn serialize_struct_variant(
@ -210,10 +229,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct variant" })
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "struct variant",
})
}
}
@ -229,7 +249,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
@ -238,7 +259,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
Ok(None)
}
}
@ -255,7 +277,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.texts.push(text);
@ -263,7 +286,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
@ -272,7 +296,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
Ok(None)
}
}
@ -293,7 +318,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
let key_text = key.to_owned();
let value_text = value.serialize(ConvertToString)?;
@ -304,7 +330,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
Ok(None)
}
}
@ -321,7 +348,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize
where
T: Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
@ -330,7 +358,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
Ok(None)
}
}

View File

@ -15,19 +15,19 @@ mod extract_document_id;
mod indexer;
mod serializer;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string};
pub use self::convert_to_string::ConvertToString;
pub use self::convert_to_number::ConvertToNumber;
pub use self::convert_to_string::ConvertToString;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;
use std::collections::BTreeMap;
use std::{fmt, error::Error};
use std::{error::Error, fmt};
use meilidb_schema::SchemaAttr;
use serde_json::Error as SerdeJsonError;
use serde::ser;
use serde_json::Error as SerdeJsonError;
use crate::{DocumentId, ParseNumberError};
@ -55,24 +55,24 @@ impl fmt::Display for SerializerError {
match self {
SerializerError::DocumentIdNotFound => {
f.write_str("serialized document does not have an id according to the schema")
},
}
SerializerError::InvalidDocumentIdType => {
f.write_str("document identifier can only be of type string or number")
},
}
SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e),
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumber(e) => {
write!(f, "error while trying to parse a number: {}", e)
},
}
SerializerError::UnserializableType { type_name } => {
write!(f, "{} is not a serializable type", type_name)
},
}
SerializerError::UnindexableType { type_name } => {
write!(f, "{} is not an indexable type", type_name)
},
}
SerializerError::UnrankableType { type_name } => {
write!(f, "{} types can not be used for ranking", type_name)
},
}
SerializerError::Custom(s) => f.write_str(s),
}
}

View File

@ -1,12 +1,12 @@
use std::collections::HashMap;
use meilidb_schema::{Schema, SchemaAttr};
use serde::ser;
use std::collections::HashMap;
use crate::{DocumentId, RankedMap};
use crate::raw_indexer::RawIndexer;
use crate::serde::RamDocumentStore;
use crate::{DocumentId, RankedMap};
use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};
use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
pub struct Serializer<'a> {
pub schema: &'a Schema,
@ -55,13 +55,18 @@ impl<'a> ser::Serializer for Serializer<'a> {
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
Err(SerializerError::UnserializableType {
type_name: "Option",
})
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
Err(SerializerError::UnserializableType {
type_name: "Option",
})
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@ -69,25 +74,29 @@ impl<'a> ser::Serializer for Serializer<'a> {
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
Err(SerializerError::UnserializableType {
type_name: "unit struct",
})
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "unit variant",
})
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
value: &T,
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
value.serialize(self)
}
@ -97,15 +106,20 @@ impl<'a> ser::Serializer for Serializer<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
_value: &T,
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
Err(SerializerError::UnserializableType {
type_name: "newtype variant",
})
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
Err(SerializerError::UnserializableType {
type_name: "sequence",
})
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
@ -115,10 +129,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "tuple struct",
})
}
fn serialize_tuple_variant(
@ -126,10 +141,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "tuple variant",
})
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@ -147,9 +163,8 @@ impl<'a> ser::Serializer for Serializer<'a> {
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Ok(StructSerializer {
schema: self.schema,
document_id: self.document_id,
@ -165,10 +180,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "struct variant",
})
}
}
@ -187,7 +203,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
@ -195,7 +212,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
@ -206,7 +224,9 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
key: &K,
value: &V,
) -> Result<(), Self::Error>
where K: ser::Serialize, V: ser::Serialize,
where
K: ser::Serialize,
V: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
@ -245,7 +265,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
where
T: ser::Serialize,
{
serialize_value(
self.schema,
@ -274,7 +295,8 @@ fn serialize_value<T: ?Sized>(
key: &str,
value: &T,
) -> Result<(), SerializerError>
where T: ser::Serialize,
where
T: ser::Serialize,
{
if let Some(attribute) = schema.attribute(key) {
let props = schema.props(attribute);
@ -283,7 +305,11 @@ where T: ser::Serialize,
document_store.set_document_field(document_id, attribute, serialized);
if props.is_indexed() {
let indexer = Indexer { attribute, indexer, document_id };
let indexer = Indexer {
attribute,
indexer,
document_id,
};
if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
}

View File

@ -1,8 +1,8 @@
use std::sync::Arc;
use zlmdb::types::{OwnedType, ByteSlice};
use zlmdb::Result as ZResult;
use crate::DocumentId;
use super::BEU64;
use crate::DocumentId;
use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct DocsWords {
@ -15,8 +15,7 @@ impl DocsWords {
writer: &mut zlmdb::RwTxn,
document_id: DocumentId,
words: &fst::Set,
) -> ZResult<()>
{
) -> ZResult<()> {
let document_id = BEU64::new(document_id.0);
let bytes = words.as_fst().as_bytes();
self.docs_words.put(writer, &document_id, bytes)
@ -26,8 +25,7 @@ impl DocsWords {
&self,
writer: &mut zlmdb::RwTxn,
document_id: DocumentId,
) -> ZResult<bool>
{
) -> ZResult<bool> {
let document_id = BEU64::new(document_id.0);
self.docs_words.delete(writer, &document_id)
}
@ -36,8 +34,7 @@ impl DocsWords {
&self,
reader: &zlmdb::RoTxn,
document_id: DocumentId,
) -> ZResult<Option<fst::Set>>
{
) -> ZResult<Option<fst::Set>> {
let document_id = BEU64::new(document_id.0);
match self.docs_words.get(reader, &document_id)? {
Some(bytes) => {
@ -45,7 +42,7 @@ impl DocsWords {
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
},
}
None => Ok(None),
}
}

View File

@ -1,9 +1,9 @@
use meilidb_schema::SchemaAttr;
use zlmdb::types::{OwnedType, ByteSlice};
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
use crate::DocumentId;
use super::DocumentAttrKey;
use crate::DocumentId;
#[derive(Copy, Clone)]
pub struct DocumentsFields {
@ -17,8 +17,7 @@ impl DocumentsFields {
document_id: DocumentId,
attribute: SchemaAttr,
value: &[u8],
) -> ZResult<()>
{
) -> ZResult<()> {
let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields.put(writer, &key, value)
}
@ -27,8 +26,7 @@ impl DocumentsFields {
&self,
writer: &mut zlmdb::RwTxn,
document_id: DocumentId,
) -> ZResult<usize>
{
) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields.delete_range(writer, start..=end)
@ -39,8 +37,7 @@ impl DocumentsFields {
reader: &'txn zlmdb::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<&'txn [u8]>>
{
) -> ZResult<Option<&'txn [u8]>> {
let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields.get(reader, &key)
}
@ -49,8 +46,7 @@ impl DocumentsFields {
&self,
reader: &'txn zlmdb::RoTxn,
document_id: DocumentId,
) -> ZResult<DocumentFieldsIter<'txn>>
{
) -> ZResult<DocumentFieldsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields.range(reader, start..=end)?;
@ -70,7 +66,7 @@ impl<'txn> Iterator for DocumentFieldsIter<'txn> {
Some(Ok((key, bytes))) => {
let attr = SchemaAttr(key.attr.get());
Some(Ok((attr, bytes)))
},
}
Some(Err(e)) => Some(Err(e.into())),
None => None,
}

View File

@ -1,8 +1,8 @@
use super::DocumentAttrKey;
use crate::DocumentId;
use meilidb_schema::SchemaAttr;
use zlmdb::types::OwnedType;
use zlmdb::Result as ZResult;
use crate::DocumentId;
use super::DocumentAttrKey;
#[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts {
@ -16,8 +16,7 @@ impl DocumentsFieldsCounts {
document_id: DocumentId,
attribute: SchemaAttr,
value: u64,
) -> ZResult<()>
{
) -> ZResult<()> {
let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields_counts.put(writer, &key, &value)
}
@ -26,11 +25,11 @@ impl DocumentsFieldsCounts {
&self,
writer: &mut zlmdb::RwTxn,
document_id: DocumentId,
) -> ZResult<usize>
{
) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields_counts.delete_range(writer, start..=end)
self.documents_fields_counts
.delete_range(writer, start..=end)
}
pub fn document_field_count(
@ -38,8 +37,7 @@ impl DocumentsFieldsCounts {
reader: &zlmdb::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<u64>>
{
) -> ZResult<Option<u64>> {
let key = DocumentAttrKey::new(document_id, attribute);
match self.documents_fields_counts.get(reader, &key)? {
Some(count) => Ok(Some(count)),
@ -51,8 +49,7 @@ impl DocumentsFieldsCounts {
&self,
reader: &'txn zlmdb::RoTxn,
document_id: DocumentId,
) -> ZResult<DocumentFieldsCountsIter<'txn>>
{
) -> ZResult<DocumentFieldsCountsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields_counts.range(reader, start..=end)?;
@ -62,17 +59,18 @@ impl DocumentsFieldsCounts {
pub fn documents_ids<'txn>(
&self,
reader: &'txn zlmdb::RoTxn,
) -> ZResult<DocumentsIdsIter<'txn>>
{
) -> ZResult<DocumentsIdsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(DocumentsIdsIter { last_seen_id: None, iter })
Ok(DocumentsIdsIter {
last_seen_id: None,
iter,
})
}
pub fn all_documents_fields_counts<'txn>(
&self,
reader: &'txn zlmdb::RoTxn,
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>>
{
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(AllDocumentsFieldsCountsIter { iter })
}
@ -90,7 +88,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
Some(Ok((key, count))) => {
let attr = SchemaAttr(key.attr.get());
Some(Ok((attr, count)))
},
}
Some(Err(e)) => Some(Err(e.into())),
None => None,
}
@ -112,9 +110,9 @@ impl Iterator for DocumentsIdsIter<'_> {
let document_id = DocumentId(key.docid.get());
if Some(document_id) != self.last_seen_id {
self.last_seen_id = Some(document_id);
return Some(Ok(document_id))
return Some(Ok(document_id));
}
},
}
Err(e) => return Some(Err(e.into())),
}
}
@ -135,7 +133,7 @@ impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> {
let docid = DocumentId(key.docid.get());
let attr = SchemaAttr(key.attr.get());
Some(Ok((docid, attr, count)))
},
}
Some(Err(e)) => Some(Err(e.into())),
None => None,
}

View File

@ -1,15 +1,15 @@
use std::sync::Arc;
use meilidb_schema::Schema;
use zlmdb::types::{Str, OwnedType, ByteSlice, Serde};
use zlmdb::Result as ZResult;
use crate::RankedMap;
use meilidb_schema::Schema;
use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType, Serde, Str};
use zlmdb::Result as ZResult;
const CUSTOMS_KEY: &str = "customs-key";
const CUSTOMS_KEY: &str = "customs-key";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema";
const SYNONYMS_KEY: &str = "synonyms";
const WORDS_KEY: &str = "words";
const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema";
const SYNONYMS_KEY: &str = "synonyms";
const WORDS_KEY: &str = "words";
#[derive(Copy, Clone)]
pub struct Main {
@ -29,13 +29,14 @@ impl Main {
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
},
}
None => Ok(None),
}
}
pub fn put_schema(&self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> {
self.main.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
self.main
.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
}
pub fn schema(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> {
@ -43,11 +44,13 @@ impl Main {
}
pub fn put_ranked_map(&self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
self.main.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
self.main
.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
}
pub fn ranked_map(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> {
self.main.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
self.main
.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
}
pub fn put_synonyms_fst(&self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
@ -62,28 +65,34 @@ impl Main {
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
},
}
None => Ok(None),
}
}
pub fn put_number_of_documents<F>(&self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
where F: Fn(u64) -> u64,
where
F: Fn(u64) -> u64,
{
let new = self.number_of_documents(writer).map(f)?;
self.main.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
self.main
.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
Ok(new)
}
pub fn number_of_documents(&self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
match self.main.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)? {
match self
.main
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
{
Some(value) => Ok(value),
None => Ok(0),
}
}
pub fn put_customs(&self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
self.main.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
self.main
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
}
pub fn customs<'txn>(&self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {

View File

@ -8,8 +8,10 @@ mod updates;
mod updates_results;
pub use self::docs_words::DocsWords;
pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter};
pub use self::documents_fields_counts::{DocumentsFieldsCounts, DocumentFieldsCountsIter, DocumentsIdsIter};
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
pub use self::documents_fields_counts::{
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
};
pub use self::main::Main;
pub use self::postings_lists::PostingsLists;
pub use self::synonyms::Synonyms;
@ -25,19 +27,24 @@ use zlmdb::Result as ZResult;
use crate::criterion::Criteria;
use crate::serde::Deserializer;
use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
#[derive(Debug, Copy, Clone)]
#[derive(AsBytes, FromBytes)]
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentAttrKey { docid: BEU64, attr: BEU16 }
pub struct DocumentAttrKey {
docid: BEU64,
attr: BEU16,
}
impl DocumentAttrKey {
fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey {
DocumentAttrKey { docid: BEU64::new(docid.0), attr: BEU16::new(attr.0) }
DocumentAttrKey {
docid: BEU64::new(docid.0),
attr: BEU16::new(attr.0),
}
}
}
@ -93,13 +100,15 @@ impl Index {
reader: &zlmdb::RoTxn,
attributes: Option<&HashSet<&str>>,
document_id: DocumentId,
) -> MResult<Option<T>>
{
) -> MResult<Option<T>> {
let schema = self.main.schema(reader)?;
let schema = schema.ok_or(Error::SchemaMissing)?;
let attributes = match attributes {
Some(attributes) => attributes.into_iter().map(|name| schema.attribute(name)).collect(),
Some(attributes) => attributes
.into_iter()
.map(|name| schema.attribute(name))
.collect(),
None => None,
};
@ -121,9 +130,10 @@ impl Index {
reader: &zlmdb::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> MResult<Option<T>>
{
let bytes = self.documents_fields.document_attribute(reader, document_id, attribute)?;
) -> MResult<Option<T>> {
let bytes = self
.documents_fields
.document_attribute(reader, document_id, attribute)?;
match bytes {
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
None => Ok(None),
@ -183,14 +193,8 @@ impl Index {
&self,
reader: &zlmdb::RoTxn,
update_id: u64,
) -> MResult<update::UpdateStatus>
{
update::update_status(
reader,
self.updates,
self.updates_results,
update_id,
)
) -> MResult<update::UpdateStatus> {
update::update_status(reader, self.updates, self.updates_results, update_id)
}
pub fn query_builder(&self) -> QueryBuilder {
@ -205,8 +209,7 @@ impl Index {
pub fn query_builder_with_criteria<'c, 'f, 'd>(
&self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, 'f, 'd>
{
) -> QueryBuilder<'c, 'f, 'd> {
QueryBuilder::with_criteria(
self.main,
self.postings_lists,
@ -221,8 +224,7 @@ pub fn create(
env: &zlmdb::Env,
name: &str,
updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Index>
{
) -> MResult<Index> {
// create all the store names
let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name);
@ -247,7 +249,9 @@ pub fn create(
main: Main { main },
postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
documents_fields_counts: DocumentsFieldsCounts {
documents_fields_counts,
},
synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words },
updates: Updates { updates },
@ -260,8 +264,7 @@ pub fn open(
env: &zlmdb::Env,
name: &str,
updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Option<Index>>
{
) -> MResult<Option<Index>> {
// create all the store names
let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name);
@ -310,7 +313,9 @@ pub fn open(
main: Main { main },
postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
documents_fields_counts: DocumentsFieldsCounts {
documents_fields_counts,
},
synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words },
updates: Updates { updates },

View File

@ -1,8 +1,8 @@
use std::borrow::Cow;
use crate::DocIndex;
use sdset::{Set, SetBuf};
use std::borrow::Cow;
use zlmdb::types::{ByteSlice, CowSlice};
use zlmdb::Result as ZResult;
use crate::DocIndex;
#[derive(Copy, Clone)]
pub struct PostingsLists {
@ -15,8 +15,7 @@ impl PostingsLists {
writer: &mut zlmdb::RwTxn,
word: &[u8],
words_indexes: &Set<DocIndex>,
) -> ZResult<()>
{
) -> ZResult<()> {
self.postings_lists.put(writer, word, words_indexes)
}
@ -28,8 +27,7 @@ impl PostingsLists {
&self,
reader: &'txn zlmdb::RoTxn,
word: &[u8],
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>>
{
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
match self.postings_lists.get(reader, word)? {
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),

View File

@ -13,8 +13,7 @@ impl Synonyms {
writer: &mut zlmdb::RwTxn,
word: &[u8],
synonyms: &fst::Set,
) -> ZResult<()>
{
) -> ZResult<()> {
let bytes = synonyms.as_fst().as_bytes();
self.synonyms.put(writer, word, bytes)
}
@ -30,7 +29,7 @@ impl Synonyms {
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
},
}
None => Ok(None),
}
}

View File

@ -1,13 +1,16 @@
use super::BEU64;
use crate::update::Update;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use zlmdb::types::OwnedType;
use zlmdb::{Result as ZResult, BytesEncode, BytesDecode};
use serde::{Serialize, Deserialize};
use crate::update::Update;
use super::BEU64;
use zlmdb::{BytesDecode, BytesEncode, Result as ZResult};
pub struct SerdeJson<T>(std::marker::PhantomData<T>);
impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
impl<T> BytesEncode for SerdeJson<T>
where
T: Serialize,
{
type EItem = T;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
@ -15,7 +18,10 @@ impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
}
}
impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T> where T: Deserialize<'a> + Clone {
impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T>
where
T: Deserialize<'a> + Clone,
{
type DItem = T;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
@ -56,8 +62,7 @@ impl Updates {
writer: &mut zlmdb::RwTxn,
update_id: u64,
update: &Update,
) -> ZResult<()>
{
) -> ZResult<()> {
// TODO prefer using serde_json?
let update_id = BEU64::new(update_id);
self.updates.put(writer, &update_id, update)
@ -69,8 +74,8 @@ impl Updates {
let key = BEU64::new(update_id);
self.updates.delete(writer, &key)?;
Ok(Some((update_id, update)))
},
None => Ok(None)
}
None => Ok(None),
}
}
}

View File

@ -1,7 +1,7 @@
use super::BEU64;
use crate::update::UpdateResult;
use zlmdb::types::{OwnedType, Serde};
use zlmdb::Result as ZResult;
use crate::update::UpdateResult;
use super::BEU64;
#[derive(Copy, Clone)]
pub struct UpdatesResults {
@ -21,8 +21,7 @@ impl UpdatesResults {
writer: &mut zlmdb::RwTxn,
update_id: u64,
update_result: &UpdateResult,
) -> ZResult<()>
{
) -> ZResult<()> {
let update_id = BEU64::new(update_id);
self.updates_results.put(writer, &update_id, update_result)
}
@ -31,8 +30,7 @@ impl UpdatesResults {
&self,
reader: &zlmdb::RoTxn,
update_id: u64,
) -> ZResult<Option<UpdateResult>>
{
) -> ZResult<Option<UpdateResult>> {
let update_id = BEU64::new(update_id);
self.updates_results.get(reader, &update_id)
}

View File

@ -1,13 +1,12 @@
use zlmdb::Result as ZResult;
use crate::update::{Update, next_update_id};
use crate::store;
use crate::update::{next_update_id, Update};
use zlmdb::Result as ZResult;
pub fn apply_customs_update(
writer: &mut zlmdb::RwTxn,
main_store: store::Main,
customs: &[u8],
) -> ZResult<()>
{
) -> ZResult<()> {
main_store.put_customs(writer, customs)
}
@ -16,8 +15,7 @@ pub fn push_customs_update(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
customs: Vec<u8>,
) -> ZResult<u64>
{
) -> ZResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::Customs(customs);

View File

@ -1,14 +1,14 @@
use std::collections::{HashMap, HashSet};
use fst::{SetBuilder, set::OpBuilder};
use sdset::{SetOperation, duo::Union};
use fst::{set::OpBuilder, SetBuilder};
use sdset::{duo::Union, SetOperation};
use serde::Serialize;
use crate::raw_indexer::RawIndexer;
use crate::serde::{extract_document_id, Serializer, RamDocumentStore};
use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
use crate::store;
use crate::update::{Update, next_update_id, apply_documents_deletion};
use crate::{MResult, Error, RankedMap};
use crate::update::{apply_documents_deletion, next_update_id, Update};
use crate::{Error, MResult, RankedMap};
pub struct DocumentsAddition<D> {
updates_store: store::Updates,
@ -22,8 +22,7 @@ impl<D> DocumentsAddition<D> {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> DocumentsAddition<D>
{
) -> DocumentsAddition<D> {
DocumentsAddition {
updates_store,
updates_results_store,
@ -37,7 +36,8 @@ impl<D> DocumentsAddition<D> {
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64>
where D: serde::Serialize
where
D: serde::Serialize,
{
let _ = self.updates_notifier.send(());
let update_id = push_documents_addition(
@ -51,7 +51,7 @@ impl<D> DocumentsAddition<D> {
}
impl<D> Extend<D> for DocumentsAddition<D> {
fn extend<T: IntoIterator<Item=D>>(&mut self, iter: T) {
fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
self.documents.extend(iter)
}
}
@ -61,8 +61,7 @@ pub fn push_documents_addition<D: serde::Serialize>(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: Vec<D>,
) -> MResult<u64>
{
) -> MResult<u64> {
let mut values = Vec::with_capacity(addition.len());
for add in addition {
let vec = serde_json::to_vec(&add)?;
@ -87,8 +86,7 @@ pub fn apply_documents_addition(
docs_words_store: store::DocsWords,
mut ranked_map: RankedMap,
addition: Vec<serde_json::Value>,
) -> MResult<()>
{
) -> MResult<()> {
let mut document_ids = HashSet::new();
let mut document_store = RamDocumentStore::new();
let mut document_fields_counts = HashMap::new();
@ -182,7 +180,7 @@ pub fn apply_documents_addition(
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
}
None => delta_words,
};

View File

@ -1,13 +1,13 @@
use std::collections::{HashMap, HashSet, BTreeSet};
use std::collections::{BTreeSet, HashMap, HashSet};
use fst::{SetBuilder, Streamer};
use meilidb_schema::Schema;
use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
use crate::{DocumentId, RankedMap, MResult, Error};
use crate::serde::extract_document_id;
use crate::update::{Update, next_update_id};
use crate::store;
use crate::update::{next_update_id, Update};
use crate::{DocumentId, Error, MResult, RankedMap};
pub struct DocumentsDeletion {
updates_store: store::Updates,
@ -21,8 +21,7 @@ impl DocumentsDeletion {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> DocumentsDeletion
{
) -> DocumentsDeletion {
DocumentsDeletion {
updates_store,
updates_results_store,
@ -36,7 +35,8 @@ impl DocumentsDeletion {
}
pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()>
where D: serde::Serialize,
where
D: serde::Serialize,
{
let identifier = schema.identifier_name();
let document_id = match extract_document_id(identifier, &document)? {
@ -62,7 +62,7 @@ impl DocumentsDeletion {
}
impl Extend<DocumentId> for DocumentsDeletion {
fn extend<T: IntoIterator<Item=DocumentId>>(&mut self, iter: T) {
fn extend<T: IntoIterator<Item = DocumentId>>(&mut self, iter: T) {
self.documents.extend(iter)
}
}
@ -72,8 +72,7 @@ pub fn push_documents_deletion(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: Vec<DocumentId>,
) -> MResult<u64>
{
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::DocumentsDeletion(deletion);
@ -91,8 +90,7 @@ pub fn apply_documents_deletion(
docs_words_store: store::DocsWords,
mut ranked_map: RankedMap,
deletion: Vec<DocumentId>,
) -> MResult<()>
{
) -> MResult<()> {
let idset = SetBuf::from_dirty(deletion);
let schema = match main_store.schema(writer)? {
@ -101,10 +99,17 @@ pub fn apply_documents_deletion(
};
// collect the ranked attributes according to the schema
let ranked_attrs: Vec<_> = schema.iter()
.filter_map(|(_, attr, prop)| {
if prop.is_ranked() { Some(attr) } else { None }
})
let ranked_attrs: Vec<_> = schema
.iter()
.filter_map(
|(_, attr, prop)| {
if prop.is_ranked() {
Some(attr)
} else {
None
}
},
)
.collect();
let mut words_document_ids = HashMap::new();
@ -118,7 +123,10 @@ pub fn apply_documents_deletion(
let mut stream = words.stream();
while let Some(word) = stream.next() {
let word = word.to_vec();
words_document_ids.entry(word).or_insert_with(Vec::new).push(id);
words_document_ids
.entry(word)
.or_insert_with(Vec::new)
.push(id);
}
}
}
@ -167,7 +175,7 @@ pub fn apply_documents_deletion(
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
}
None => fst::Set::default(),
};

View File

@ -6,21 +6,21 @@ mod synonyms_addition;
mod synonyms_deletion;
pub use self::customs_update::{apply_customs_update, push_customs_update};
pub use self::documents_addition::{DocumentsAddition, apply_documents_addition};
pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion};
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
pub use self::schema_update::{apply_schema_update, push_schema_update};
pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition};
pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion};
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
use std::time::{Duration, Instant};
use std::collections::BTreeMap;
use std::cmp;
use std::collections::BTreeMap;
use std::time::{Duration, Instant};
use log::debug;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use zlmdb::Result as ZResult;
use crate::{store, MResult, DocumentId, RankedMap};
use crate::{store, DocumentId, MResult, RankedMap};
use meilidb_schema::Schema;
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -68,8 +68,7 @@ pub fn update_status(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
update_id: u64,
) -> MResult<UpdateStatus>
{
) -> MResult<UpdateStatus> {
match updates_results_store.update_result(reader, update_id)? {
Some(result) => Ok(UpdateStatus::Processed(result)),
None => {
@ -86,8 +85,7 @@ pub fn next_update_id(
writer: &mut zlmdb::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> ZResult<u64>
{
) -> ZResult<u64> {
let last_update_id = updates_store.last_update_id(writer)?;
let last_update_id = last_update_id.map(|(n, _)| n);
@ -100,7 +98,10 @@ pub fn next_update_id(
Ok(new_update_id)
}
pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
pub fn update_task(
writer: &mut zlmdb::RwTxn,
index: store::Index,
) -> MResult<Option<UpdateResult>> {
let (update_id, update) = match index.updates.pop_front(writer)? {
Some(value) => value,
None => return Ok(None),
@ -112,11 +113,13 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
Update::Schema(schema) => {
let start = Instant::now();
let update_type = UpdateType::Schema { schema: schema.clone() };
let update_type = UpdateType::Schema {
schema: schema.clone(),
};
let result = apply_schema_update(writer, index.main, &schema);
(update_type, result, start.elapsed())
},
}
Update::Customs(customs) => {
let start = Instant::now();
@ -133,7 +136,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsAddition { number: documents.len() };
let update_type = UpdateType::DocumentsAddition {
number: documents.len(),
};
let result = apply_documents_addition(
writer,
@ -147,7 +152,7 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
);
(update_type, result, start.elapsed())
},
}
Update::DocumentsDeletion(documents) => {
let start = Instant::now();
@ -156,7 +161,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsDeletion { number: documents.len() };
let update_type = UpdateType::DocumentsDeletion {
number: documents.len(),
};
let result = apply_documents_deletion(
writer,
@ -170,38 +177,35 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
);
(update_type, result, start.elapsed())
},
}
Update::SynonymsAddition(synonyms) => {
let start = Instant::now();
let update_type = UpdateType::SynonymsAddition { number: synonyms.len() };
let update_type = UpdateType::SynonymsAddition {
number: synonyms.len(),
};
let result = apply_synonyms_addition(
writer,
index.main,
index.synonyms,
synonyms,
);
let result = apply_synonyms_addition(writer, index.main, index.synonyms, synonyms);
(update_type, result, start.elapsed())
},
}
Update::SynonymsDeletion(synonyms) => {
let start = Instant::now();
let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() };
let update_type = UpdateType::SynonymsDeletion {
number: synonyms.len(),
};
let result = apply_synonyms_deletion(
writer,
index.main,
index.synonyms,
synonyms,
);
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
(update_type, result, start.elapsed())
},
}
};
debug!("Processed update number {} {:?} {:?}", update_id, update_type, result);
debug!(
"Processed update number {} {:?} {:?}",
update_id, update_type, result
);
let detailed_duration = DetailedDuration { main: duration };
let status = UpdateResult {
@ -211,7 +215,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
detailed_duration,
};
index.updates_results.put_update_result(writer, update_id, &status)?;
index
.updates_results
.put_update_result(writer, update_id, &status)?;
Ok(Some(status))
}

View File

@ -1,18 +1,19 @@
use crate::update::{next_update_id, Update};
use crate::{error::UnsupportedOperation, store, MResult};
use meilidb_schema::Schema;
use crate::{store, error::UnsupportedOperation, MResult};
use crate::update::{Update, next_update_id};
/// Writes `new_schema` into `main_store`, but only when no schema is stored yet.
///
/// # Errors
///
/// * `UnsupportedOperation::SchemaAlreadyExists` (converted with `into`) when
///   `main_store.schema(writer)` already returns a schema; the stored schema is
///   left untouched in that case.
/// * Any error raised while reading the current schema (`?`) or while calling
///   `put_schema` (converted with `map_err(Into::into)`).
pub fn apply_schema_update(
writer: &mut zlmdb::RwTxn,
main_store: store::Main,
new_schema: &Schema,
) -> MResult<()>
{
) -> MResult<()> {
// Only the presence of a schema matters here, its content is never inspected.
if let Some(_) = main_store.schema(writer)? {
return Err(UnsupportedOperation::SchemaAlreadyExists.into())
return Err(UnsupportedOperation::SchemaAlreadyExists.into());
}
main_store.put_schema(writer, new_schema).map_err(Into::into)
main_store
.put_schema(writer, new_schema)
.map_err(Into::into)
}
pub fn push_schema_update(
@ -20,8 +21,7 @@ pub fn push_schema_update(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
schema: Schema,
) -> MResult<u64>
{
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::Schema(schema);

View File

@ -1,10 +1,10 @@
use std::collections::BTreeMap;
use fst::{SetBuilder, set::OpBuilder};
use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf;
use crate::automaton::normalize_str;
use crate::update::{Update, next_update_id};
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct SynonymsAddition {
@ -19,8 +19,7 @@ impl SynonymsAddition {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> SynonymsAddition
{
) -> SynonymsAddition {
SynonymsAddition {
updates_store,
updates_results_store,
@ -30,13 +29,17 @@ impl SynonymsAddition {
}
/// Records `alternatives` as synonyms of `synonym` in this pending addition.
///
/// The key is passed through `normalize_str`; every alternative is lowercased
/// with `to_lowercase`. Alternatives are appended to whatever list is already
/// recorded under the same key (an empty list is created on first use), so
/// calling this twice with the same `synonym` accumulates entries instead of
/// replacing them.
// NOTE(review): the key goes through `normalize_str` while the alternatives are
// only lowercased — confirm that this asymmetry is intended.
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>,
T: AsRef<str>,
I: IntoIterator<Item=T>,
where
S: AsRef<str>,
T: AsRef<str>,
I: IntoIterator<Item = T>,
{
let synonym = normalize_str(synonym.as_ref());
// Lazy adaptor: the lowercasing happens while `extend` consumes the iterator.
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
self.synonyms
.entry(synonym)
.or_insert_with(Vec::new)
.extend(alternatives);
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
@ -56,8 +59,7 @@ pub fn push_synonyms_addition(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: BTreeMap<String, Vec<String>>,
) -> MResult<u64>
{
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::SynonymsAddition(addition);
@ -71,8 +73,7 @@ pub fn apply_synonyms_addition(
main_store: store::Main,
synonyms_store: store::Synonyms,
addition: BTreeMap<String, Vec<String>>,
) -> MResult<()>
{
) -> MResult<()> {
let mut synonyms_builder = SetBuilder::memory();
for (word, alternatives) in addition {
@ -107,7 +108,7 @@ pub fn apply_synonyms_addition(
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
}
None => delta_synonyms,
};

View File

@ -1,11 +1,11 @@
use std::collections::BTreeMap;
use std::iter::FromIterator;
use fst::{SetBuilder, set::OpBuilder};
use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf;
use crate::automaton::normalize_str;
use crate::update::{Update, next_update_id};
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct SynonymsDeletion {
@ -20,8 +20,7 @@ impl SynonymsDeletion {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> SynonymsDeletion
{
) -> SynonymsDeletion {
SynonymsDeletion {
updates_store,
updates_results_store,
@ -36,9 +35,10 @@ impl SynonymsDeletion {
}
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>,
T: AsRef<str>,
I: Iterator<Item=T>,
where
S: AsRef<str>,
T: AsRef<str>,
I: Iterator<Item = T>,
{
let synonym = normalize_str(synonym.as_ref());
let value = self.synonyms.entry(synonym).or_insert(None);
@ -66,8 +66,7 @@ pub fn push_synonyms_deletion(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> MResult<u64>
{
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::SynonymsDeletion(deletion);
@ -81,8 +80,7 @@ pub fn apply_synonyms_deletion(
main_store: store::Main,
synonyms_store: store::Synonyms,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> MResult<()>
{
) -> MResult<()> {
let mut delete_whole_synonym_builder = SetBuilder::memory();
for (synonym, alternatives) in deletion {
@ -98,9 +96,7 @@ pub fn apply_synonyms_deletion(
let alternatives = SetBuf::from_dirty(alternatives);
let mut builder = SetBuilder::memory();
builder.extend_iter(alternatives).unwrap();
builder.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
builder.into_inner().and_then(fst::Set::from_bytes).unwrap()
};
let op = OpBuilder::new()
@ -124,7 +120,7 @@ pub fn apply_synonyms_deletion(
} else {
synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?;
}
},
}
None => {
delete_whole_synonym_builder.insert(&synonym).unwrap();
synonyms_store.del_synonyms(writer, synonym.as_bytes())?;
@ -150,7 +146,7 @@ pub fn apply_synonyms_deletion(
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
}
None => fst::Set::default(),
};

View File

@ -1,14 +1,26 @@
use std::collections::{HashMap, BTreeMap};
use std::{fmt, u16};
use std::collections::{BTreeMap, HashMap};
use std::ops::BitOr;
use std::sync::Arc;
use std::{fmt, u16};
use serde::{Serialize, Deserialize};
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
// Single-flag property sets: each constant below turns on exactly one of the
// `displayed` / `indexed` / `ranked` fields of `SchemaProps` and leaves the
// other two off.
// NOTE(review): presumably meant to be combined with `|` (this file imports
// `std::ops::BitOr`) — confirm against the `SchemaProps` impls.
pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
pub const DISPLAYED: SchemaProps = SchemaProps {
displayed: true,
indexed: false,
ranked: false,
};
pub const INDEXED: SchemaProps = SchemaProps {
displayed: false,
indexed: true,
ranked: false,
};
pub const RANKED: SchemaProps = SchemaProps {
displayed: false,
indexed: false,
ranked: true,
};
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
@ -80,7 +92,13 @@ impl SchemaBuilder {
}
let identifier = self.identifier;
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
Schema {
inner: Arc::new(InnerSchema {
identifier,
attrs,
props,
}),
}
}
}
@ -100,7 +118,10 @@ impl Schema {
/// Produces a `SchemaBuilder` describing this schema.
///
/// The identifier is cloned out of the shared inner schema and the attributes
/// come from `attributes_ordered()`. The `Serialize` impl of `Schema` goes
/// through this method and serializes the resulting builder.
fn to_builder(&self) -> SchemaBuilder {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
SchemaBuilder { identifier, attributes }
SchemaBuilder {
identifier,
attributes,
}
}
fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
@ -136,18 +157,18 @@ impl Schema {
name
}
/// Iterates over the attributes of the schema as `(name, attr, props)` tuples.
///
/// The iteration is driven by `inner.props`; for each name the matching
/// `SchemaAttr` is looked up in `inner.attrs`. Both the attr and the props are
/// copied out, only the name borrows from `self`.
///
/// # Panics
///
/// Panics (through `unwrap`) if a name found in `props` has no entry in `attrs`.
// NOTE(review): presumably both maps are always filled together when the schema
// is built, which would make the `unwrap` unreachable — confirm in
// `SchemaBuilder::build`.
pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
self.inner.props.iter()
.map(move |(name, prop)| {
let attr = self.inner.attrs.get(name).unwrap();
(name.as_str(), *attr, *prop)
})
pub fn iter<'a>(&'a self) -> impl Iterator<Item = (&str, SchemaAttr, SchemaProps)> + 'a {
self.inner.props.iter().map(move |(name, prop)| {
let attr = self.inner.attrs.get(name).unwrap();
(name.as_str(), *attr, *prop)
})
}
}
impl Serialize for Schema {
/// Serializes the schema by converting it to a `SchemaBuilder` first and
/// delegating to the builder's own `Serialize` implementation, so the output
/// has the builder's shape rather than the shape of the inner schema.
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: serde::ser::Serializer,
where
S: serde::ser::Serializer,
{
self.to_builder().serialize(serializer)
}
@ -155,15 +176,15 @@ impl Serialize for Schema {
/// Deserialization counterpart of the `Serialize` impl: the input is read as a
/// `SchemaBuilder` and then turned into a `Schema` with `build()`.
impl<'de> Deserialize<'de> for Schema {
/// Reads a `SchemaBuilder` from `deserializer` and builds the schema from it.
///
/// Any error reported while deserializing the builder is returned unchanged;
/// `build()` itself is treated as infallible here.
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: serde::de::Deserializer<'de>,
where
D: serde::de::Deserializer<'de>,
{
let builder = SchemaBuilder::deserialize(deserializer)?;
Ok(builder.build())
}
}
/// Newtype over a `u16` that a `Schema` associates with each attribute name
/// (it is the value yielded next to the name by `Schema::iter`).
///
/// It is `Copy`, totally ordered and hashable, and it (de)serializes through
/// serde.
// NOTE(review): presumably the position of the attribute inside the schema —
// confirm against `SchemaBuilder::build`.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub u16);
impl SchemaAttr {

View File

@ -1,17 +1,17 @@
use std::iter::Peekable;
use slice_group_by::StrGroupBy;
use self::SeparatorCategory::*;
use slice_group_by::StrGroupBy;
use std::iter::Peekable;
/// Returns `true` when `c` lies in one of the hard-coded CJK code point ranges.
///
/// Ranges tested (both bounds inclusive), with the Unicode block each one
/// corresponds to:
///
/// * U+2E80..=U+2EFF — CJK Radicals Supplement
/// * U+2F00..=U+2FDF — Kangxi Radicals
/// * U+3040..=U+309F — Hiragana
/// * U+30A0..=U+30FF — Katakana
/// * U+3100..=U+312F — Bopomofo
/// * U+3200..=U+32FF — Enclosed CJK Letters and Months
/// * U+3400..=U+4DBF — CJK Unified Ideographs Extension A
/// * U+4E00..=U+9FFF — CJK Unified Ideographs
/// * U+F900..=U+FAFF — CJK Compatibility Ideographs
///
/// Every other character yields `false`, including those in the gaps between
/// the ranges above (for instance U+3000..=U+303F) and everything beyond U+FAFF.
pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
(c >= '\u{2e80}' && c <= '\u{2eff}')
|| (c >= '\u{2f00}' && c <= '\u{2fdf}')
|| (c >= '\u{3040}' && c <= '\u{309f}')
|| (c >= '\u{30a0}' && c <= '\u{30ff}')
|| (c >= '\u{3100}' && c <= '\u{312f}')
|| (c >= '\u{3200}' && c <= '\u{32ff}')
|| (c >= '\u{3400}' && c <= '\u{4dbf}')
|| (c >= '\u{4e00}' && c <= '\u{9fff}')
|| (c >= '\u{f900}' && c <= '\u{faff}')
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
@ -22,7 +22,11 @@ enum SeparatorCategory {
impl SeparatorCategory {
/// Combines two separator categories into one.
///
/// The result is `Soft` only when both `self` and `other` are `Soft`; as soon
/// as either side is anything else the result is `Hard`.
fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
if let (Soft, Soft) = (self, other) { Soft } else { Hard }
if let (Soft, Soft) = (self, other) {
Soft
} else {
Hard
}
}
fn to_usize(self) -> usize {
@ -40,7 +44,7 @@ fn is_separator(c: char) -> bool {
/// Classifies `c` as a separator, or returns `None` when it is not one.
///
/// * `Some(Soft)` for space, `-`, `_`, `'`, `:` and `"`.
/// * `Some(Hard)` for `.`, `;`, `,`, `!`, `?`, `(` and `)`.
/// * `None` for every other character.
fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c {
' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
_ => None,
}
}
@ -79,7 +83,7 @@ fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, u
(n + 1, i + c.len_utf8())
}
/// Splits `query` into words by running a `Tokenizer` over it and keeping only
/// the `word` field of each produced token (the other token fields are dropped).
///
/// The returned iterator is lazy and the yielded `&str` items come from the
/// tokens built over `query`.
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
Tokenizer::new(query).map(|t| t.word)
}
@ -100,9 +104,10 @@ impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
// skip every separator and set `char_index`
// to the number of char trimmed
let (count, index) = string.char_indices()
.take_while(|(_, c)| is_separator(*c))
.fold((0, 0), chars_count_index);
let (count, index) = string
.char_indices()
.take_while(|(_, c)| is_separator(*c))
.fold((0, 0), chars_count_index);
Tokenizer {
inner: &string[index..],
@ -122,10 +127,11 @@ impl<'a> Iterator for Tokenizer<'a> {
let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
if !is_str_word(string) {
self.word_index += string.chars()
.filter_map(classify_separator)
.fold(Soft, |a, x| a.merge(x))
.to_usize();
self.word_index += string
.chars()
.filter_map(classify_separator)
.fold(Soft, |a, x| a.merge(x))
.to_usize();
self.char_index += count;
self.inner = &self.inner[index..];
continue;
@ -153,7 +159,8 @@ impl<'a> Iterator for Tokenizer<'a> {
}
pub struct SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
where
I: Iterator<Item = &'a str>,
{
inner: I,
current: Option<Peekable<Tokenizer<'a>>>,
@ -162,7 +169,8 @@ where I: Iterator<Item=&'a str>,
}
impl<'a, I> SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
where
I: Iterator<Item = &'a str>,
{
pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
let current = iter.next().map(|s| Tokenizer::new(s).peekable());
@ -176,7 +184,8 @@ where I: Iterator<Item=&'a str>,
}
impl<'a, I> Iterator for SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
where
I: Iterator<Item = &'a str>,
{
type Item = Token<'a>;
@ -202,15 +211,15 @@ where I: Iterator<Item=&'a str>,
}
Some(token)
},
}
None => {
// no more words in this text we must
// start tokenizing the next text
self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
self.next()
},
}
}
},
}
// no more texts available
None => None,
}
@ -225,12 +234,26 @@ mod tests {
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(
tokenizer.next(),
Some(Token {
word: "salut",
word_index: 0,
char_index: 0
})
);
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(
tokenizer.next(),
Some(Token {
word: "yo",
word_index: 0,
char_index: 0
})
);
assert_eq!(tokenizer.next(), None);
}
@ -238,19 +261,82 @@ mod tests {
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
assert_eq!(
tokenizer.next(),
Some(Token {
word: "yo",
word_index: 0,
char_index: 4
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolo",
word_index: 1,
char_index: 7
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "aïe",
word_index: 9,
char_index: 13
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "ouch",
word_index: 17,
char_index: 18
})
);
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 }));
assert_eq!(
tokenizer.next(),
Some(Token {
word: "yo",
word_index: 0,
char_index: 0
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolo",
word_index: 8,
char_index: 5
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "wtf",
word_index: 16,
char_index: 12
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lol",
word_index: 17,
char_index: 18
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "aïe",
word_index: 25,
char_index: 24
})
);
assert_eq!(tokenizer.next(), None);
}
@ -258,18 +344,74 @@ mod tests {
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(
tokenizer.next(),
Some(Token {
word: "yo",
word_index: 0,
char_index: 4
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "😂",
word_index: 1,
char_index: 7
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "aïe",
word_index: 9,
char_index: 10
})
);
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 }));
assert_eq!(
tokenizer.next(),
Some(Token {
word: "yo",
word_index: 0,
char_index: 0
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolo",
word_index: 8,
char_index: 5
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "😱",
word_index: 16,
char_index: 12
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lol",
word_index: 17,
char_index: 16
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "😣",
word_index: 25,
char_index: 22
})
);
assert_eq!(tokenizer.next(), None);
}
@ -277,19 +419,82 @@ mod tests {
fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ec4}",
word_index: 0,
char_index: 0
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolilol",
word_index: 1,
char_index: 1
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ec7}",
word_index: 2,
char_index: 8
})
);
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 }));
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ec4}",
word_index: 0,
char_index: 0
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ed3}",
word_index: 1,
char_index: 1
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ef2}",
word_index: 2,
char_index: 2
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolilol",
word_index: 3,
char_index: 4
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "hello",
word_index: 4,
char_index: 14
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ec7}",
word_index: 5,
char_index: 23
})
);
assert_eq!(tokenizer.next(), None);
}
}