mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-27 05:37:31 +01:00
Cargo fmt pass
This commit is contained in:
parent
47d777c8f7
commit
ca26a0f2e4
@ -4,15 +4,15 @@ use std::error::Error;
|
||||
use std::io::Write;
|
||||
use std::iter::FromIterator;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::{Instant, Duration};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{fs, io, sync::mpsc};
|
||||
|
||||
use rustyline::{Editor, Config};
|
||||
use serde::{Serialize, Deserialize};
|
||||
use rustyline::{Config, Editor};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use structopt::StructOpt;
|
||||
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
|
||||
|
||||
use meilidb_core::{Highlight, Database, UpdateResult};
|
||||
use meilidb_core::{Database, Highlight, UpdateResult};
|
||||
use meilidb_schema::SchemaAttr;
|
||||
|
||||
const INDEX_NAME: &str = "default";
|
||||
@ -91,7 +91,7 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
|
||||
let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap();
|
||||
let index = match database.open_index(INDEX_NAME) {
|
||||
Some(index) => index,
|
||||
None => database.create_index(INDEX_NAME).unwrap()
|
||||
None => database.create_index(INDEX_NAME).unwrap(),
|
||||
};
|
||||
|
||||
let done = database.set_update_callback(INDEX_NAME, Box::new(update_fn));
|
||||
@ -108,14 +108,14 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
|
||||
match index.main.schema(&writer)? {
|
||||
Some(current_schema) => {
|
||||
if current_schema != schema {
|
||||
return Err(meilidb_core::Error::SchemaDiffer.into())
|
||||
return Err(meilidb_core::Error::SchemaDiffer.into());
|
||||
}
|
||||
writer.abort();
|
||||
},
|
||||
}
|
||||
None => {
|
||||
index.schema_update(&mut writer, schema)?;
|
||||
writer.commit().unwrap();
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
|
||||
@ -131,7 +131,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
|
||||
|
||||
loop {
|
||||
end_of_file = !rdr.read_record(&mut raw_record)?;
|
||||
if end_of_file { break }
|
||||
if end_of_file {
|
||||
break;
|
||||
}
|
||||
|
||||
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
||||
Ok(document) => document,
|
||||
@ -147,7 +149,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
|
||||
i += 1;
|
||||
|
||||
if let Some(group_size) = command.update_group_size {
|
||||
if i % group_size == 0 { break }
|
||||
if i % group_size == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -163,15 +167,25 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
|
||||
|
||||
println!("Waiting for update {}", max_update_id);
|
||||
for id in receiver {
|
||||
if id == max_update_id { break }
|
||||
if id == max_update_id {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
println!("database created in {:.2?} at: {:?}", start.elapsed(), command.database_path);
|
||||
println!(
|
||||
"database created in {:.2?} at: {:?}",
|
||||
start.elapsed(),
|
||||
command.database_path
|
||||
);
|
||||
|
||||
if let Some(path) = command.compact_to_path {
|
||||
let start = Instant::now();
|
||||
let _file = database.copy_and_compact_to_path(&path)?;
|
||||
println!("database compacted in {:.2?} at: {:?}", start.elapsed(), path);
|
||||
println!(
|
||||
"database compacted in {:.2?} at: {:?}",
|
||||
start.elapsed(),
|
||||
path
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -182,7 +196,10 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
|
||||
let mut highlighted = false;
|
||||
|
||||
for range in ranges.windows(2) {
|
||||
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
|
||||
let [start, end] = match range {
|
||||
[start, end] => [*start, *end],
|
||||
_ => unreachable!(),
|
||||
};
|
||||
if highlighted {
|
||||
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
|
||||
}
|
||||
@ -221,12 +238,14 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
|
||||
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
|
||||
|
||||
match byte_indexes.entry(byte_index) {
|
||||
Entry::Vacant(entry) => { entry.insert(byte_length); },
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(byte_length);
|
||||
}
|
||||
Entry::Occupied(mut entry) => {
|
||||
if *entry.get() < byte_length {
|
||||
entry.insert(byte_length);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -252,22 +271,23 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
|
||||
/// ```
|
||||
fn crop_text(
|
||||
text: &str,
|
||||
highlights: impl IntoIterator<Item=Highlight>,
|
||||
highlights: impl IntoIterator<Item = Highlight>,
|
||||
context: usize,
|
||||
) -> (String, Vec<Highlight>)
|
||||
{
|
||||
) -> (String, Vec<Highlight>) {
|
||||
let mut highlights = highlights.into_iter().peekable();
|
||||
|
||||
let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
|
||||
let char_index = highlights
|
||||
.peek()
|
||||
.map(|m| m.char_index as usize)
|
||||
.unwrap_or(0);
|
||||
let start = char_index.saturating_sub(context);
|
||||
let text = text.chars().skip(start).take(context * 2).collect();
|
||||
|
||||
let highlights = highlights
|
||||
.take_while(|m| {
|
||||
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
|
||||
})
|
||||
.map(|highlight| {
|
||||
Highlight { char_index: highlight.char_index - start as u16, ..highlight }
|
||||
.take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2))
|
||||
.map(|highlight| Highlight {
|
||||
char_index: highlight.char_index - start as u16,
|
||||
..highlight
|
||||
})
|
||||
.collect();
|
||||
|
||||
@ -276,7 +296,9 @@ fn crop_text(
|
||||
|
||||
fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
|
||||
let env = &database.env;
|
||||
let index = database.open_index(INDEX_NAME).expect("Could not find index");
|
||||
let index = database
|
||||
.open_index(INDEX_NAME)
|
||||
.expect("Could not find index");
|
||||
|
||||
let reader = env.read_txn().unwrap();
|
||||
let schema = index.main.schema(&reader)?;
|
||||
@ -312,10 +334,15 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
|
||||
(true, filter)
|
||||
};
|
||||
|
||||
let attr = schema.attribute(&filter).expect("Could not find filtered attribute");
|
||||
let attr = schema
|
||||
.attribute(&filter)
|
||||
.expect("Could not find filtered attribute");
|
||||
|
||||
builder.with_filter(move |document_id| {
|
||||
let string: String = ref_index.document_attribute(ref_reader, document_id, attr).unwrap().unwrap();
|
||||
let string: String = ref_index
|
||||
.document_attribute(ref_reader, document_id, attr)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
(string == "true") == positive
|
||||
});
|
||||
}
|
||||
@ -326,8 +353,8 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
|
||||
|
||||
let number_of_documents = documents.len();
|
||||
for mut doc in documents {
|
||||
|
||||
doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
|
||||
doc.highlights
|
||||
.sort_unstable_by_key(|m| (m.char_index, m.char_length));
|
||||
|
||||
let start_retrieve = Instant::now();
|
||||
let result = index.document::<Document>(&reader, Some(&fields), doc.id);
|
||||
@ -340,15 +367,18 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
|
||||
print!("{}: ", name);
|
||||
|
||||
let attr = schema.attribute(&name).unwrap();
|
||||
let highlights = doc.highlights.iter()
|
||||
.filter(|m| SchemaAttr::new(m.attribute) == attr)
|
||||
.cloned();
|
||||
let (text, highlights) = crop_text(&text, highlights, command.char_context);
|
||||
let highlights = doc
|
||||
.highlights
|
||||
.iter()
|
||||
.filter(|m| SchemaAttr::new(m.attribute) == attr)
|
||||
.cloned();
|
||||
let (text, highlights) =
|
||||
crop_text(&text, highlights, command.char_context);
|
||||
let areas = create_highlight_areas(&text, &highlights);
|
||||
display_highlights(&text, &areas)?;
|
||||
println!();
|
||||
}
|
||||
},
|
||||
}
|
||||
Ok(None) => eprintln!("missing document"),
|
||||
Err(e) => eprintln!("{}", e),
|
||||
}
|
||||
@ -366,12 +396,19 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
|
||||
println!();
|
||||
}
|
||||
|
||||
eprintln!("whole documents fields retrieve took {:.2?}", retrieve_duration);
|
||||
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
|
||||
},
|
||||
eprintln!(
|
||||
"whole documents fields retrieve took {:.2?}",
|
||||
retrieve_duration
|
||||
);
|
||||
eprintln!(
|
||||
"===== Found {} results in {:.2?} =====",
|
||||
number_of_documents,
|
||||
start_total.elapsed()
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
println!("Error: {:?}", err);
|
||||
break
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,5 @@
|
||||
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
|
||||
use once_cell::sync::OnceCell;
|
||||
use levenshtein_automata::{
|
||||
LevenshteinAutomatonBuilder as LevBuilder,
|
||||
DFA,
|
||||
};
|
||||
|
||||
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
|
||||
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
|
||||
@ -15,30 +12,30 @@ enum PrefixSetting {
|
||||
}
|
||||
|
||||
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
|
||||
use PrefixSetting::{Prefix, NoPrefix};
|
||||
use PrefixSetting::{NoPrefix, Prefix};
|
||||
|
||||
match query.len() {
|
||||
0 ..= 4 => {
|
||||
0..=4 => {
|
||||
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
5 ..= 8 => {
|
||||
}
|
||||
5..=8 => {
|
||||
let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
}
|
||||
_ => {
|
||||
let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,14 +6,14 @@ use std::vec;
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use levenshtein_automata::DFA;
|
||||
use meilidb_tokenizer::{split_query_string, is_cjk};
|
||||
use meilidb_tokenizer::{is_cjk, split_query_string};
|
||||
|
||||
use crate::store;
|
||||
use crate::error::MResult;
|
||||
use crate::store;
|
||||
|
||||
use self::dfa::{build_dfa, build_prefix_dfa};
|
||||
use self::query_enhancer::QueryEnhancerBuilder;
|
||||
pub use self::query_enhancer::QueryEnhancer;
|
||||
use self::query_enhancer::QueryEnhancerBuilder;
|
||||
|
||||
const NGRAMS: usize = 3;
|
||||
|
||||
@ -27,14 +27,9 @@ impl AutomatonProducer {
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
) -> MResult<(AutomatonProducer, QueryEnhancer)>
|
||||
{
|
||||
let (automatons, query_enhancer) = generate_automatons(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
|
||||
let (automatons, query_enhancer) =
|
||||
generate_automatons(reader, query, main_store, synonyms_store)?;
|
||||
|
||||
Ok((AutomatonProducer { automatons }, query_enhancer))
|
||||
}
|
||||
@ -112,8 +107,7 @@ fn generate_automatons(
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
synonym_store: store::Synonyms,
|
||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)>
|
||||
{
|
||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||
@ -130,7 +124,6 @@ fn generate_automatons(
|
||||
let mut original_automatons = Vec::new();
|
||||
let mut original_words = query_words.iter().peekable();
|
||||
while let Some(word) = original_words.next() {
|
||||
|
||||
let has_following_word = original_words.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
||||
|
||||
@ -148,29 +141,33 @@ fn generate_automatons(
|
||||
for n in 1..=NGRAMS {
|
||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||
while let Some((query_index, ngram_slice)) = ngrams.next() {
|
||||
|
||||
let query_range = query_index..query_index + n;
|
||||
let ngram_nb_words = ngram_slice.len();
|
||||
let ngram = ngram_slice.join(" ");
|
||||
|
||||
let has_following_word = ngrams.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
let not_prefix_dfa =
|
||||
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
|
||||
// automaton of synonyms of the ngrams
|
||||
let normalized = normalize_str(&ngram);
|
||||
let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) };
|
||||
let lev = if not_prefix_dfa {
|
||||
build_dfa(&normalized)
|
||||
} else {
|
||||
build_prefix_dfa(&normalized)
|
||||
};
|
||||
|
||||
let mut stream = synonyms.search(&lev).into_stream();
|
||||
while let Some(base) = stream.next() {
|
||||
|
||||
// only trigger alternatives when the last word has been typed
|
||||
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
||||
let base = std::str::from_utf8(base).unwrap();
|
||||
let base_nb_words = split_query_string(base).count();
|
||||
if ngram_nb_words != base_nb_words { continue }
|
||||
if ngram_nb_words != base_nb_words {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
|
||||
|
||||
let mut stream = synonyms.into_stream();
|
||||
while let Some(synonyms) = stream.next() {
|
||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
||||
@ -178,7 +175,11 @@ fn generate_automatons(
|
||||
let nb_synonym_words = synonyms_words.len();
|
||||
|
||||
let real_query_index = automaton_index;
|
||||
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
|
||||
enhancer_builder.declare(
|
||||
query_range.clone(),
|
||||
real_query_index,
|
||||
&synonyms_words,
|
||||
);
|
||||
|
||||
for synonym in synonyms_words {
|
||||
let automaton = if nb_synonym_words == 1 {
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
use std::ops::Range;
|
||||
use std::cmp::Ordering::{Less, Greater, Equal};
|
||||
|
||||
/// Return `true` if the specified range can accept the given replacements words.
|
||||
/// Returns `false` if the replacements words are already present in the original query
|
||||
@ -34,13 +34,14 @@ use std::cmp::Ordering::{Less, Greater, Equal};
|
||||
// [new york city]
|
||||
//
|
||||
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
where
|
||||
S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
{
|
||||
if words.len() <= range.len() {
|
||||
// there is fewer or equal replacement words
|
||||
// than there is already in the replaced range
|
||||
return false
|
||||
return false;
|
||||
}
|
||||
|
||||
// retrieve the part to rewrite but with the length
|
||||
@ -49,7 +50,9 @@ where S: AsRef<str>,
|
||||
|
||||
// check if the original query doesn't already contain
|
||||
// the replacement words
|
||||
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
|
||||
!original
|
||||
.map(AsRef::as_ref)
|
||||
.eq(words.iter().map(AsRef::as_ref))
|
||||
}
|
||||
|
||||
type Origin = usize;
|
||||
@ -68,11 +71,20 @@ impl FakeIntervalTree {
|
||||
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
||||
let element = self.intervals.binary_search_by(|(r, _)| {
|
||||
if point >= r.start {
|
||||
if point < r.end { Equal } else { Less }
|
||||
} else { Greater }
|
||||
if point < r.end {
|
||||
Equal
|
||||
} else {
|
||||
Less
|
||||
}
|
||||
} else {
|
||||
Greater
|
||||
}
|
||||
});
|
||||
|
||||
let n = match element { Ok(n) => n, Err(n) => n };
|
||||
let n = match element {
|
||||
Ok(n) => n,
|
||||
Err(n) => n,
|
||||
};
|
||||
|
||||
match self.intervals.get(n) {
|
||||
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
||||
@ -91,9 +103,13 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
||||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
||||
// we initialize origins query indices based on their positions
|
||||
let origins: Vec<_> = (0..query.len() + 1).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
|
||||
|
||||
QueryEnhancerBuilder { query, origins, real_to_origin }
|
||||
QueryEnhancerBuilder {
|
||||
query,
|
||||
origins,
|
||||
real_to_origin,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the final real to origin query indices mapping.
|
||||
@ -101,12 +117,12 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
||||
/// `range` is the original words range that this `replacement` words replace
|
||||
/// and `real` is the first real query index of these replacement words.
|
||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
||||
where T: AsRef<str>,
|
||||
where
|
||||
T: AsRef<str>,
|
||||
{
|
||||
// check if the range of original words
|
||||
// can be rewritten with the replacement words
|
||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
||||
|
||||
// this range can be replaced so we need to
|
||||
// modify the origins accordingly
|
||||
let offset = replacement.len() - range.len();
|
||||
@ -126,7 +142,8 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
||||
// we need to pad real query indices
|
||||
let real_range = real..real + replacement.len().max(range.len());
|
||||
let real_length = replacement.len();
|
||||
self.real_to_origin.push((real_range, (range.start, real_length)));
|
||||
self.real_to_origin
|
||||
.push((real_range, (range.start, real_length)));
|
||||
}
|
||||
|
||||
pub fn build(self) -> QueryEnhancer {
|
||||
@ -148,10 +165,10 @@ impl QueryEnhancer {
|
||||
let real = real as usize;
|
||||
|
||||
// query the fake interval tree with the real query index
|
||||
let (range, (origin, real_length)) =
|
||||
self.real_to_origin
|
||||
.query(real)
|
||||
.expect("real has never been declared");
|
||||
let (range, (origin, real_length)) = self
|
||||
.real_to_origin
|
||||
.query(real)
|
||||
.expect("real has never been declared");
|
||||
|
||||
// if `real` is the end bound of the range
|
||||
if (range.start + real_length - 1) == real {
|
||||
@ -160,7 +177,10 @@ impl QueryEnhancer {
|
||||
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
|
||||
let len = slice[1] - slice[0];
|
||||
count = count.saturating_sub(len);
|
||||
if count == 0 { new_origin = origin + i; break }
|
||||
if count == 0 {
|
||||
new_origin = origin + i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let n = real - range.start;
|
||||
@ -168,15 +188,20 @@ impl QueryEnhancer {
|
||||
let end = self.origins[new_origin + 1];
|
||||
let remaining = (end - start) - n;
|
||||
|
||||
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
|
||||
|
||||
Range {
|
||||
start: (start + n) as u32,
|
||||
end: (start + n + remaining) as u32,
|
||||
}
|
||||
} else {
|
||||
// just return the origin along with
|
||||
// the real position of the word
|
||||
let n = real as usize - range.start;
|
||||
let origin = self.origins[origin];
|
||||
|
||||
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
|
||||
Range {
|
||||
start: (origin + n) as u32,
|
||||
end: (origin + n + 1) as u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -382,16 +407,16 @@ mod tests {
|
||||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
||||
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::cmp::Ordering;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct DocumentId;
|
||||
|
@ -1,8 +1,8 @@
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use sdset::Set;
|
||||
use slice_group_by::GroupBy;
|
||||
use meilidb_schema::SchemaAttr;
|
||||
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
@ -13,8 +13,7 @@ fn number_exact_matches(
|
||||
attribute: &[u16],
|
||||
is_exact: &[bool],
|
||||
fields_counts: &Set<(SchemaAttr, u64)>,
|
||||
) -> usize
|
||||
{
|
||||
) -> usize {
|
||||
let mut count = 0;
|
||||
let mut index = 0;
|
||||
|
||||
@ -22,12 +21,16 @@ fn number_exact_matches(
|
||||
let len = group.len();
|
||||
|
||||
let mut found_exact = false;
|
||||
for (pos, _) in is_exact[index..index + len].iter().filter(|x| **x).enumerate() {
|
||||
for (pos, _) in is_exact[index..index + len]
|
||||
.iter()
|
||||
.filter(|x| **x)
|
||||
.enumerate()
|
||||
{
|
||||
found_exact = true;
|
||||
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
|
||||
let (_, count) = fields_counts[pos];
|
||||
if count == 1 {
|
||||
return usize::max_value()
|
||||
return usize::max_value();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -81,18 +84,18 @@ mod tests {
|
||||
#[test]
|
||||
fn easy_case() {
|
||||
let doc0 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[false];
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[false];
|
||||
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
@ -108,18 +111,18 @@ mod tests {
|
||||
#[test]
|
||||
fn basic() {
|
||||
let doc0 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
|
@ -1,24 +1,20 @@
|
||||
mod sum_of_typos;
|
||||
mod document_id;
|
||||
mod exact;
|
||||
mod number_of_words;
|
||||
mod words_proximity;
|
||||
mod sort_by_attr;
|
||||
mod sum_of_typos;
|
||||
mod sum_of_words_attribute;
|
||||
mod sum_of_words_position;
|
||||
mod exact;
|
||||
mod sort_by_attr;
|
||||
mod document_id;
|
||||
mod words_proximity;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use crate::RawDocument;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
pub use self::{
|
||||
sum_of_typos::SumOfTypos,
|
||||
number_of_words::NumberOfWords,
|
||||
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
|
||||
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
|
||||
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
|
||||
words_proximity::WordsProximity,
|
||||
sum_of_words_attribute::SumOfWordsAttribute,
|
||||
sum_of_words_position::SumOfWordsPosition,
|
||||
exact::Exact,
|
||||
sort_by_attr::SortByAttr,
|
||||
document_id::DocumentId,
|
||||
};
|
||||
|
||||
pub trait Criterion: Send + Sync {
|
||||
@ -62,17 +58,18 @@ impl<T: Criterion + ?Sized> Criterion for Box<T> {
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct CriteriaBuilder<'a> {
|
||||
inner: Vec<Box<dyn Criterion + 'a>>
|
||||
inner: Vec<Box<dyn Criterion + 'a>>,
|
||||
}
|
||||
|
||||
impl<'a> CriteriaBuilder<'a>
|
||||
{
|
||||
impl<'a> CriteriaBuilder<'a> {
|
||||
pub fn new() -> CriteriaBuilder<'a> {
|
||||
CriteriaBuilder { inner: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
|
||||
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
|
||||
CriteriaBuilder {
|
||||
inner: Vec::with_capacity(capacity),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reserve(&mut self, additional: usize) {
|
||||
@ -80,14 +77,16 @@ impl<'a> CriteriaBuilder<'a>
|
||||
}
|
||||
|
||||
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
|
||||
where C: Criterion,
|
||||
where
|
||||
C: Criterion,
|
||||
{
|
||||
self.push(criterion);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push<C: 'a>(&mut self, criterion: C)
|
||||
where C: Criterion,
|
||||
where
|
||||
C: Criterion,
|
||||
{
|
||||
self.inner.push(Box::new(criterion));
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn number_of_query_words(query_index: &[u32]) -> usize {
|
||||
|
@ -2,9 +2,9 @@ use std::cmp::Ordering;
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use meilidb_schema::{Schema, SchemaAttr};
|
||||
use crate::criterion::Criterion;
|
||||
use crate::{RawDocument, RankedMap};
|
||||
use crate::{RankedMap, RawDocument};
|
||||
use meilidb_schema::{Schema, SchemaAttr};
|
||||
|
||||
/// An helper struct that permit to sort documents by
|
||||
/// some of their stored attributes.
|
||||
@ -51,8 +51,7 @@ impl<'a> SortByAttr<'a> {
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||
SortByAttr::new(ranked_map, schema, attr_name, false)
|
||||
}
|
||||
|
||||
@ -60,8 +59,7 @@ impl<'a> SortByAttr<'a> {
|
||||
ranked_map: &'a RankedMap,
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||
SortByAttr::new(ranked_map, schema, attr_name, true)
|
||||
}
|
||||
|
||||
@ -70,8 +68,7 @@ impl<'a> SortByAttr<'a> {
|
||||
schema: &Schema,
|
||||
attr_name: &str,
|
||||
reversed: bool,
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
||||
{
|
||||
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||
let attr = match schema.attribute(attr_name) {
|
||||
Some(attr) => attr,
|
||||
None => return Err(SortByAttrError::AttributeNotFound),
|
||||
@ -81,7 +78,11 @@ impl<'a> SortByAttr<'a> {
|
||||
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
|
||||
}
|
||||
|
||||
Ok(SortByAttr { ranked_map, attr, reversed })
|
||||
Ok(SortByAttr {
|
||||
ranked_map,
|
||||
attr,
|
||||
reversed,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@ -93,11 +94,15 @@ impl<'a> Criterion for SortByAttr<'a> {
|
||||
match (lhs, rhs) {
|
||||
(Some(lhs), Some(rhs)) => {
|
||||
let order = lhs.cmp(&rhs);
|
||||
if self.reversed { order.reverse() } else { order }
|
||||
},
|
||||
(None, Some(_)) => Ordering::Greater,
|
||||
(Some(_), None) => Ordering::Less,
|
||||
(None, None) => Ordering::Equal,
|
||||
if self.reversed {
|
||||
order.reverse()
|
||||
} else {
|
||||
order
|
||||
}
|
||||
}
|
||||
(None, Some(_)) => Ordering::Greater,
|
||||
(Some(_), None) => Ordering::Less,
|
||||
(None, None) => Ordering::Equal,
|
||||
}
|
||||
}
|
||||
|
||||
@ -122,4 +127,4 @@ impl fmt::Display for SortByAttrError {
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SortByAttrError { }
|
||||
impl Error for SortByAttrError {}
|
||||
|
@ -11,10 +11,10 @@ use crate::RawDocument;
|
||||
#[inline]
|
||||
fn custom_log10(n: u8) -> f32 {
|
||||
match n {
|
||||
0 => 0.0, // log(1)
|
||||
1 => 0.30102, // log(2)
|
||||
2 => 0.47712, // log(3)
|
||||
3 => 0.60205, // log(4)
|
||||
0 => 0.0, // log(1)
|
||||
1 => 0.30102, // log(2)
|
||||
2 => 0.47712, // log(3)
|
||||
3 => 0.60205, // log(4)
|
||||
_ => panic!("invalid number"),
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::{self, Ordering};
|
||||
use slice_group_by::GroupBy;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::{self, Ordering};
|
||||
|
||||
const MAX_DISTANCE: u16 = 8;
|
||||
|
||||
@ -19,7 +19,9 @@ fn index_proximity(lhs: u16, rhs: u16) -> u16 {
|
||||
}
|
||||
|
||||
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
|
||||
if lattr != rattr { return MAX_DISTANCE }
|
||||
if lattr != rattr {
|
||||
return MAX_DISTANCE;
|
||||
}
|
||||
index_proximity(lwi, rwi)
|
||||
}
|
||||
|
||||
@ -42,15 +44,18 @@ fn matches_proximity(
|
||||
distance: &[u8],
|
||||
attribute: &[u16],
|
||||
word_index: &[u16],
|
||||
) -> u16
|
||||
{
|
||||
) -> u16 {
|
||||
let mut query_index_groups = query_index.linear_group();
|
||||
let mut proximity = 0;
|
||||
let mut index = 0;
|
||||
|
||||
let get_attr_wi = |index: usize, group_len: usize| {
|
||||
// retrieve the first distance group (with the lowest values)
|
||||
let len = distance[index..index + group_len].linear_group().next().unwrap().len();
|
||||
let len = distance[index..index + group_len]
|
||||
.linear_group()
|
||||
.next()
|
||||
.unwrap()
|
||||
.len();
|
||||
|
||||
let rattr = &attribute[index..index + len];
|
||||
let rwi = &word_index[index..index + len];
|
||||
@ -110,7 +115,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn three_different_attributes() {
|
||||
|
||||
// "soup" "of the" "the day"
|
||||
//
|
||||
// { id: 0, attr: 0, attr_index: 0 }
|
||||
@ -120,19 +124,21 @@ mod tests {
|
||||
// { id: 3, attr: 3, attr_index: 1 }
|
||||
|
||||
let query_index = &[0, 1, 2, 2, 3];
|
||||
let distance = &[0, 0, 0, 0, 0];
|
||||
let attribute = &[0, 1, 1, 2, 3];
|
||||
let word_index = &[0, 0, 1, 0, 1];
|
||||
let distance = &[0, 0, 0, 0, 0];
|
||||
let attribute = &[0, 1, 1, 2, 3];
|
||||
let word_index = &[0, 0, 1, 0, 1];
|
||||
|
||||
// soup -> of = 8
|
||||
// + of -> the = 1
|
||||
// + the -> day = 8 (not 1)
|
||||
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
|
||||
assert_eq!(
|
||||
matches_proximity(query_index, distance, attribute, word_index),
|
||||
17
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn two_different_attributes() {
|
||||
|
||||
// "soup day" "soup of the day"
|
||||
//
|
||||
// { id: 0, attr: 0, attr_index: 0 }
|
||||
@ -143,13 +149,16 @@ mod tests {
|
||||
// { id: 3, attr: 1, attr_index: 3 }
|
||||
|
||||
let query_index = &[0, 0, 1, 2, 3, 3];
|
||||
let distance = &[0, 0, 0, 0, 0, 0];
|
||||
let attribute = &[0, 1, 1, 1, 0, 1];
|
||||
let word_index = &[0, 0, 1, 2, 1, 3];
|
||||
let distance = &[0, 0, 0, 0, 0, 0];
|
||||
let attribute = &[0, 1, 1, 1, 0, 1];
|
||||
let word_index = &[0, 0, 1, 2, 1, 3];
|
||||
|
||||
// soup -> of = 1
|
||||
// + of -> the = 1
|
||||
// + the -> day = 1
|
||||
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
|
||||
assert_eq!(
|
||||
matches_proximity(query_index, distance, attribute, word_index),
|
||||
3
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -1,13 +1,13 @@
|
||||
use std::collections::hash_map::{HashMap, Entry};
|
||||
use std::collections::hash_map::{Entry, HashMap};
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{fs, thread};
|
||||
|
||||
use zlmdb::{Result as ZResult, CompactionOption};
|
||||
use zlmdb::types::{Str, Unit};
|
||||
use crossbeam_channel::Receiver;
|
||||
use log::{debug, error};
|
||||
use zlmdb::types::{Str, Unit};
|
||||
use zlmdb::{CompactionOption, Result as ZResult};
|
||||
|
||||
use crate::{store, update, Index, MResult};
|
||||
|
||||
@ -32,20 +32,32 @@ fn update_awaiter(
|
||||
loop {
|
||||
let mut writer = match env.write_txn() {
|
||||
Ok(writer) => writer,
|
||||
Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break }
|
||||
Err(e) => {
|
||||
error!("LMDB writer transaction begin failed: {}", e);
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
match update::update_task(&mut writer, index.clone()) {
|
||||
Ok(Some(status)) => {
|
||||
if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) }
|
||||
if let Err(e) = writer.commit() {
|
||||
error!("update transaction failed: {}", e)
|
||||
}
|
||||
|
||||
if let Some(ref callback) = *update_fn.load() {
|
||||
(callback)(status);
|
||||
}
|
||||
},
|
||||
}
|
||||
// no more updates to handle for now
|
||||
Ok(None) => { debug!("no more updates"); writer.abort(); break },
|
||||
Err(e) => { error!("update task failed: {}", e); writer.abort() },
|
||||
Ok(None) => {
|
||||
debug!("no more updates");
|
||||
writer.abort();
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("update task failed: {}", e);
|
||||
writer.abort()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -76,14 +88,16 @@ impl Database {
|
||||
// open the previously aggregated indexes
|
||||
let mut indexes = HashMap::new();
|
||||
for index_name in must_open {
|
||||
|
||||
let (sender, receiver) = crossbeam_channel::bounded(100);
|
||||
let index = match store::open(&env, &index_name, sender.clone())? {
|
||||
Some(index) => index,
|
||||
None => {
|
||||
log::warn!("the index {} doesn't exist or has not all the databases", index_name);
|
||||
log::warn!(
|
||||
"the index {} doesn't exist or has not all the databases",
|
||||
index_name
|
||||
);
|
||||
continue;
|
||||
},
|
||||
}
|
||||
};
|
||||
let update_fn = Arc::new(ArcSwapFn::empty());
|
||||
|
||||
@ -100,10 +114,18 @@ impl Database {
|
||||
sender.send(()).unwrap();
|
||||
|
||||
let result = indexes.insert(index_name, (index, update_fn, handle));
|
||||
assert!(result.is_none(), "The index should not have been already open");
|
||||
assert!(
|
||||
result.is_none(),
|
||||
"The index should not have been already open"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(Database { env, common_store, indexes_store, indexes: RwLock::new(indexes) })
|
||||
Ok(Database {
|
||||
env,
|
||||
common_store,
|
||||
indexes_store,
|
||||
indexes: RwLock::new(indexes),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open_index(&self, name: impl AsRef<str>) -> Option<Index> {
|
||||
@ -152,7 +174,7 @@ impl Database {
|
||||
let update_fn = Some(Arc::new(update_fn));
|
||||
current_update_fn.swap(update_fn);
|
||||
true
|
||||
},
|
||||
}
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
@ -160,7 +182,10 @@ impl Database {
|
||||
pub fn unset_update_callback(&self, name: impl AsRef<str>) -> bool {
|
||||
let indexes_lock = self.indexes.read().unwrap();
|
||||
match indexes_lock.get(name.as_ref()) {
|
||||
Some((_, current_update_fn, _)) => { current_update_fn.swap(None); true },
|
||||
Some((_, current_update_fn, _)) => {
|
||||
current_update_fn.swap(None);
|
||||
true
|
||||
}
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::hash::Hash;
|
||||
use hashbrown::HashMap;
|
||||
use std::hash::Hash;
|
||||
|
||||
pub struct DistinctMap<K> {
|
||||
inner: HashMap<K, usize>,
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::{error, fmt, io};
|
||||
use crate::serde::{DeserializerError, SerializerError};
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
use crate::serde::{SerializerError, DeserializerError};
|
||||
use std::{error, fmt, io};
|
||||
|
||||
pub type MResult<T> = Result<T, Error>;
|
||||
|
||||
@ -90,7 +90,7 @@ impl fmt::Display for Error {
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for Error { }
|
||||
impl error::Error for Error {}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum UnsupportedOperation {
|
||||
|
@ -1,7 +1,9 @@
|
||||
#[cfg(test)]
|
||||
#[macro_use] extern crate assert_matches;
|
||||
#[macro_use]
|
||||
extern crate assert_matches;
|
||||
|
||||
mod automaton;
|
||||
pub mod criterion;
|
||||
mod database;
|
||||
mod distinct_map;
|
||||
mod error;
|
||||
@ -9,31 +11,41 @@ mod number;
|
||||
mod query_builder;
|
||||
mod ranked_map;
|
||||
mod raw_document;
|
||||
mod reordered_attrs;
|
||||
mod update;
|
||||
pub mod criterion;
|
||||
pub mod raw_indexer;
|
||||
mod reordered_attrs;
|
||||
pub mod serde;
|
||||
pub mod store;
|
||||
mod update;
|
||||
|
||||
pub use self::database::{Database, BoxUpdateFn};
|
||||
pub use self::database::{BoxUpdateFn, Database};
|
||||
pub use self::error::{Error, MResult};
|
||||
pub use self::number::{Number, ParseNumberError};
|
||||
pub use self::ranked_map::RankedMap;
|
||||
pub use self::raw_document::RawDocument;
|
||||
pub use self::store::Index;
|
||||
pub use self::update::{UpdateStatus, UpdateResult, UpdateType};
|
||||
pub use self::update::{UpdateResult, UpdateStatus, UpdateType};
|
||||
|
||||
use ::serde::{Deserialize, Serialize};
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
use ::serde::{Serialize, Deserialize};
|
||||
|
||||
/// Represent an internally generated document unique identifier.
|
||||
///
|
||||
/// It is used to inform the database the document you want to deserialize.
|
||||
/// Helpful for custom ranking.
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(AsBytes, FromBytes)]
|
||||
#[derive(
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
Eq,
|
||||
PartialEq,
|
||||
PartialOrd,
|
||||
Ord,
|
||||
Hash,
|
||||
Serialize,
|
||||
Deserialize,
|
||||
AsBytes,
|
||||
FromBytes,
|
||||
)]
|
||||
#[repr(C)]
|
||||
pub struct DocumentId(pub u64);
|
||||
|
||||
@ -42,8 +54,7 @@ pub struct DocumentId(pub u64);
|
||||
///
|
||||
/// This is stored in the map, generated at index time,
|
||||
/// extracted and interpreted at search time.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
#[derive(AsBytes, FromBytes)]
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, AsBytes, FromBytes)]
|
||||
#[repr(C)]
|
||||
pub struct DocIndex {
|
||||
/// The document identifier where the word was found.
|
||||
@ -109,7 +120,10 @@ pub struct Document {
|
||||
impl Document {
|
||||
#[cfg(not(test))]
|
||||
fn from_raw(raw: RawDocument) -> Document {
|
||||
Document { id: raw.id, highlights: raw.highlights }
|
||||
Document {
|
||||
id: raw.id,
|
||||
highlights: raw.highlights,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@ -134,7 +148,11 @@ impl Document {
|
||||
matches.push(match_);
|
||||
}
|
||||
|
||||
Document { id: raw.id, matches, highlights: raw.highlights }
|
||||
Document {
|
||||
id: raw.id,
|
||||
matches,
|
||||
highlights: raw.highlights,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,11 @@
|
||||
use std::num::{ParseIntError, ParseFloatError};
|
||||
use std::str::FromStr;
|
||||
use std::fmt;
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::str::FromStr;
|
||||
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub enum Number {
|
||||
Unsigned(u64),
|
||||
Signed(i64),
|
||||
@ -32,7 +31,11 @@ impl FromStr for Number {
|
||||
Err(error) => error,
|
||||
};
|
||||
|
||||
Err(ParseNumberError { uint_error, int_error, float_error })
|
||||
Err(ParseNumberError {
|
||||
uint_error,
|
||||
int_error,
|
||||
float_error,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@ -46,10 +49,17 @@ pub struct ParseNumberError {
|
||||
impl fmt::Display for ParseNumberError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
if self.uint_error == self.int_error {
|
||||
write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
|
||||
write!(
|
||||
f,
|
||||
"can not parse number: {}, {}",
|
||||
self.uint_error, self.float_error
|
||||
)
|
||||
} else {
|
||||
write!(f, "can not parse number: {}, {}, {}",
|
||||
self.uint_error, self.int_error, self.float_error)
|
||||
write!(
|
||||
f,
|
||||
"can not parse number: {}, {}, {}",
|
||||
self.uint_error, self.int_error, self.float_error
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,17 +2,17 @@ use hashbrown::HashMap;
|
||||
use std::mem;
|
||||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
use std::time::{Instant, Duration};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::{GroupBy, GroupByMut};
|
||||
|
||||
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
|
||||
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
|
||||
use crate::raw_document::{RawDocument, raw_documents_from};
|
||||
use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria};
|
||||
use crate::{store, MResult, reordered_attrs::ReorderedAttrs};
|
||||
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||
use crate::raw_document::{raw_documents_from, RawDocument};
|
||||
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
|
||||
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
|
||||
|
||||
pub struct QueryBuilder<'c, 'f, 'd> {
|
||||
criteria: Criteria<'c>,
|
||||
@ -29,8 +29,7 @@ pub struct QueryBuilder<'c, 'f, 'd> {
|
||||
fn multiword_rewrite_matches(
|
||||
mut matches: Vec<(DocumentId, TmpMatch)>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
) -> SetBuf<(DocumentId, TmpMatch)>
|
||||
{
|
||||
) -> SetBuf<(DocumentId, TmpMatch)> {
|
||||
let mut padded_matches = Vec::with_capacity(matches.len());
|
||||
|
||||
// we sort the matches by word index to make them rewritable
|
||||
@ -38,7 +37,6 @@ fn multiword_rewrite_matches(
|
||||
|
||||
// for each attribute of each document
|
||||
for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
|
||||
|
||||
// padding will only be applied
|
||||
// to word indices in the same attribute
|
||||
let mut padding = 0;
|
||||
@ -47,18 +45,20 @@ fn multiword_rewrite_matches(
|
||||
// for each match at the same position
|
||||
// in this document attribute
|
||||
while let Some(same_word_index) = iter.next() {
|
||||
|
||||
// find the biggest padding
|
||||
let mut biggest = 0;
|
||||
for (id, match_) in same_word_index {
|
||||
|
||||
let mut replacement = query_enhancer.replacement(match_.query_index);
|
||||
let replacement_len = replacement.len();
|
||||
let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);
|
||||
|
||||
if let Some(query_index) = replacement.next() {
|
||||
let word_index = match_.word_index + padding as u16;
|
||||
let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..match_.clone()
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
}
|
||||
|
||||
@ -67,22 +67,30 @@ fn multiword_rewrite_matches(
|
||||
// look ahead and if there already is a match
|
||||
// corresponding to this padding word, abort the padding
|
||||
'padding: for (x, next_group) in nexts.enumerate() {
|
||||
|
||||
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let padmatch = TmpMatch { query_index, word_index, ..match_.clone() };
|
||||
let padmatch = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..match_.clone()
|
||||
};
|
||||
|
||||
for (_, nmatch_) in next_group {
|
||||
let mut rep = query_enhancer.replacement(nmatch_.query_index);
|
||||
let query_index = rep.next().unwrap();
|
||||
if query_index == padmatch.query_index {
|
||||
|
||||
if !found {
|
||||
// if we find a corresponding padding for the
|
||||
// first time we must push preceding paddings
|
||||
for (i, query_index) in replacement.clone().enumerate().take(i) {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
|
||||
for (i, query_index) in replacement.clone().enumerate().take(i)
|
||||
{
|
||||
let word_index =
|
||||
match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..match_.clone()
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
biggest = biggest.max(i + 1);
|
||||
}
|
||||
@ -97,7 +105,7 @@ fn multiword_rewrite_matches(
|
||||
|
||||
// if we do not find a corresponding padding in the
|
||||
// next groups so stop here and pad what was found
|
||||
break
|
||||
break;
|
||||
}
|
||||
|
||||
if !found {
|
||||
@ -105,7 +113,11 @@ fn multiword_rewrite_matches(
|
||||
// we must insert the entire padding
|
||||
for (i, query_index) in replacement.enumerate() {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..match_.clone()
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
}
|
||||
|
||||
@ -132,13 +144,17 @@ fn fetch_raw_documents(
|
||||
main_store: &store::Main,
|
||||
postings_lists_store: &store::PostingsLists,
|
||||
documents_fields_counts_store: &store::DocumentsFieldsCounts,
|
||||
) -> MResult<Vec<RawDocument>>
|
||||
{
|
||||
) -> MResult<Vec<RawDocument>> {
|
||||
let mut matches = Vec::new();
|
||||
let mut highlights = Vec::new();
|
||||
|
||||
for automaton in automatons {
|
||||
let Automaton { index, is_exact, query_len, .. } = automaton;
|
||||
let Automaton {
|
||||
index,
|
||||
is_exact,
|
||||
query_len,
|
||||
..
|
||||
} = automaton;
|
||||
let dfa = automaton.dfa();
|
||||
|
||||
let words = match main_store.words_fst(reader)? {
|
||||
@ -210,8 +226,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
postings_lists: store::PostingsLists,
|
||||
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||
synonyms: store::Synonyms,
|
||||
) -> QueryBuilder<'c, 'f, 'd>
|
||||
{
|
||||
) -> QueryBuilder<'c, 'f, 'd> {
|
||||
QueryBuilder::with_criteria(
|
||||
main,
|
||||
postings_lists,
|
||||
@ -227,8 +242,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||
synonyms: store::Synonyms,
|
||||
criteria: Criteria<'c>,
|
||||
) -> QueryBuilder<'c, 'f, 'd>
|
||||
{
|
||||
) -> QueryBuilder<'c, 'f, 'd> {
|
||||
QueryBuilder {
|
||||
criteria,
|
||||
searchable_attrs: None,
|
||||
@ -245,7 +259,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
|
||||
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
pub fn with_filter<F>(&mut self, function: F)
|
||||
where F: Fn(DocumentId) -> bool + 'f,
|
||||
where
|
||||
F: Fn(DocumentId) -> bool + 'f,
|
||||
{
|
||||
self.filter = Some(Box::new(function))
|
||||
}
|
||||
@ -255,13 +270,16 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
}
|
||||
|
||||
pub fn with_distinct<F, K>(&mut self, function: F, size: usize)
|
||||
where F: Fn(DocumentId) -> Option<u64> + 'd,
|
||||
where
|
||||
F: Fn(DocumentId) -> Option<u64> + 'd,
|
||||
{
|
||||
self.distinct = Some((Box::new(function), size))
|
||||
}
|
||||
|
||||
pub fn add_searchable_attribute(&mut self, attribute: u16) {
|
||||
let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new);
|
||||
let reorders = self
|
||||
.searchable_attrs
|
||||
.get_or_insert_with(ReorderedAttrs::new);
|
||||
reorders.insert_attribute(attribute);
|
||||
}
|
||||
|
||||
@ -270,41 +288,36 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
reader: &zlmdb::RoTxn,
|
||||
query: &str,
|
||||
range: Range<usize>,
|
||||
) -> MResult<Vec<Document>>
|
||||
{
|
||||
) -> MResult<Vec<Document>> {
|
||||
match self.distinct {
|
||||
Some((distinct, distinct_size)) => {
|
||||
raw_query_with_distinct(
|
||||
reader,
|
||||
query,
|
||||
range,
|
||||
self.filter,
|
||||
distinct,
|
||||
distinct_size,
|
||||
self.timeout,
|
||||
self.criteria,
|
||||
self.searchable_attrs,
|
||||
self.main_store,
|
||||
self.postings_lists_store,
|
||||
self.documents_fields_counts_store,
|
||||
self.synonyms_store,
|
||||
)
|
||||
},
|
||||
None => {
|
||||
raw_query(
|
||||
reader,
|
||||
query,
|
||||
range,
|
||||
self.filter,
|
||||
self.timeout,
|
||||
self.criteria,
|
||||
self.searchable_attrs,
|
||||
self.main_store,
|
||||
self.postings_lists_store,
|
||||
self.documents_fields_counts_store,
|
||||
self.synonyms_store,
|
||||
)
|
||||
}
|
||||
Some((distinct, distinct_size)) => raw_query_with_distinct(
|
||||
reader,
|
||||
query,
|
||||
range,
|
||||
self.filter,
|
||||
distinct,
|
||||
distinct_size,
|
||||
self.timeout,
|
||||
self.criteria,
|
||||
self.searchable_attrs,
|
||||
self.main_store,
|
||||
self.postings_lists_store,
|
||||
self.documents_fields_counts_store,
|
||||
self.synonyms_store,
|
||||
),
|
||||
None => raw_query(
|
||||
reader,
|
||||
query,
|
||||
range,
|
||||
self.filter,
|
||||
self.timeout,
|
||||
self.criteria,
|
||||
self.searchable_attrs,
|
||||
self.main_store,
|
||||
self.postings_lists_store,
|
||||
self.documents_fields_counts_store,
|
||||
self.synonyms_store,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -326,7 +339,8 @@ fn raw_query<'c, FI>(
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
synonyms_store: store::Synonyms,
|
||||
) -> MResult<Vec<Document>>
|
||||
where FI: Fn(DocumentId) -> bool,
|
||||
where
|
||||
FI: Fn(DocumentId) -> bool,
|
||||
{
|
||||
// We delegate the filter work to the distinct query builder,
|
||||
// specifying a distinct rule that has no effect.
|
||||
@ -347,18 +361,14 @@ where FI: Fn(DocumentId) -> bool,
|
||||
postings_lists_store,
|
||||
documents_fields_counts_store,
|
||||
synonyms_store,
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
let start_processing = Instant::now();
|
||||
let mut raw_documents_processed = Vec::with_capacity(range.len());
|
||||
|
||||
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
let (automaton_producer, query_enhancer) =
|
||||
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
|
||||
|
||||
let mut automaton_producer = automaton_producer.into_iter();
|
||||
let mut automatons = Vec::new();
|
||||
@ -382,7 +392,7 @@ where FI: Fn(DocumentId) -> bool,
|
||||
// stop processing when time is running out
|
||||
if let Some(timeout) = timeout {
|
||||
if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
|
||||
break
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -409,20 +419,27 @@ where FI: Fn(DocumentId) -> bool,
|
||||
|
||||
// we have sort enough documents if the last document sorted is after
|
||||
// the end of the requested range, we can continue to the next criterion
|
||||
if documents_seen >= range.end { continue 'criteria }
|
||||
if documents_seen >= range.end {
|
||||
continue 'criteria;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// once we classified the documents related to the current
|
||||
// automatons we save that as the next valid result
|
||||
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
|
||||
let iter = raw_documents
|
||||
.into_iter()
|
||||
.skip(range.start)
|
||||
.take(range.len());
|
||||
raw_documents_processed.clear();
|
||||
raw_documents_processed.extend(iter);
|
||||
|
||||
// stop processing when time is running out
|
||||
if let Some(timeout) = timeout {
|
||||
if start_processing.elapsed() > timeout { break }
|
||||
if start_processing.elapsed() > timeout {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -456,18 +473,15 @@ fn raw_query_with_distinct<'c, FI, FD>(
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
synonyms_store: store::Synonyms,
|
||||
) -> MResult<Vec<Document>>
|
||||
where FI: Fn(DocumentId) -> bool,
|
||||
FD: Fn(DocumentId) -> Option<u64>,
|
||||
where
|
||||
FI: Fn(DocumentId) -> bool,
|
||||
FD: Fn(DocumentId) -> Option<u64>,
|
||||
{
|
||||
let start_processing = Instant::now();
|
||||
let mut raw_documents_processed = Vec::new();
|
||||
|
||||
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
let (automaton_producer, query_enhancer) =
|
||||
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
|
||||
|
||||
let mut automaton_producer = automaton_producer.into_iter();
|
||||
let mut automatons = Vec::new();
|
||||
@ -491,7 +505,7 @@ where FI: Fn(DocumentId) -> bool,
|
||||
// stop processing when time is running out
|
||||
if let Some(timeout) = timeout {
|
||||
if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
|
||||
break
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -528,7 +542,7 @@ where FI: Fn(DocumentId) -> bool,
|
||||
Some(filter) => {
|
||||
let entry = filter_map.entry(document.id);
|
||||
*entry.or_insert_with(|| (filter)(document.id))
|
||||
},
|
||||
}
|
||||
None => true,
|
||||
};
|
||||
|
||||
@ -543,7 +557,9 @@ where FI: Fn(DocumentId) -> bool,
|
||||
}
|
||||
|
||||
// the requested range end is reached: stop computing distinct
|
||||
if buf_distinct.len() >= range.end { break }
|
||||
if buf_distinct.len() >= range.end {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
documents_seen += group.len();
|
||||
@ -558,7 +574,9 @@ where FI: Fn(DocumentId) -> bool,
|
||||
|
||||
// we have sort enough documents if the last document sorted is after
|
||||
// the end of the requested range, we can continue to the next criterion
|
||||
if buf_distinct.len() >= range.end { continue 'criteria }
|
||||
if buf_distinct.len() >= range.end {
|
||||
continue 'criteria;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -583,14 +601,18 @@ where FI: Fn(DocumentId) -> bool,
|
||||
|
||||
if distinct_accepted && seen.len() > range.start {
|
||||
raw_documents_processed.push(document);
|
||||
if raw_documents_processed.len() == range.len() { break }
|
||||
if raw_documents_processed.len() == range.len() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// stop processing when time is running out
|
||||
if let Some(timeout) = timeout {
|
||||
if start_processing.elapsed() > timeout { break }
|
||||
if start_processing.elapsed() > timeout {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -611,20 +633,20 @@ mod tests {
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use fst::{Set, IntoStreamer};
|
||||
use fst::{IntoStreamer, Set};
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use sdset::SetBuf;
|
||||
use tempfile::TempDir;
|
||||
use meilidb_schema::SchemaAttr;
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::database::Database;
|
||||
use crate::DocIndex;
|
||||
use crate::store::Index;
|
||||
use crate::DocIndex;
|
||||
|
||||
fn set_from_stream<'f, I, S>(stream: I) -> Set
|
||||
where
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=&'a [u8]>,
|
||||
S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>,
|
||||
I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>,
|
||||
S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>,
|
||||
{
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(stream).unwrap();
|
||||
@ -687,14 +709,23 @@ mod tests {
|
||||
|
||||
let word = word.to_lowercase();
|
||||
|
||||
let alternatives = match self.index.synonyms.synonyms(&writer, word.as_bytes()).unwrap() {
|
||||
let alternatives = match self
|
||||
.index
|
||||
.synonyms
|
||||
.synonyms(&writer, word.as_bytes())
|
||||
.unwrap()
|
||||
{
|
||||
Some(alternatives) => alternatives,
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
let new = sdset_into_fstset(&new);
|
||||
let new_alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union());
|
||||
self.index.synonyms.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives).unwrap();
|
||||
let new_alternatives =
|
||||
set_from_stream(alternatives.op().add(new.into_stream()).r#union());
|
||||
self.index
|
||||
.synonyms
|
||||
.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives)
|
||||
.unwrap();
|
||||
|
||||
let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() {
|
||||
Some(synonyms) => synonyms,
|
||||
@ -702,14 +733,17 @@ mod tests {
|
||||
};
|
||||
|
||||
let synonyms_fst = insert_key(&synonyms, word.as_bytes());
|
||||
self.index.main.put_synonyms_fst(&mut writer, &synonyms_fst).unwrap();
|
||||
self.index
|
||||
.main
|
||||
.put_synonyms_fst(&mut writer, &synonyms_fst)
|
||||
.unwrap();
|
||||
|
||||
writer.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase {
|
||||
fn from_iter<I: IntoIterator<Item=(&'a str, &'a [DocIndex])>>(iter: I) -> Self {
|
||||
fn from_iter<I: IntoIterator<Item = (&'a str, &'a [DocIndex])>>(iter: I) -> Self {
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
let database = Database::open_or_create(&tempdir).unwrap();
|
||||
let index = database.create_index("default").unwrap();
|
||||
@ -724,7 +758,10 @@ mod tests {
|
||||
for (word, indexes) in iter {
|
||||
let word = word.to_lowercase().into_bytes();
|
||||
words_fst.insert(word.clone());
|
||||
postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
|
||||
postings_lists
|
||||
.entry(word)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend_from_slice(indexes);
|
||||
for idx in indexes {
|
||||
fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
|
||||
}
|
||||
@ -736,31 +773,33 @@ mod tests {
|
||||
|
||||
for (word, postings_list) in postings_lists {
|
||||
let postings_list = SetBuf::from_dirty(postings_list);
|
||||
index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
|
||||
index
|
||||
.postings_lists
|
||||
.put_postings_list(&mut writer, &word, &postings_list)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
for ((docid, attr, _), count) in fields_counts {
|
||||
let prev = index.documents_fields_counts
|
||||
.document_field_count(
|
||||
&mut writer,
|
||||
docid,
|
||||
SchemaAttr(attr),
|
||||
).unwrap();
|
||||
let prev = index
|
||||
.documents_fields_counts
|
||||
.document_field_count(&mut writer, docid, SchemaAttr(attr))
|
||||
.unwrap();
|
||||
|
||||
let prev = prev.unwrap_or(0);
|
||||
|
||||
index.documents_fields_counts
|
||||
.put_document_field_count(
|
||||
&mut writer,
|
||||
docid,
|
||||
SchemaAttr(attr),
|
||||
prev + count,
|
||||
).unwrap();
|
||||
index
|
||||
.documents_fields_counts
|
||||
.put_document_field_count(&mut writer, docid, SchemaAttr(attr), prev + count)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
writer.commit().unwrap();
|
||||
|
||||
TempDatabase { database, index, _tempdir: tempdir }
|
||||
TempDatabase {
|
||||
database,
|
||||
index,
|
||||
_tempdir: tempdir,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -768,8 +807,8 @@ mod tests {
|
||||
fn simple() {
|
||||
let store = TempDatabase::from_iter(vec![
|
||||
("iphone", &[doc_char_index(0, 0, 0)][..]),
|
||||
("from", &[doc_char_index(0, 1, 1)][..]),
|
||||
("apple", &[doc_char_index(0, 2, 2)][..]),
|
||||
("from", &[doc_char_index(0, 1, 1)][..]),
|
||||
("apple", &[doc_char_index(0, 2, 2)][..]),
|
||||
]);
|
||||
|
||||
let env = &store.database.env;
|
||||
@ -791,9 +830,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn simple_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("hello", &[doc_index(0, 0)][..]),
|
||||
]);
|
||||
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||
|
||||
store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
|
||||
|
||||
@ -825,9 +862,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn prefix_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("hello", &[doc_index(0, 0)][..]),
|
||||
]);
|
||||
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||
|
||||
store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
|
||||
store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
|
||||
@ -872,9 +907,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn levenshtein_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("hello", &[doc_index(0, 0)][..]),
|
||||
]);
|
||||
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||
|
||||
store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));
|
||||
|
||||
@ -907,9 +940,9 @@ mod tests {
|
||||
#[test]
|
||||
fn harder_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("hello", &[doc_index(0, 0)][..]),
|
||||
("hello", &[doc_index(0, 0)][..]),
|
||||
("bonjour", &[doc_index(1, 3)]),
|
||||
("salut", &[doc_index(2, 5)]),
|
||||
("salut", &[doc_index(2, 5)]),
|
||||
]);
|
||||
|
||||
store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"]));
|
||||
@ -987,17 +1020,22 @@ mod tests {
|
||||
/// Unique word has multi-word synonyms
|
||||
fn unique_to_multiword_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("new", &[doc_char_index(0, 0, 0)][..]),
|
||||
("york", &[doc_char_index(0, 1, 1)][..]),
|
||||
("city", &[doc_char_index(0, 2, 2)][..]),
|
||||
("new", &[doc_char_index(0, 0, 0)][..]),
|
||||
("york", &[doc_char_index(0, 1, 1)][..]),
|
||||
("city", &[doc_char_index(0, 2, 2)][..]),
|
||||
("subway", &[doc_char_index(0, 3, 3)][..]),
|
||||
|
||||
("NY", &[doc_char_index(1, 0, 0)][..]),
|
||||
("NY", &[doc_char_index(1, 0, 0)][..]),
|
||||
("subway", &[doc_char_index(1, 1, 1)][..]),
|
||||
]);
|
||||
|
||||
store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
|
||||
store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
|
||||
store.add_synonym(
|
||||
"NY",
|
||||
SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
|
||||
);
|
||||
store.add_synonym(
|
||||
"NYC",
|
||||
SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
|
||||
);
|
||||
|
||||
let env = &store.database.env;
|
||||
let reader = env.read_txn().unwrap();
|
||||
@ -1056,20 +1094,18 @@ mod tests {
|
||||
#[test]
|
||||
fn unique_to_multiword_synonyms_words_proximity() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("new", &[doc_char_index(0, 0, 0)][..]),
|
||||
("york", &[doc_char_index(0, 1, 1)][..]),
|
||||
("city", &[doc_char_index(0, 2, 2)][..]),
|
||||
("new", &[doc_char_index(0, 0, 0)][..]),
|
||||
("york", &[doc_char_index(0, 1, 1)][..]),
|
||||
("city", &[doc_char_index(0, 2, 2)][..]),
|
||||
("subway", &[doc_char_index(0, 3, 3)][..]),
|
||||
|
||||
("york", &[doc_char_index(1, 0, 0)][..]),
|
||||
("new", &[doc_char_index(1, 1, 1)][..]),
|
||||
("york", &[doc_char_index(1, 0, 0)][..]),
|
||||
("new", &[doc_char_index(1, 1, 1)][..]),
|
||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||
|
||||
("NY", &[doc_char_index(2, 0, 0)][..]),
|
||||
("NY", &[doc_char_index(2, 0, 0)][..]),
|
||||
("subway", &[doc_char_index(2, 1, 1)][..]),
|
||||
]);
|
||||
|
||||
store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"]));
|
||||
store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"]));
|
||||
|
||||
let env = &store.database.env;
|
||||
let reader = env.read_txn().unwrap();
|
||||
@ -1120,11 +1156,10 @@ mod tests {
|
||||
#[test]
|
||||
fn unique_to_multiword_synonyms_cumulative_word_index() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("NY", &[doc_char_index(0, 0, 0)][..]),
|
||||
("NY", &[doc_char_index(0, 0, 0)][..]),
|
||||
("subway", &[doc_char_index(0, 1, 1)][..]),
|
||||
|
||||
("new", &[doc_char_index(1, 0, 0)][..]),
|
||||
("york", &[doc_char_index(1, 1, 1)][..]),
|
||||
("new", &[doc_char_index(1, 0, 0)][..]),
|
||||
("york", &[doc_char_index(1, 1, 1)][..]),
|
||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||
]);
|
||||
|
||||
@ -1175,20 +1210,25 @@ mod tests {
|
||||
/// Unique word has multi-word synonyms
|
||||
fn harder_unique_to_multiword_synonyms_one() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("new", &[doc_char_index(0, 0, 0)][..]),
|
||||
("york", &[doc_char_index(0, 1, 1)][..]),
|
||||
("city", &[doc_char_index(0, 2, 2)][..]),
|
||||
("yellow", &[doc_char_index(0, 3, 3)][..]),
|
||||
("subway", &[doc_char_index(0, 4, 4)][..]),
|
||||
("broken", &[doc_char_index(0, 5, 5)][..]),
|
||||
|
||||
("NY", &[doc_char_index(1, 0, 0)][..]),
|
||||
("blue", &[doc_char_index(1, 1, 1)][..]),
|
||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||
("new", &[doc_char_index(0, 0, 0)][..]),
|
||||
("york", &[doc_char_index(0, 1, 1)][..]),
|
||||
("city", &[doc_char_index(0, 2, 2)][..]),
|
||||
("yellow", &[doc_char_index(0, 3, 3)][..]),
|
||||
("subway", &[doc_char_index(0, 4, 4)][..]),
|
||||
("broken", &[doc_char_index(0, 5, 5)][..]),
|
||||
("NY", &[doc_char_index(1, 0, 0)][..]),
|
||||
("blue", &[doc_char_index(1, 1, 1)][..]),
|
||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||
]);
|
||||
|
||||
store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
|
||||
store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
|
||||
store.add_synonym(
|
||||
"NY",
|
||||
SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
|
||||
);
|
||||
store.add_synonym(
|
||||
"NYC",
|
||||
SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
|
||||
);
|
||||
|
||||
let env = &store.database.env;
|
||||
let reader = env.read_txn().unwrap();
|
||||
@ -1249,21 +1289,26 @@ mod tests {
|
||||
/// Unique word has multi-word synonyms
|
||||
fn even_harder_unique_to_multiword_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("new", &[doc_char_index(0, 0, 0)][..]),
|
||||
("york", &[doc_char_index(0, 1, 1)][..]),
|
||||
("city", &[doc_char_index(0, 2, 2)][..]),
|
||||
("yellow", &[doc_char_index(0, 3, 3)][..]),
|
||||
("new", &[doc_char_index(0, 0, 0)][..]),
|
||||
("york", &[doc_char_index(0, 1, 1)][..]),
|
||||
("city", &[doc_char_index(0, 2, 2)][..]),
|
||||
("yellow", &[doc_char_index(0, 3, 3)][..]),
|
||||
("underground", &[doc_char_index(0, 4, 4)][..]),
|
||||
("train", &[doc_char_index(0, 5, 5)][..]),
|
||||
("broken", &[doc_char_index(0, 6, 6)][..]),
|
||||
|
||||
("NY", &[doc_char_index(1, 0, 0)][..]),
|
||||
("blue", &[doc_char_index(1, 1, 1)][..]),
|
||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||
("train", &[doc_char_index(0, 5, 5)][..]),
|
||||
("broken", &[doc_char_index(0, 6, 6)][..]),
|
||||
("NY", &[doc_char_index(1, 0, 0)][..]),
|
||||
("blue", &[doc_char_index(1, 1, 1)][..]),
|
||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||
]);
|
||||
|
||||
store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
|
||||
store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
|
||||
store.add_synonym(
|
||||
"NY",
|
||||
SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
|
||||
);
|
||||
store.add_synonym(
|
||||
"NYC",
|
||||
SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
|
||||
);
|
||||
store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
|
||||
|
||||
let env = &store.database.env;
|
||||
@ -1330,30 +1375,36 @@ mod tests {
|
||||
/// Multi-word has multi-word synonyms
|
||||
fn multiword_to_multiword_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("NY", &[doc_char_index(0, 0, 0)][..]),
|
||||
("subway", &[doc_char_index(0, 1, 1)][..]),
|
||||
|
||||
("NYC", &[doc_char_index(1, 0, 0)][..]),
|
||||
("blue", &[doc_char_index(1, 1, 1)][..]),
|
||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||
("broken", &[doc_char_index(1, 3, 3)][..]),
|
||||
|
||||
("new", &[doc_char_index(2, 0, 0)][..]),
|
||||
("york", &[doc_char_index(2, 1, 1)][..]),
|
||||
("NY", &[doc_char_index(0, 0, 0)][..]),
|
||||
("subway", &[doc_char_index(0, 1, 1)][..]),
|
||||
("NYC", &[doc_char_index(1, 0, 0)][..]),
|
||||
("blue", &[doc_char_index(1, 1, 1)][..]),
|
||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||
("broken", &[doc_char_index(1, 3, 3)][..]),
|
||||
("new", &[doc_char_index(2, 0, 0)][..]),
|
||||
("york", &[doc_char_index(2, 1, 1)][..]),
|
||||
("underground", &[doc_char_index(2, 2, 2)][..]),
|
||||
("train", &[doc_char_index(2, 3, 3)][..]),
|
||||
("broken", &[doc_char_index(2, 4, 4)][..]),
|
||||
("train", &[doc_char_index(2, 3, 3)][..]),
|
||||
("broken", &[doc_char_index(2, 4, 4)][..]),
|
||||
]);
|
||||
|
||||
store.add_synonym("new york", SetBuf::from_dirty(vec![ "NYC", "NY", "new york city" ]));
|
||||
store.add_synonym("new york city", SetBuf::from_dirty(vec![ "NYC", "NY", "new york" ]));
|
||||
store.add_synonym("underground train", SetBuf::from_dirty(vec![ "subway" ]));
|
||||
store.add_synonym(
|
||||
"new york",
|
||||
SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]),
|
||||
);
|
||||
store.add_synonym(
|
||||
"new york city",
|
||||
SetBuf::from_dirty(vec!["NYC", "NY", "new york"]),
|
||||
);
|
||||
store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"]));
|
||||
|
||||
let env = &store.database.env;
|
||||
let reader = env.read_txn().unwrap();
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "new york underground train broken", 0..20).unwrap();
|
||||
let results = builder
|
||||
.query(&reader, "new york underground train broken", 0..20)
|
||||
.unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
|
||||
@ -1390,7 +1441,9 @@ mod tests {
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "new york city underground train broken", 0..20).unwrap();
|
||||
let results = builder
|
||||
.query(&reader, "new york city underground train broken", 0..20)
|
||||
.unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
|
||||
@ -1436,14 +1489,14 @@ mod tests {
|
||||
#[test]
|
||||
fn intercrossed_multiword_synonyms() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("new", &[doc_index(0, 0)][..]),
|
||||
("york", &[doc_index(0, 1)][..]),
|
||||
("big", &[doc_index(0, 2)][..]),
|
||||
("city", &[doc_index(0, 3)][..]),
|
||||
("new", &[doc_index(0, 0)][..]),
|
||||
("york", &[doc_index(0, 1)][..]),
|
||||
("big", &[doc_index(0, 2)][..]),
|
||||
("city", &[doc_index(0, 3)][..]),
|
||||
]);
|
||||
|
||||
store.add_synonym("new york", SetBuf::from_dirty(vec![ "new york city" ]));
|
||||
store.add_synonym("new york city", SetBuf::from_dirty(vec![ "new york" ]));
|
||||
store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"]));
|
||||
store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"]));
|
||||
|
||||
let env = &store.database.env;
|
||||
let reader = env.read_txn().unwrap();
|
||||
@ -1469,16 +1522,14 @@ mod tests {
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("NY", &[doc_index(0, 0)][..]),
|
||||
("city", &[doc_index(0, 1)][..]),
|
||||
("NY", &[doc_index(0, 0)][..]),
|
||||
("city", &[doc_index(0, 1)][..]),
|
||||
("subway", &[doc_index(0, 2)][..]),
|
||||
|
||||
("NY", &[doc_index(1, 0)][..]),
|
||||
("NY", &[doc_index(1, 0)][..]),
|
||||
("subway", &[doc_index(1, 1)][..]),
|
||||
|
||||
("NY", &[doc_index(2, 0)][..]),
|
||||
("york", &[doc_index(2, 1)][..]),
|
||||
("city", &[doc_index(2, 2)][..]),
|
||||
("NY", &[doc_index(2, 0)][..]),
|
||||
("york", &[doc_index(2, 1)][..]),
|
||||
("city", &[doc_index(2, 2)][..]),
|
||||
("subway", &[doc_index(2, 3)][..]),
|
||||
]);
|
||||
|
||||
@ -1525,20 +1576,22 @@ mod tests {
|
||||
#[test]
|
||||
fn cumulative_word_indices() {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("NYC", &[doc_index(0, 0)][..]),
|
||||
("long", &[doc_index(0, 1)][..]),
|
||||
("NYC", &[doc_index(0, 0)][..]),
|
||||
("long", &[doc_index(0, 1)][..]),
|
||||
("subway", &[doc_index(0, 2)][..]),
|
||||
("cool", &[doc_index(0, 3)][..]),
|
||||
("cool", &[doc_index(0, 3)][..]),
|
||||
]);
|
||||
|
||||
store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"]));
|
||||
store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
|
||||
store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
|
||||
|
||||
let env = &store.database.env;
|
||||
let reader = env.read_txn().unwrap();
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "new york city long subway cool ", 0..20).unwrap();
|
||||
let results = builder
|
||||
.query(&reader, "new york city long subway cool ", 0..20)
|
||||
.unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
@ -1560,8 +1613,7 @@ mod tests {
|
||||
let mut store = TempDatabase::from_iter(vec![
|
||||
("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
|
||||
("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex
|
||||
|
||||
("iphone", &[doc_index(1, 0)][..]),
|
||||
("iphone", &[doc_index(1, 0)][..]),
|
||||
]);
|
||||
|
||||
store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"]));
|
||||
@ -1624,8 +1676,8 @@ mod tests {
|
||||
#[test]
|
||||
fn simple_concatenation() {
|
||||
let store = TempDatabase::from_iter(vec![
|
||||
("iphone", &[doc_index(0, 0)][..]),
|
||||
("case", &[doc_index(0, 1)][..]),
|
||||
("iphone", &[doc_index(0, 0)][..]),
|
||||
("case", &[doc_index(0, 1)][..]),
|
||||
]);
|
||||
|
||||
let env = &store.database.env;
|
||||
|
@ -2,12 +2,11 @@ use std::io::{Read, Write};
|
||||
|
||||
use hashbrown::HashMap;
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{DocumentId, Number};
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
|
||||
|
||||
|
@ -1,11 +1,11 @@
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::{TmpMatch, DocumentId, Highlight};
|
||||
use crate::{DocumentId, Highlight, TmpMatch};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RawDocument {
|
||||
@ -20,7 +20,13 @@ impl RawDocument {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
|
||||
unsafe {
|
||||
&self
|
||||
.matches
|
||||
.matches
|
||||
.query_index
|
||||
.get_unchecked(r.start..r.end)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distance(&self) -> &[u8] {
|
||||
@ -41,7 +47,13 @@ impl RawDocument {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
|
||||
unsafe {
|
||||
&self
|
||||
.matches
|
||||
.matches
|
||||
.word_index
|
||||
.get_unchecked(r.start..r.end)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_exact(&self) -> &[bool] {
|
||||
@ -55,12 +67,32 @@ impl RawDocument {
|
||||
impl fmt::Debug for RawDocument {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.write_str("RawDocument {\r\n")?;
|
||||
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
|
||||
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"query_index",
|
||||
self.query_index()
|
||||
))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"distance",
|
||||
self.distance()
|
||||
))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"attribute",
|
||||
self.attribute()
|
||||
))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"word_index",
|
||||
self.word_index()
|
||||
))?;
|
||||
f.write_fmt(format_args!(
|
||||
"{:>15}: {:^5?},\r\n",
|
||||
"is_exact",
|
||||
self.is_exact()
|
||||
))?;
|
||||
f.write_str("}")?;
|
||||
Ok(())
|
||||
}
|
||||
@ -70,8 +102,7 @@ pub fn raw_documents_from(
|
||||
matches: SetBuf<(DocumentId, TmpMatch)>,
|
||||
highlights: SetBuf<(DocumentId, Highlight)>,
|
||||
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
|
||||
) -> Vec<RawDocument>
|
||||
{
|
||||
) -> Vec<RawDocument> {
|
||||
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
|
||||
let mut matches2 = Matches::with_capacity(matches.len());
|
||||
|
||||
@ -94,10 +125,21 @@ pub fn raw_documents_from(
|
||||
}
|
||||
|
||||
let matches = Arc::new(matches2);
|
||||
docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
|
||||
let matches = SharedMatches { range, matches: matches.clone() };
|
||||
RawDocument { id, matches, highlights, fields_counts }
|
||||
}).collect()
|
||||
docs_ranges
|
||||
.into_iter()
|
||||
.map(|(id, range, highlights, fields_counts)| {
|
||||
let matches = SharedMatches {
|
||||
range,
|
||||
matches: matches.clone(),
|
||||
};
|
||||
RawDocument {
|
||||
id,
|
||||
matches,
|
||||
highlights,
|
||||
fields_counts,
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
|
@ -1,10 +1,10 @@
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::convert::TryFrom;
|
||||
|
||||
use crate::{DocIndex, DocumentId};
|
||||
use deunicode::deunicode_with_tofu;
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
|
||||
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
|
||||
use sdset::SetBuf;
|
||||
|
||||
type Word = Vec<u8>; // TODO make it be a SmallVec
|
||||
@ -60,7 +60,9 @@ impl RawIndexer {
|
||||
&mut self.docs_words,
|
||||
);
|
||||
|
||||
if !must_continue { break }
|
||||
if !must_continue {
|
||||
break;
|
||||
}
|
||||
|
||||
number_of_words += 1;
|
||||
}
|
||||
@ -70,8 +72,9 @@ impl RawIndexer {
|
||||
}
|
||||
|
||||
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
|
||||
where I: IntoIterator<Item=&'a str, IntoIter=IT>,
|
||||
IT: Iterator<Item = &'a str> + Clone,
|
||||
where
|
||||
I: IntoIterator<Item = &'a str, IntoIter = IT>,
|
||||
IT: Iterator<Item = &'a str> + Clone,
|
||||
{
|
||||
// TODO serialize this to one call to the SeqTokenizer loop
|
||||
|
||||
@ -88,14 +91,25 @@ impl RawIndexer {
|
||||
&mut self.docs_words,
|
||||
);
|
||||
|
||||
if !must_continue { break }
|
||||
if !must_continue {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
|
||||
if lowercase_text.contains(is_cjk) { return lowercase_text }
|
||||
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
|
||||
if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
|
||||
}).collect();
|
||||
let deunicoded: Vec<_> = lowercased
|
||||
.into_iter()
|
||||
.map(|lowercase_text| {
|
||||
if lowercase_text.contains(is_cjk) {
|
||||
return lowercase_text;
|
||||
}
|
||||
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
|
||||
if lowercase_text != deunicoded {
|
||||
deunicoded
|
||||
} else {
|
||||
lowercase_text
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let iter = deunicoded.iter().map(|t| t.as_str());
|
||||
|
||||
for token in SeqTokenizer::new(iter) {
|
||||
@ -108,17 +122,21 @@ impl RawIndexer {
|
||||
&mut self.docs_words,
|
||||
);
|
||||
|
||||
if !must_continue { break }
|
||||
if !must_continue {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(self) -> Indexed {
|
||||
let words_doc_indexes = self.words_doc_indexes
|
||||
let words_doc_indexes = self
|
||||
.words_doc_indexes
|
||||
.into_iter()
|
||||
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
|
||||
.collect();
|
||||
|
||||
let docs_words = self.docs_words
|
||||
let docs_words = self
|
||||
.docs_words
|
||||
.into_iter()
|
||||
.map(|(id, mut words)| {
|
||||
words.sort_unstable();
|
||||
@ -127,7 +145,10 @@ impl RawIndexer {
|
||||
})
|
||||
.collect();
|
||||
|
||||
Indexed { words_doc_indexes, docs_words }
|
||||
Indexed {
|
||||
words_doc_indexes,
|
||||
docs_words,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -138,16 +159,20 @@ fn index_token(
|
||||
word_limit: usize,
|
||||
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
|
||||
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
|
||||
) -> bool
|
||||
{
|
||||
if token.word_index >= word_limit { return false }
|
||||
) -> bool {
|
||||
if token.word_index >= word_limit {
|
||||
return false;
|
||||
}
|
||||
|
||||
match token_to_docindex(id, attr, token) {
|
||||
Some(docindex) => {
|
||||
let word = Vec::from(token.word);
|
||||
words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
|
||||
words_doc_indexes
|
||||
.entry(word.clone())
|
||||
.or_insert_with(Vec::new)
|
||||
.push(docindex);
|
||||
docs_words.entry(id).or_insert_with(Vec::new).push(word);
|
||||
},
|
||||
}
|
||||
None => return false,
|
||||
}
|
||||
|
||||
@ -183,7 +208,9 @@ mod tests {
|
||||
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
|
||||
indexer.index_text(docid, attr, text);
|
||||
|
||||
let Indexed { words_doc_indexes, .. } = indexer.build();
|
||||
let Indexed {
|
||||
words_doc_indexes, ..
|
||||
} = indexer.build();
|
||||
|
||||
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
|
||||
@ -191,7 +218,9 @@ mod tests {
|
||||
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
|
||||
|
||||
// with the ugly apostrophe...
|
||||
assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
|
||||
assert!(words_doc_indexes
|
||||
.get(&"l’éteindre".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -203,7 +232,9 @@ mod tests {
|
||||
let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
|
||||
indexer.index_text_seq(docid, attr, text);
|
||||
|
||||
let Indexed { words_doc_indexes, .. } = indexer.build();
|
||||
let Indexed {
|
||||
words_doc_indexes, ..
|
||||
} = indexer.build();
|
||||
|
||||
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
|
||||
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
|
||||
@ -211,6 +242,8 @@ mod tests {
|
||||
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
|
||||
|
||||
// with the ugly apostrophe...
|
||||
assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
|
||||
assert!(words_doc_indexes
|
||||
.get(&"l’éteindre".to_owned().into_bytes())
|
||||
.is_some());
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,10 @@ pub struct ReorderedAttrs {
|
||||
|
||||
impl ReorderedAttrs {
|
||||
pub fn new() -> ReorderedAttrs {
|
||||
ReorderedAttrs { count: 0, reorders: Vec::new() }
|
||||
ReorderedAttrs {
|
||||
count: 0,
|
||||
reorders: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_attribute(&mut self, attribute: u16) {
|
||||
|
@ -77,13 +77,18 @@ impl ser::Serializer for ConvertToNumber {
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "Option" })
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "Option" })
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
@ -91,25 +96,29 @@ impl ser::Serializer for ConvertToNumber {
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "unit struct" })
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "unit variant" })
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
@ -119,15 +128,20 @@ impl ser::Serializer for ConvertToNumber {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "newtype variant" })
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnrankableType { type_name: "sequence" })
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "sequence",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
@ -137,10 +151,11 @@ impl ser::Serializer for ConvertToNumber {
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "tuple struct" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
@ -148,10 +163,11 @@ impl ser::Serializer for ConvertToNumber {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "tuple variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
@ -161,10 +177,11 @@ impl ser::Serializer for ConvertToNumber {
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "struct" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
@ -172,9 +189,10 @@ impl ser::Serializer for ConvertToNumber {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnrankableType { type_name: "struct variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnrankableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
use serde::Serialize;
|
||||
|
||||
use super::SerializerError;
|
||||
|
||||
@ -17,7 +17,9 @@ impl ser::Serializer for ConvertToString {
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "boolean" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "boolean",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
|
||||
@ -73,13 +75,18 @@ impl ser::Serializer for ConvertToString {
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
@ -87,25 +94,29 @@ impl ser::Serializer for ConvertToString {
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "unit struct" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "unit variant" })
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
@ -115,15 +126,20 @@ impl ser::Serializer for ConvertToString {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "sequence" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "sequence",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
@ -133,10 +149,11 @@ impl ser::Serializer for ConvertToString {
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
@ -144,10 +161,11 @@ impl ser::Serializer for ConvertToString {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
@ -157,10 +175,11 @@ impl ser::Serializer for ConvertToString {
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "struct" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
@ -168,9 +187,10 @@ impl ser::Serializer for ConvertToString {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "struct variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -1,12 +1,12 @@
|
||||
use std::collections::HashSet;
|
||||
use std::io::Cursor;
|
||||
use std::{fmt, error::Error};
|
||||
use std::{error::Error, fmt};
|
||||
|
||||
use meilidb_schema::{Schema, SchemaAttr};
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
use serde_json::Deserializer as SerdeJsonDeserializer;
|
||||
use serde_json::de::IoRead as SerdeJsonIoRead;
|
||||
use serde::{de, forward_to_deserialize_any};
|
||||
use serde_json::de::IoRead as SerdeJsonIoRead;
|
||||
use serde_json::Deserializer as SerdeJsonDeserializer;
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
|
||||
use crate::store::DocumentsFields;
|
||||
use crate::DocumentId;
|
||||
@ -60,7 +60,8 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
|
||||
type Error = DeserializerError;
|
||||
|
||||
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where V: de::Visitor<'de>
|
||||
where
|
||||
V: de::Visitor<'de>,
|
||||
{
|
||||
self.deserialize_map(visitor)
|
||||
}
|
||||
@ -72,16 +73,21 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
|
||||
}
|
||||
|
||||
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where V: de::Visitor<'de>
|
||||
where
|
||||
V: de::Visitor<'de>,
|
||||
{
|
||||
let mut error = None;
|
||||
|
||||
let iter = self.documents_fields
|
||||
let iter = self
|
||||
.documents_fields
|
||||
.document_fields(self.reader, self.document_id)?
|
||||
.filter_map(|result| {
|
||||
let (attr, value) = match result {
|
||||
Ok(value) => value,
|
||||
Err(e) => { error = Some(e); return None },
|
||||
Err(e) => {
|
||||
error = Some(e);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let is_displayed = self.schema.props(attr).is_displayed();
|
||||
@ -99,7 +105,9 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
|
||||
});
|
||||
|
||||
let map_deserializer = de::value::MapDeserializer::new(iter);
|
||||
let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from);
|
||||
let result = visitor
|
||||
.visit_map(map_deserializer)
|
||||
.map_err(DeserializerError::from);
|
||||
|
||||
match error.take() {
|
||||
Some(error) => Err(error.into()),
|
||||
@ -122,7 +130,8 @@ impl<'de> de::Deserializer<'de> for Value {
|
||||
type Error = SerdeJsonError;
|
||||
|
||||
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
|
||||
where V: de::Visitor<'de>
|
||||
where
|
||||
V: de::Visitor<'de>,
|
||||
{
|
||||
self.0.deserialize_any(visitor)
|
||||
}
|
||||
|
@ -5,13 +5,14 @@ use serde::{ser, Serialize};
|
||||
use serde_json::Value;
|
||||
use siphasher::sip::SipHasher;
|
||||
|
||||
use super::{SerializerError, ConvertToString};
|
||||
use super::{ConvertToString, SerializerError};
|
||||
|
||||
pub fn extract_document_id<D>(
|
||||
identifier: &str,
|
||||
document: &D,
|
||||
) -> Result<Option<DocumentId>, SerializerError>
|
||||
where D: serde::Serialize,
|
||||
where
|
||||
D: serde::Serialize,
|
||||
{
|
||||
let serializer = ExtractDocumentId { identifier };
|
||||
document.serialize(serializer)
|
||||
@ -77,13 +78,18 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
@ -91,25 +97,29 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "unit struct" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "unit variant" })
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
@ -119,15 +129,20 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "sequence" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "sequence",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
@ -137,10 +152,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
@ -148,10 +164,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
@ -167,9 +184,8 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
let serializer = ExtractDocumentIdStructSerializer {
|
||||
identifier: self.identifier,
|
||||
document_id: None,
|
||||
@ -183,10 +199,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "struct variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@ -201,7 +218,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
self.current_key_name = Some(key);
|
||||
@ -209,7 +227,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
@ -218,9 +237,11 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V
|
||||
value: &V,
|
||||
) -> Result<(), Self::Error>
|
||||
where K: Serialize, V: Serialize,
|
||||
where
|
||||
K: Serialize,
|
||||
V: Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
|
||||
@ -252,9 +273,10 @@ impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
if self.identifier == key {
|
||||
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
|
||||
|
@ -2,9 +2,9 @@ use meilidb_schema::SchemaAttr;
|
||||
use serde::ser;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::DocumentId;
|
||||
use super::{ConvertToString, SerializerError};
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use super::{SerializerError, ConvertToString};
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct Indexer<'a> {
|
||||
pub attribute: SchemaAttr,
|
||||
@ -24,7 +24,9 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "boolean" })
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "boolean",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
|
||||
@ -83,7 +85,9 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
}
|
||||
|
||||
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
|
||||
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text);
|
||||
let number_of_words = self
|
||||
.indexer
|
||||
.index_text(self.document_id, self.attribute, text);
|
||||
Ok(Some(number_of_words))
|
||||
}
|
||||
|
||||
@ -92,14 +96,19 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "Option" })
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text);
|
||||
let number_of_words = self
|
||||
.indexer
|
||||
.index_text(self.document_id, self.attribute, &text);
|
||||
Ok(Some(number_of_words))
|
||||
}
|
||||
|
||||
@ -108,25 +117,29 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType { type_name: "unit struct" })
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "unit variant" })
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
@ -136,11 +149,14 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "newtype variant" })
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
@ -168,10 +184,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "tuple struct" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
@ -179,10 +196,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "tuple variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
@ -199,10 +217,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "struct" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
@ -210,10 +229,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnindexableType { type_name: "struct variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnindexableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@ -229,7 +249,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
@ -238,7 +259,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||
self.indexer
|
||||
.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@ -255,7 +277,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = key.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
@ -263,7 +286,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
@ -272,7 +296,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||
self.indexer
|
||||
.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@ -293,7 +318,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let key_text = key.to_owned();
|
||||
let value_text = value.serialize(ConvertToString)?;
|
||||
@ -304,7 +330,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||
self.indexer
|
||||
.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@ -321,7 +348,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.texts.push(text);
|
||||
@ -330,7 +358,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
let texts = self.texts.iter().map(String::as_str);
|
||||
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
|
||||
self.indexer
|
||||
.index_text_seq(self.document_id, self.attribute, texts);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
@ -15,19 +15,19 @@ mod extract_document_id;
|
||||
mod indexer;
|
||||
mod serializer;
|
||||
|
||||
pub use self::deserializer::{Deserializer, DeserializerError};
|
||||
pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string};
|
||||
pub use self::convert_to_string::ConvertToString;
|
||||
pub use self::convert_to_number::ConvertToNumber;
|
||||
pub use self::convert_to_string::ConvertToString;
|
||||
pub use self::deserializer::{Deserializer, DeserializerError};
|
||||
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
|
||||
pub use self::indexer::Indexer;
|
||||
pub use self::serializer::Serializer;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::{fmt, error::Error};
|
||||
use std::{error::Error, fmt};
|
||||
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
use serde::ser;
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
|
||||
use crate::{DocumentId, ParseNumberError};
|
||||
|
||||
@ -55,24 +55,24 @@ impl fmt::Display for SerializerError {
|
||||
match self {
|
||||
SerializerError::DocumentIdNotFound => {
|
||||
f.write_str("serialized document does not have an id according to the schema")
|
||||
},
|
||||
}
|
||||
SerializerError::InvalidDocumentIdType => {
|
||||
f.write_str("document identifier can only be of type string or number")
|
||||
},
|
||||
}
|
||||
SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e),
|
||||
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
|
||||
SerializerError::ParseNumber(e) => {
|
||||
write!(f, "error while trying to parse a number: {}", e)
|
||||
},
|
||||
}
|
||||
SerializerError::UnserializableType { type_name } => {
|
||||
write!(f, "{} is not a serializable type", type_name)
|
||||
},
|
||||
}
|
||||
SerializerError::UnindexableType { type_name } => {
|
||||
write!(f, "{} is not an indexable type", type_name)
|
||||
},
|
||||
}
|
||||
SerializerError::UnrankableType { type_name } => {
|
||||
write!(f, "{} types can not be used for ranking", type_name)
|
||||
},
|
||||
}
|
||||
SerializerError::Custom(s) => f.write_str(s),
|
||||
}
|
||||
}
|
||||
|
@ -1,12 +1,12 @@
|
||||
use std::collections::HashMap;
|
||||
use meilidb_schema::{Schema, SchemaAttr};
|
||||
use serde::ser;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::{DocumentId, RankedMap};
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::serde::RamDocumentStore;
|
||||
use crate::{DocumentId, RankedMap};
|
||||
|
||||
use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};
|
||||
use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
|
||||
|
||||
pub struct Serializer<'a> {
|
||||
pub schema: &'a Schema,
|
||||
@ -55,13 +55,18 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "Option" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "Option",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
@ -69,25 +74,29 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "unit struct" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "unit variant" })
|
||||
_variant: &'static str,
|
||||
) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "unit variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
@ -97,15 +106,20 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
_value: &T,
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "newtype variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "sequence" })
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "sequence",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
@ -115,10 +129,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple struct",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
@ -126,10 +141,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "tuple variant",
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
@ -147,9 +163,8 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
@ -165,10 +180,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { type_name: "struct variant" })
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "struct variant",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@ -187,7 +203,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
self.current_key_name = Some(key);
|
||||
@ -195,7 +212,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
@ -206,7 +224,9 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
|
||||
key: &K,
|
||||
value: &V,
|
||||
) -> Result<(), Self::Error>
|
||||
where K: ser::Serialize, V: ser::Serialize,
|
||||
where
|
||||
K: ser::Serialize,
|
||||
V: ser::Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
|
||||
@ -245,7 +265,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
serialize_value(
|
||||
self.schema,
|
||||
@ -274,7 +295,8 @@ fn serialize_value<T: ?Sized>(
|
||||
key: &str,
|
||||
value: &T,
|
||||
) -> Result<(), SerializerError>
|
||||
where T: ser::Serialize,
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
if let Some(attribute) = schema.attribute(key) {
|
||||
let props = schema.props(attribute);
|
||||
@ -283,7 +305,11 @@ where T: ser::Serialize,
|
||||
document_store.set_document_field(document_id, attribute, serialized);
|
||||
|
||||
if props.is_indexed() {
|
||||
let indexer = Indexer { attribute, indexer, document_id };
|
||||
let indexer = Indexer {
|
||||
attribute,
|
||||
indexer,
|
||||
document_id,
|
||||
};
|
||||
if let Some(number_of_words) = value.serialize(indexer)? {
|
||||
documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
|
||||
}
|
||||
|
@ -1,8 +1,8 @@
|
||||
use std::sync::Arc;
|
||||
use zlmdb::types::{OwnedType, ByteSlice};
|
||||
use zlmdb::Result as ZResult;
|
||||
use crate::DocumentId;
|
||||
use super::BEU64;
|
||||
use crate::DocumentId;
|
||||
use std::sync::Arc;
|
||||
use zlmdb::types::{ByteSlice, OwnedType};
|
||||
use zlmdb::Result as ZResult;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocsWords {
|
||||
@ -15,8 +15,7 @@ impl DocsWords {
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
document_id: DocumentId,
|
||||
words: &fst::Set,
|
||||
) -> ZResult<()>
|
||||
{
|
||||
) -> ZResult<()> {
|
||||
let document_id = BEU64::new(document_id.0);
|
||||
let bytes = words.as_fst().as_bytes();
|
||||
self.docs_words.put(writer, &document_id, bytes)
|
||||
@ -26,8 +25,7 @@ impl DocsWords {
|
||||
&self,
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<bool>
|
||||
{
|
||||
) -> ZResult<bool> {
|
||||
let document_id = BEU64::new(document_id.0);
|
||||
self.docs_words.delete(writer, &document_id)
|
||||
}
|
||||
@ -36,8 +34,7 @@ impl DocsWords {
|
||||
&self,
|
||||
reader: &zlmdb::RoTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<Option<fst::Set>>
|
||||
{
|
||||
) -> ZResult<Option<fst::Set>> {
|
||||
let document_id = BEU64::new(document_id.0);
|
||||
match self.docs_words.get(reader, &document_id)? {
|
||||
Some(bytes) => {
|
||||
@ -45,7 +42,7 @@ impl DocsWords {
|
||||
let bytes = Arc::from(bytes);
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use zlmdb::types::{OwnedType, ByteSlice};
|
||||
use zlmdb::types::{ByteSlice, OwnedType};
|
||||
use zlmdb::Result as ZResult;
|
||||
|
||||
use crate::DocumentId;
|
||||
use super::DocumentAttrKey;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocumentsFields {
|
||||
@ -17,8 +17,7 @@ impl DocumentsFields {
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
value: &[u8],
|
||||
) -> ZResult<()>
|
||||
{
|
||||
) -> ZResult<()> {
|
||||
let key = DocumentAttrKey::new(document_id, attribute);
|
||||
self.documents_fields.put(writer, &key, value)
|
||||
}
|
||||
@ -27,8 +26,7 @@ impl DocumentsFields {
|
||||
&self,
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<usize>
|
||||
{
|
||||
) -> ZResult<usize> {
|
||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||
self.documents_fields.delete_range(writer, start..=end)
|
||||
@ -39,8 +37,7 @@ impl DocumentsFields {
|
||||
reader: &'txn zlmdb::RoTxn,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
) -> ZResult<Option<&'txn [u8]>>
|
||||
{
|
||||
) -> ZResult<Option<&'txn [u8]>> {
|
||||
let key = DocumentAttrKey::new(document_id, attribute);
|
||||
self.documents_fields.get(reader, &key)
|
||||
}
|
||||
@ -49,8 +46,7 @@ impl DocumentsFields {
|
||||
&self,
|
||||
reader: &'txn zlmdb::RoTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<DocumentFieldsIter<'txn>>
|
||||
{
|
||||
) -> ZResult<DocumentFieldsIter<'txn>> {
|
||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||
let iter = self.documents_fields.range(reader, start..=end)?;
|
||||
@ -70,7 +66,7 @@ impl<'txn> Iterator for DocumentFieldsIter<'txn> {
|
||||
Some(Ok((key, bytes))) => {
|
||||
let attr = SchemaAttr(key.attr.get());
|
||||
Some(Ok((attr, bytes)))
|
||||
},
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e.into())),
|
||||
None => None,
|
||||
}
|
||||
|
@ -1,8 +1,8 @@
|
||||
use super::DocumentAttrKey;
|
||||
use crate::DocumentId;
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use zlmdb::types::OwnedType;
|
||||
use zlmdb::Result as ZResult;
|
||||
use crate::DocumentId;
|
||||
use super::DocumentAttrKey;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocumentsFieldsCounts {
|
||||
@ -16,8 +16,7 @@ impl DocumentsFieldsCounts {
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
value: u64,
|
||||
) -> ZResult<()>
|
||||
{
|
||||
) -> ZResult<()> {
|
||||
let key = DocumentAttrKey::new(document_id, attribute);
|
||||
self.documents_fields_counts.put(writer, &key, &value)
|
||||
}
|
||||
@ -26,11 +25,11 @@ impl DocumentsFieldsCounts {
|
||||
&self,
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<usize>
|
||||
{
|
||||
) -> ZResult<usize> {
|
||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||
self.documents_fields_counts.delete_range(writer, start..=end)
|
||||
self.documents_fields_counts
|
||||
.delete_range(writer, start..=end)
|
||||
}
|
||||
|
||||
pub fn document_field_count(
|
||||
@ -38,8 +37,7 @@ impl DocumentsFieldsCounts {
|
||||
reader: &zlmdb::RoTxn,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
) -> ZResult<Option<u64>>
|
||||
{
|
||||
) -> ZResult<Option<u64>> {
|
||||
let key = DocumentAttrKey::new(document_id, attribute);
|
||||
match self.documents_fields_counts.get(reader, &key)? {
|
||||
Some(count) => Ok(Some(count)),
|
||||
@ -51,8 +49,7 @@ impl DocumentsFieldsCounts {
|
||||
&self,
|
||||
reader: &'txn zlmdb::RoTxn,
|
||||
document_id: DocumentId,
|
||||
) -> ZResult<DocumentFieldsCountsIter<'txn>>
|
||||
{
|
||||
) -> ZResult<DocumentFieldsCountsIter<'txn>> {
|
||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||
let iter = self.documents_fields_counts.range(reader, start..=end)?;
|
||||
@ -62,17 +59,18 @@ impl DocumentsFieldsCounts {
|
||||
pub fn documents_ids<'txn>(
|
||||
&self,
|
||||
reader: &'txn zlmdb::RoTxn,
|
||||
) -> ZResult<DocumentsIdsIter<'txn>>
|
||||
{
|
||||
) -> ZResult<DocumentsIdsIter<'txn>> {
|
||||
let iter = self.documents_fields_counts.iter(reader)?;
|
||||
Ok(DocumentsIdsIter { last_seen_id: None, iter })
|
||||
Ok(DocumentsIdsIter {
|
||||
last_seen_id: None,
|
||||
iter,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn all_documents_fields_counts<'txn>(
|
||||
&self,
|
||||
reader: &'txn zlmdb::RoTxn,
|
||||
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>>
|
||||
{
|
||||
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
|
||||
let iter = self.documents_fields_counts.iter(reader)?;
|
||||
Ok(AllDocumentsFieldsCountsIter { iter })
|
||||
}
|
||||
@ -90,7 +88,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
|
||||
Some(Ok((key, count))) => {
|
||||
let attr = SchemaAttr(key.attr.get());
|
||||
Some(Ok((attr, count)))
|
||||
},
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e.into())),
|
||||
None => None,
|
||||
}
|
||||
@ -112,9 +110,9 @@ impl Iterator for DocumentsIdsIter<'_> {
|
||||
let document_id = DocumentId(key.docid.get());
|
||||
if Some(document_id) != self.last_seen_id {
|
||||
self.last_seen_id = Some(document_id);
|
||||
return Some(Ok(document_id))
|
||||
return Some(Ok(document_id));
|
||||
}
|
||||
},
|
||||
}
|
||||
Err(e) => return Some(Err(e.into())),
|
||||
}
|
||||
}
|
||||
@ -135,7 +133,7 @@ impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> {
|
||||
let docid = DocumentId(key.docid.get());
|
||||
let attr = SchemaAttr(key.attr.get());
|
||||
Some(Ok((docid, attr, count)))
|
||||
},
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e.into())),
|
||||
None => None,
|
||||
}
|
||||
|
@ -1,15 +1,15 @@
|
||||
use std::sync::Arc;
|
||||
use meilidb_schema::Schema;
|
||||
use zlmdb::types::{Str, OwnedType, ByteSlice, Serde};
|
||||
use zlmdb::Result as ZResult;
|
||||
use crate::RankedMap;
|
||||
use meilidb_schema::Schema;
|
||||
use std::sync::Arc;
|
||||
use zlmdb::types::{ByteSlice, OwnedType, Serde, Str};
|
||||
use zlmdb::Result as ZResult;
|
||||
|
||||
const CUSTOMS_KEY: &str = "customs-key";
|
||||
const CUSTOMS_KEY: &str = "customs-key";
|
||||
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
|
||||
const RANKED_MAP_KEY: &str = "ranked-map";
|
||||
const SCHEMA_KEY: &str = "schema";
|
||||
const SYNONYMS_KEY: &str = "synonyms";
|
||||
const WORDS_KEY: &str = "words";
|
||||
const RANKED_MAP_KEY: &str = "ranked-map";
|
||||
const SCHEMA_KEY: &str = "schema";
|
||||
const SYNONYMS_KEY: &str = "synonyms";
|
||||
const WORDS_KEY: &str = "words";
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Main {
|
||||
@ -29,13 +29,14 @@ impl Main {
|
||||
let bytes = Arc::from(bytes);
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_schema(&self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> {
|
||||
self.main.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
|
||||
self.main
|
||||
.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
|
||||
}
|
||||
|
||||
pub fn schema(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> {
|
||||
@ -43,11 +44,13 @@ impl Main {
|
||||
}
|
||||
|
||||
pub fn put_ranked_map(&self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
|
||||
self.main.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
|
||||
self.main
|
||||
.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
|
||||
}
|
||||
|
||||
pub fn ranked_map(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> {
|
||||
self.main.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
|
||||
self.main
|
||||
.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
|
||||
}
|
||||
|
||||
pub fn put_synonyms_fst(&self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
|
||||
@ -62,28 +65,34 @@ impl Main {
|
||||
let bytes = Arc::from(bytes);
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_number_of_documents<F>(&self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
|
||||
where F: Fn(u64) -> u64,
|
||||
where
|
||||
F: Fn(u64) -> u64,
|
||||
{
|
||||
let new = self.number_of_documents(writer).map(f)?;
|
||||
self.main.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
|
||||
self.main
|
||||
.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
|
||||
Ok(new)
|
||||
}
|
||||
|
||||
pub fn number_of_documents(&self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
|
||||
match self.main.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)? {
|
||||
match self
|
||||
.main
|
||||
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
|
||||
{
|
||||
Some(value) => Ok(value),
|
||||
None => Ok(0),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put_customs(&self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
|
||||
self.main.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
|
||||
self.main
|
||||
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
|
||||
}
|
||||
|
||||
pub fn customs<'txn>(&self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {
|
||||
|
@ -8,8 +8,10 @@ mod updates;
|
||||
mod updates_results;
|
||||
|
||||
pub use self::docs_words::DocsWords;
|
||||
pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter};
|
||||
pub use self::documents_fields_counts::{DocumentsFieldsCounts, DocumentFieldsCountsIter, DocumentsIdsIter};
|
||||
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
|
||||
pub use self::documents_fields_counts::{
|
||||
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
|
||||
};
|
||||
pub use self::main::Main;
|
||||
pub use self::postings_lists::PostingsLists;
|
||||
pub use self::synonyms::Synonyms;
|
||||
@ -25,19 +27,24 @@ use zlmdb::Result as ZResult;
|
||||
|
||||
use crate::criterion::Criteria;
|
||||
use crate::serde::Deserializer;
|
||||
use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
|
||||
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
|
||||
|
||||
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
||||
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
#[derive(AsBytes, FromBytes)]
|
||||
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
||||
#[repr(C)]
|
||||
pub struct DocumentAttrKey { docid: BEU64, attr: BEU16 }
|
||||
pub struct DocumentAttrKey {
|
||||
docid: BEU64,
|
||||
attr: BEU16,
|
||||
}
|
||||
|
||||
impl DocumentAttrKey {
|
||||
fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey {
|
||||
DocumentAttrKey { docid: BEU64::new(docid.0), attr: BEU16::new(attr.0) }
|
||||
DocumentAttrKey {
|
||||
docid: BEU64::new(docid.0),
|
||||
attr: BEU16::new(attr.0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -93,13 +100,15 @@ impl Index {
|
||||
reader: &zlmdb::RoTxn,
|
||||
attributes: Option<&HashSet<&str>>,
|
||||
document_id: DocumentId,
|
||||
) -> MResult<Option<T>>
|
||||
{
|
||||
) -> MResult<Option<T>> {
|
||||
let schema = self.main.schema(reader)?;
|
||||
let schema = schema.ok_or(Error::SchemaMissing)?;
|
||||
|
||||
let attributes = match attributes {
|
||||
Some(attributes) => attributes.into_iter().map(|name| schema.attribute(name)).collect(),
|
||||
Some(attributes) => attributes
|
||||
.into_iter()
|
||||
.map(|name| schema.attribute(name))
|
||||
.collect(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
@ -121,9 +130,10 @@ impl Index {
|
||||
reader: &zlmdb::RoTxn,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
) -> MResult<Option<T>>
|
||||
{
|
||||
let bytes = self.documents_fields.document_attribute(reader, document_id, attribute)?;
|
||||
) -> MResult<Option<T>> {
|
||||
let bytes = self
|
||||
.documents_fields
|
||||
.document_attribute(reader, document_id, attribute)?;
|
||||
match bytes {
|
||||
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
|
||||
None => Ok(None),
|
||||
@ -183,14 +193,8 @@ impl Index {
|
||||
&self,
|
||||
reader: &zlmdb::RoTxn,
|
||||
update_id: u64,
|
||||
) -> MResult<update::UpdateStatus>
|
||||
{
|
||||
update::update_status(
|
||||
reader,
|
||||
self.updates,
|
||||
self.updates_results,
|
||||
update_id,
|
||||
)
|
||||
) -> MResult<update::UpdateStatus> {
|
||||
update::update_status(reader, self.updates, self.updates_results, update_id)
|
||||
}
|
||||
|
||||
pub fn query_builder(&self) -> QueryBuilder {
|
||||
@ -205,8 +209,7 @@ impl Index {
|
||||
pub fn query_builder_with_criteria<'c, 'f, 'd>(
|
||||
&self,
|
||||
criteria: Criteria<'c>,
|
||||
) -> QueryBuilder<'c, 'f, 'd>
|
||||
{
|
||||
) -> QueryBuilder<'c, 'f, 'd> {
|
||||
QueryBuilder::with_criteria(
|
||||
self.main,
|
||||
self.postings_lists,
|
||||
@ -221,8 +224,7 @@ pub fn create(
|
||||
env: &zlmdb::Env,
|
||||
name: &str,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> MResult<Index>
|
||||
{
|
||||
) -> MResult<Index> {
|
||||
// create all the store names
|
||||
let main_name = main_name(name);
|
||||
let postings_lists_name = postings_lists_name(name);
|
||||
@ -247,7 +249,9 @@ pub fn create(
|
||||
main: Main { main },
|
||||
postings_lists: PostingsLists { postings_lists },
|
||||
documents_fields: DocumentsFields { documents_fields },
|
||||
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
||||
documents_fields_counts: DocumentsFieldsCounts {
|
||||
documents_fields_counts,
|
||||
},
|
||||
synonyms: Synonyms { synonyms },
|
||||
docs_words: DocsWords { docs_words },
|
||||
updates: Updates { updates },
|
||||
@ -260,8 +264,7 @@ pub fn open(
|
||||
env: &zlmdb::Env,
|
||||
name: &str,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> MResult<Option<Index>>
|
||||
{
|
||||
) -> MResult<Option<Index>> {
|
||||
// create all the store names
|
||||
let main_name = main_name(name);
|
||||
let postings_lists_name = postings_lists_name(name);
|
||||
@ -310,7 +313,9 @@ pub fn open(
|
||||
main: Main { main },
|
||||
postings_lists: PostingsLists { postings_lists },
|
||||
documents_fields: DocumentsFields { documents_fields },
|
||||
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
||||
documents_fields_counts: DocumentsFieldsCounts {
|
||||
documents_fields_counts,
|
||||
},
|
||||
synonyms: Synonyms { synonyms },
|
||||
docs_words: DocsWords { docs_words },
|
||||
updates: Updates { updates },
|
||||
|
@ -1,8 +1,8 @@
|
||||
use std::borrow::Cow;
|
||||
use crate::DocIndex;
|
||||
use sdset::{Set, SetBuf};
|
||||
use std::borrow::Cow;
|
||||
use zlmdb::types::{ByteSlice, CowSlice};
|
||||
use zlmdb::Result as ZResult;
|
||||
use crate::DocIndex;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct PostingsLists {
|
||||
@ -15,8 +15,7 @@ impl PostingsLists {
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
word: &[u8],
|
||||
words_indexes: &Set<DocIndex>,
|
||||
) -> ZResult<()>
|
||||
{
|
||||
) -> ZResult<()> {
|
||||
self.postings_lists.put(writer, word, words_indexes)
|
||||
}
|
||||
|
||||
@ -28,8 +27,7 @@ impl PostingsLists {
|
||||
&self,
|
||||
reader: &'txn zlmdb::RoTxn,
|
||||
word: &[u8],
|
||||
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>>
|
||||
{
|
||||
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
|
||||
match self.postings_lists.get(reader, word)? {
|
||||
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
|
||||
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
|
||||
|
@ -13,8 +13,7 @@ impl Synonyms {
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
word: &[u8],
|
||||
synonyms: &fst::Set,
|
||||
) -> ZResult<()>
|
||||
{
|
||||
) -> ZResult<()> {
|
||||
let bytes = synonyms.as_fst().as_bytes();
|
||||
self.synonyms.put(writer, word, bytes)
|
||||
}
|
||||
@ -30,7 +29,7 @@ impl Synonyms {
|
||||
let bytes = Arc::from(bytes);
|
||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(Some(fst::Set::from(fst)))
|
||||
},
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
@ -1,13 +1,16 @@
|
||||
use super::BEU64;
|
||||
use crate::update::Update;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::borrow::Cow;
|
||||
use zlmdb::types::OwnedType;
|
||||
use zlmdb::{Result as ZResult, BytesEncode, BytesDecode};
|
||||
use serde::{Serialize, Deserialize};
|
||||
use crate::update::Update;
|
||||
use super::BEU64;
|
||||
use zlmdb::{BytesDecode, BytesEncode, Result as ZResult};
|
||||
|
||||
pub struct SerdeJson<T>(std::marker::PhantomData<T>);
|
||||
|
||||
impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
|
||||
impl<T> BytesEncode for SerdeJson<T>
|
||||
where
|
||||
T: Serialize,
|
||||
{
|
||||
type EItem = T;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
@ -15,7 +18,10 @@ impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T> where T: Deserialize<'a> + Clone {
|
||||
impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T>
|
||||
where
|
||||
T: Deserialize<'a> + Clone,
|
||||
{
|
||||
type DItem = T;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
@ -56,8 +62,7 @@ impl Updates {
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
update_id: u64,
|
||||
update: &Update,
|
||||
) -> ZResult<()>
|
||||
{
|
||||
) -> ZResult<()> {
|
||||
// TODO prefer using serde_json?
|
||||
let update_id = BEU64::new(update_id);
|
||||
self.updates.put(writer, &update_id, update)
|
||||
@ -69,8 +74,8 @@ impl Updates {
|
||||
let key = BEU64::new(update_id);
|
||||
self.updates.delete(writer, &key)?;
|
||||
Ok(Some((update_id, update)))
|
||||
},
|
||||
None => Ok(None)
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use super::BEU64;
|
||||
use crate::update::UpdateResult;
|
||||
use zlmdb::types::{OwnedType, Serde};
|
||||
use zlmdb::Result as ZResult;
|
||||
use crate::update::UpdateResult;
|
||||
use super::BEU64;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct UpdatesResults {
|
||||
@ -21,8 +21,7 @@ impl UpdatesResults {
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
update_id: u64,
|
||||
update_result: &UpdateResult,
|
||||
) -> ZResult<()>
|
||||
{
|
||||
) -> ZResult<()> {
|
||||
let update_id = BEU64::new(update_id);
|
||||
self.updates_results.put(writer, &update_id, update_result)
|
||||
}
|
||||
@ -31,8 +30,7 @@ impl UpdatesResults {
|
||||
&self,
|
||||
reader: &zlmdb::RoTxn,
|
||||
update_id: u64,
|
||||
) -> ZResult<Option<UpdateResult>>
|
||||
{
|
||||
) -> ZResult<Option<UpdateResult>> {
|
||||
let update_id = BEU64::new(update_id);
|
||||
self.updates_results.get(reader, &update_id)
|
||||
}
|
||||
|
@ -1,13 +1,12 @@
|
||||
use zlmdb::Result as ZResult;
|
||||
use crate::update::{Update, next_update_id};
|
||||
use crate::store;
|
||||
use crate::update::{next_update_id, Update};
|
||||
use zlmdb::Result as ZResult;
|
||||
|
||||
pub fn apply_customs_update(
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
main_store: store::Main,
|
||||
customs: &[u8],
|
||||
) -> ZResult<()>
|
||||
{
|
||||
) -> ZResult<()> {
|
||||
main_store.put_customs(writer, customs)
|
||||
}
|
||||
|
||||
@ -16,8 +15,7 @@ pub fn push_customs_update(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
customs: Vec<u8>,
|
||||
) -> ZResult<u64>
|
||||
{
|
||||
) -> ZResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::Customs(customs);
|
||||
|
@ -1,14 +1,14 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use sdset::{SetOperation, duo::Union};
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
use sdset::{duo::Union, SetOperation};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::serde::{extract_document_id, Serializer, RamDocumentStore};
|
||||
use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
|
||||
use crate::store;
|
||||
use crate::update::{Update, next_update_id, apply_documents_deletion};
|
||||
use crate::{MResult, Error, RankedMap};
|
||||
use crate::update::{apply_documents_deletion, next_update_id, Update};
|
||||
use crate::{Error, MResult, RankedMap};
|
||||
|
||||
pub struct DocumentsAddition<D> {
|
||||
updates_store: store::Updates,
|
||||
@ -22,8 +22,7 @@ impl<D> DocumentsAddition<D> {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> DocumentsAddition<D>
|
||||
{
|
||||
) -> DocumentsAddition<D> {
|
||||
DocumentsAddition {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
@ -37,7 +36,8 @@ impl<D> DocumentsAddition<D> {
|
||||
}
|
||||
|
||||
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64>
|
||||
where D: serde::Serialize
|
||||
where
|
||||
D: serde::Serialize,
|
||||
{
|
||||
let _ = self.updates_notifier.send(());
|
||||
let update_id = push_documents_addition(
|
||||
@ -51,7 +51,7 @@ impl<D> DocumentsAddition<D> {
|
||||
}
|
||||
|
||||
impl<D> Extend<D> for DocumentsAddition<D> {
|
||||
fn extend<T: IntoIterator<Item=D>>(&mut self, iter: T) {
|
||||
fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
|
||||
self.documents.extend(iter)
|
||||
}
|
||||
}
|
||||
@ -61,8 +61,7 @@ pub fn push_documents_addition<D: serde::Serialize>(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
addition: Vec<D>,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
) -> MResult<u64> {
|
||||
let mut values = Vec::with_capacity(addition.len());
|
||||
for add in addition {
|
||||
let vec = serde_json::to_vec(&add)?;
|
||||
@ -87,8 +86,7 @@ pub fn apply_documents_addition(
|
||||
docs_words_store: store::DocsWords,
|
||||
mut ranked_map: RankedMap,
|
||||
addition: Vec<serde_json::Value>,
|
||||
) -> MResult<()>
|
||||
{
|
||||
) -> MResult<()> {
|
||||
let mut document_ids = HashSet::new();
|
||||
let mut document_store = RamDocumentStore::new();
|
||||
let mut document_fields_counts = HashMap::new();
|
||||
@ -182,7 +180,7 @@ pub fn apply_documents_addition(
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
}
|
||||
None => delta_words,
|
||||
};
|
||||
|
||||
|
@ -1,13 +1,13 @@
|
||||
use std::collections::{HashMap, HashSet, BTreeSet};
|
||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||
|
||||
use fst::{SetBuilder, Streamer};
|
||||
use meilidb_schema::Schema;
|
||||
use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
|
||||
use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
|
||||
|
||||
use crate::{DocumentId, RankedMap, MResult, Error};
|
||||
use crate::serde::extract_document_id;
|
||||
use crate::update::{Update, next_update_id};
|
||||
use crate::store;
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{DocumentId, Error, MResult, RankedMap};
|
||||
|
||||
pub struct DocumentsDeletion {
|
||||
updates_store: store::Updates,
|
||||
@ -21,8 +21,7 @@ impl DocumentsDeletion {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> DocumentsDeletion
|
||||
{
|
||||
) -> DocumentsDeletion {
|
||||
DocumentsDeletion {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
@ -36,7 +35,8 @@ impl DocumentsDeletion {
|
||||
}
|
||||
|
||||
pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()>
|
||||
where D: serde::Serialize,
|
||||
where
|
||||
D: serde::Serialize,
|
||||
{
|
||||
let identifier = schema.identifier_name();
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
@ -62,7 +62,7 @@ impl DocumentsDeletion {
|
||||
}
|
||||
|
||||
impl Extend<DocumentId> for DocumentsDeletion {
|
||||
fn extend<T: IntoIterator<Item=DocumentId>>(&mut self, iter: T) {
|
||||
fn extend<T: IntoIterator<Item = DocumentId>>(&mut self, iter: T) {
|
||||
self.documents.extend(iter)
|
||||
}
|
||||
}
|
||||
@ -72,8 +72,7 @@ pub fn push_documents_deletion(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
deletion: Vec<DocumentId>,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::DocumentsDeletion(deletion);
|
||||
@ -91,8 +90,7 @@ pub fn apply_documents_deletion(
|
||||
docs_words_store: store::DocsWords,
|
||||
mut ranked_map: RankedMap,
|
||||
deletion: Vec<DocumentId>,
|
||||
) -> MResult<()>
|
||||
{
|
||||
) -> MResult<()> {
|
||||
let idset = SetBuf::from_dirty(deletion);
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
@ -101,10 +99,17 @@ pub fn apply_documents_deletion(
|
||||
};
|
||||
|
||||
// collect the ranked attributes according to the schema
|
||||
let ranked_attrs: Vec<_> = schema.iter()
|
||||
.filter_map(|(_, attr, prop)| {
|
||||
if prop.is_ranked() { Some(attr) } else { None }
|
||||
})
|
||||
let ranked_attrs: Vec<_> = schema
|
||||
.iter()
|
||||
.filter_map(
|
||||
|(_, attr, prop)| {
|
||||
if prop.is_ranked() {
|
||||
Some(attr)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
)
|
||||
.collect();
|
||||
|
||||
let mut words_document_ids = HashMap::new();
|
||||
@ -118,7 +123,10 @@ pub fn apply_documents_deletion(
|
||||
let mut stream = words.stream();
|
||||
while let Some(word) = stream.next() {
|
||||
let word = word.to_vec();
|
||||
words_document_ids.entry(word).or_insert_with(Vec::new).push(id);
|
||||
words_document_ids
|
||||
.entry(word)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -167,7 +175,7 @@ pub fn apply_documents_deletion(
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
}
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
|
@ -6,21 +6,21 @@ mod synonyms_addition;
|
||||
mod synonyms_deletion;
|
||||
|
||||
pub use self::customs_update::{apply_customs_update, push_customs_update};
|
||||
pub use self::documents_addition::{DocumentsAddition, apply_documents_addition};
|
||||
pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion};
|
||||
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
|
||||
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
||||
pub use self::schema_update::{apply_schema_update, push_schema_update};
|
||||
pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition};
|
||||
pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion};
|
||||
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
|
||||
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
use std::collections::BTreeMap;
|
||||
use std::cmp;
|
||||
use std::collections::BTreeMap;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use log::debug;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zlmdb::Result as ZResult;
|
||||
|
||||
use crate::{store, MResult, DocumentId, RankedMap};
|
||||
use crate::{store, DocumentId, MResult, RankedMap};
|
||||
use meilidb_schema::Schema;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@ -68,8 +68,7 @@ pub fn update_status(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
update_id: u64,
|
||||
) -> MResult<UpdateStatus>
|
||||
{
|
||||
) -> MResult<UpdateStatus> {
|
||||
match updates_results_store.update_result(reader, update_id)? {
|
||||
Some(result) => Ok(UpdateStatus::Processed(result)),
|
||||
None => {
|
||||
@ -86,8 +85,7 @@ pub fn next_update_id(
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
) -> ZResult<u64>
|
||||
{
|
||||
) -> ZResult<u64> {
|
||||
let last_update_id = updates_store.last_update_id(writer)?;
|
||||
let last_update_id = last_update_id.map(|(n, _)| n);
|
||||
|
||||
@ -100,7 +98,10 @@ pub fn next_update_id(
|
||||
Ok(new_update_id)
|
||||
}
|
||||
|
||||
pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
|
||||
pub fn update_task(
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
index: store::Index,
|
||||
) -> MResult<Option<UpdateResult>> {
|
||||
let (update_id, update) = match index.updates.pop_front(writer)? {
|
||||
Some(value) => value,
|
||||
None => return Ok(None),
|
||||
@ -112,11 +113,13 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
||||
Update::Schema(schema) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::Schema { schema: schema.clone() };
|
||||
let update_type = UpdateType::Schema {
|
||||
schema: schema.clone(),
|
||||
};
|
||||
let result = apply_schema_update(writer, index.main, &schema);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
}
|
||||
Update::Customs(customs) => {
|
||||
let start = Instant::now();
|
||||
|
||||
@ -133,7 +136,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let update_type = UpdateType::DocumentsAddition { number: documents.len() };
|
||||
let update_type = UpdateType::DocumentsAddition {
|
||||
number: documents.len(),
|
||||
};
|
||||
|
||||
let result = apply_documents_addition(
|
||||
writer,
|
||||
@ -147,7 +152,7 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
}
|
||||
Update::DocumentsDeletion(documents) => {
|
||||
let start = Instant::now();
|
||||
|
||||
@ -156,7 +161,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let update_type = UpdateType::DocumentsDeletion { number: documents.len() };
|
||||
let update_type = UpdateType::DocumentsDeletion {
|
||||
number: documents.len(),
|
||||
};
|
||||
|
||||
let result = apply_documents_deletion(
|
||||
writer,
|
||||
@ -170,38 +177,35 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
}
|
||||
Update::SynonymsAddition(synonyms) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::SynonymsAddition { number: synonyms.len() };
|
||||
let update_type = UpdateType::SynonymsAddition {
|
||||
number: synonyms.len(),
|
||||
};
|
||||
|
||||
let result = apply_synonyms_addition(
|
||||
writer,
|
||||
index.main,
|
||||
index.synonyms,
|
||||
synonyms,
|
||||
);
|
||||
let result = apply_synonyms_addition(writer, index.main, index.synonyms, synonyms);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
}
|
||||
Update::SynonymsDeletion(synonyms) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() };
|
||||
let update_type = UpdateType::SynonymsDeletion {
|
||||
number: synonyms.len(),
|
||||
};
|
||||
|
||||
let result = apply_synonyms_deletion(
|
||||
writer,
|
||||
index.main,
|
||||
index.synonyms,
|
||||
synonyms,
|
||||
);
|
||||
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
debug!("Processed update number {} {:?} {:?}", update_id, update_type, result);
|
||||
debug!(
|
||||
"Processed update number {} {:?} {:?}",
|
||||
update_id, update_type, result
|
||||
);
|
||||
|
||||
let detailed_duration = DetailedDuration { main: duration };
|
||||
let status = UpdateResult {
|
||||
@ -211,7 +215,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
||||
detailed_duration,
|
||||
};
|
||||
|
||||
index.updates_results.put_update_result(writer, update_id, &status)?;
|
||||
index
|
||||
.updates_results
|
||||
.put_update_result(writer, update_id, &status)?;
|
||||
|
||||
Ok(Some(status))
|
||||
}
|
||||
|
@ -1,18 +1,19 @@
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{error::UnsupportedOperation, store, MResult};
|
||||
use meilidb_schema::Schema;
|
||||
use crate::{store, error::UnsupportedOperation, MResult};
|
||||
use crate::update::{Update, next_update_id};
|
||||
|
||||
pub fn apply_schema_update(
|
||||
writer: &mut zlmdb::RwTxn,
|
||||
main_store: store::Main,
|
||||
new_schema: &Schema,
|
||||
) -> MResult<()>
|
||||
{
|
||||
) -> MResult<()> {
|
||||
if let Some(_) = main_store.schema(writer)? {
|
||||
return Err(UnsupportedOperation::SchemaAlreadyExists.into())
|
||||
return Err(UnsupportedOperation::SchemaAlreadyExists.into());
|
||||
}
|
||||
|
||||
main_store.put_schema(writer, new_schema).map_err(Into::into)
|
||||
main_store
|
||||
.put_schema(writer, new_schema)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
pub fn push_schema_update(
|
||||
@ -20,8 +21,7 @@ pub fn push_schema_update(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
schema: Schema,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::Schema(schema);
|
||||
|
@ -1,10 +1,10 @@
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::update::{Update, next_update_id};
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult};
|
||||
|
||||
pub struct SynonymsAddition {
|
||||
@ -19,8 +19,7 @@ impl SynonymsAddition {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> SynonymsAddition
|
||||
{
|
||||
) -> SynonymsAddition {
|
||||
SynonymsAddition {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
@ -30,13 +29,17 @@ impl SynonymsAddition {
|
||||
}
|
||||
|
||||
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: IntoIterator<Item=T>,
|
||||
where
|
||||
S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: IntoIterator<Item = T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
|
||||
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
|
||||
self.synonyms
|
||||
.entry(synonym)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend(alternatives);
|
||||
}
|
||||
|
||||
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
|
||||
@ -56,8 +59,7 @@ pub fn push_synonyms_addition(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
addition: BTreeMap<String, Vec<String>>,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::SynonymsAddition(addition);
|
||||
@ -71,8 +73,7 @@ pub fn apply_synonyms_addition(
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
addition: BTreeMap<String, Vec<String>>,
|
||||
) -> MResult<()>
|
||||
{
|
||||
) -> MResult<()> {
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
|
||||
for (word, alternatives) in addition {
|
||||
@ -107,7 +108,7 @@ pub fn apply_synonyms_addition(
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
}
|
||||
None => delta_synonyms,
|
||||
};
|
||||
|
||||
|
@ -1,11 +1,11 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::automaton::normalize_str;
|
||||
use crate::update::{Update, next_update_id};
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult};
|
||||
|
||||
pub struct SynonymsDeletion {
|
||||
@ -20,8 +20,7 @@ impl SynonymsDeletion {
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
updates_notifier: crossbeam_channel::Sender<()>,
|
||||
) -> SynonymsDeletion
|
||||
{
|
||||
) -> SynonymsDeletion {
|
||||
SynonymsDeletion {
|
||||
updates_store,
|
||||
updates_results_store,
|
||||
@ -36,9 +35,10 @@ impl SynonymsDeletion {
|
||||
}
|
||||
|
||||
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: Iterator<Item=T>,
|
||||
where
|
||||
S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
I: Iterator<Item = T>,
|
||||
{
|
||||
let synonym = normalize_str(synonym.as_ref());
|
||||
let value = self.synonyms.entry(synonym).or_insert(None);
|
||||
@ -66,8 +66,7 @@ pub fn push_synonyms_deletion(
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
deletion: BTreeMap<String, Option<Vec<String>>>,
|
||||
) -> MResult<u64>
|
||||
{
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
|
||||
let update = Update::SynonymsDeletion(deletion);
|
||||
@ -81,8 +80,7 @@ pub fn apply_synonyms_deletion(
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
deletion: BTreeMap<String, Option<Vec<String>>>,
|
||||
) -> MResult<()>
|
||||
{
|
||||
) -> MResult<()> {
|
||||
let mut delete_whole_synonym_builder = SetBuilder::memory();
|
||||
|
||||
for (synonym, alternatives) in deletion {
|
||||
@ -98,9 +96,7 @@ pub fn apply_synonyms_deletion(
|
||||
let alternatives = SetBuf::from_dirty(alternatives);
|
||||
let mut builder = SetBuilder::memory();
|
||||
builder.extend_iter(alternatives).unwrap();
|
||||
builder.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
builder.into_inner().and_then(fst::Set::from_bytes).unwrap()
|
||||
};
|
||||
|
||||
let op = OpBuilder::new()
|
||||
@ -124,7 +120,7 @@ pub fn apply_synonyms_deletion(
|
||||
} else {
|
||||
synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?;
|
||||
}
|
||||
},
|
||||
}
|
||||
None => {
|
||||
delete_whole_synonym_builder.insert(&synonym).unwrap();
|
||||
synonyms_store.del_synonyms(writer, synonym.as_bytes())?;
|
||||
@ -150,7 +146,7 @@ pub fn apply_synonyms_deletion(
|
||||
.into_inner()
|
||||
.and_then(fst::Set::from_bytes)
|
||||
.unwrap()
|
||||
},
|
||||
}
|
||||
None => fst::Set::default(),
|
||||
};
|
||||
|
||||
|
@ -1,14 +1,26 @@
|
||||
use std::collections::{HashMap, BTreeMap};
|
||||
use std::{fmt, u16};
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::BitOr;
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, u16};
|
||||
|
||||
use serde::{Serialize, Deserialize};
|
||||
use indexmap::IndexMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
|
||||
pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
|
||||
pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
|
||||
pub const DISPLAYED: SchemaProps = SchemaProps {
|
||||
displayed: true,
|
||||
indexed: false,
|
||||
ranked: false,
|
||||
};
|
||||
pub const INDEXED: SchemaProps = SchemaProps {
|
||||
displayed: false,
|
||||
indexed: true,
|
||||
ranked: false,
|
||||
};
|
||||
pub const RANKED: SchemaProps = SchemaProps {
|
||||
displayed: false,
|
||||
indexed: false,
|
||||
ranked: true,
|
||||
};
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct SchemaProps {
|
||||
@ -80,7 +92,13 @@ impl SchemaBuilder {
|
||||
}
|
||||
|
||||
let identifier = self.identifier;
|
||||
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
|
||||
Schema {
|
||||
inner: Arc::new(InnerSchema {
|
||||
identifier,
|
||||
attrs,
|
||||
props,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -100,7 +118,10 @@ impl Schema {
|
||||
fn to_builder(&self) -> SchemaBuilder {
|
||||
let identifier = self.inner.identifier.clone();
|
||||
let attributes = self.attributes_ordered();
|
||||
SchemaBuilder { identifier, attributes }
|
||||
SchemaBuilder {
|
||||
identifier,
|
||||
attributes,
|
||||
}
|
||||
}
|
||||
|
||||
fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
|
||||
@ -136,18 +157,18 @@ impl Schema {
|
||||
name
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
|
||||
self.inner.props.iter()
|
||||
.map(move |(name, prop)| {
|
||||
let attr = self.inner.attrs.get(name).unwrap();
|
||||
(name.as_str(), *attr, *prop)
|
||||
})
|
||||
pub fn iter<'a>(&'a self) -> impl Iterator<Item = (&str, SchemaAttr, SchemaProps)> + 'a {
|
||||
self.inner.props.iter().map(move |(name, prop)| {
|
||||
let attr = self.inner.attrs.get(name).unwrap();
|
||||
(name.as_str(), *attr, *prop)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Schema {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: serde::ser::Serializer,
|
||||
where
|
||||
S: serde::ser::Serializer,
|
||||
{
|
||||
self.to_builder().serialize(serializer)
|
||||
}
|
||||
@ -155,15 +176,15 @@ impl Serialize for Schema {
|
||||
|
||||
impl<'de> Deserialize<'de> for Schema {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: serde::de::Deserializer<'de>,
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let builder = SchemaBuilder::deserialize(deserializer)?;
|
||||
Ok(builder.build())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct SchemaAttr(pub u16);
|
||||
|
||||
impl SchemaAttr {
|
||||
|
@ -1,17 +1,17 @@
|
||||
use std::iter::Peekable;
|
||||
use slice_group_by::StrGroupBy;
|
||||
use self::SeparatorCategory::*;
|
||||
use slice_group_by::StrGroupBy;
|
||||
use std::iter::Peekable;
|
||||
|
||||
pub fn is_cjk(c: char) -> bool {
|
||||
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
|
||||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
|
||||
(c >= '\u{3040}' && c <= '\u{309f}') ||
|
||||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
|
||||
(c >= '\u{3100}' && c <= '\u{312f}') ||
|
||||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
|
||||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
|
||||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
|
||||
(c >= '\u{f900}' && c <= '\u{faff}')
|
||||
(c >= '\u{2e80}' && c <= '\u{2eff}')
|
||||
|| (c >= '\u{2f00}' && c <= '\u{2fdf}')
|
||||
|| (c >= '\u{3040}' && c <= '\u{309f}')
|
||||
|| (c >= '\u{30a0}' && c <= '\u{30ff}')
|
||||
|| (c >= '\u{3100}' && c <= '\u{312f}')
|
||||
|| (c >= '\u{3200}' && c <= '\u{32ff}')
|
||||
|| (c >= '\u{3400}' && c <= '\u{4dbf}')
|
||||
|| (c >= '\u{4e00}' && c <= '\u{9fff}')
|
||||
|| (c >= '\u{f900}' && c <= '\u{faff}')
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
@ -22,7 +22,11 @@ enum SeparatorCategory {
|
||||
|
||||
impl SeparatorCategory {
|
||||
fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
|
||||
if let (Soft, Soft) = (self, other) { Soft } else { Hard }
|
||||
if let (Soft, Soft) = (self, other) {
|
||||
Soft
|
||||
} else {
|
||||
Hard
|
||||
}
|
||||
}
|
||||
|
||||
fn to_usize(self) -> usize {
|
||||
@ -40,7 +44,7 @@ fn is_separator(c: char) -> bool {
|
||||
fn classify_separator(c: char) -> Option<SeparatorCategory> {
|
||||
match c {
|
||||
' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
|
||||
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
|
||||
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@ -79,7 +83,7 @@ fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, u
|
||||
(n + 1, i + c.len_utf8())
|
||||
}
|
||||
|
||||
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
|
||||
pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
|
||||
Tokenizer::new(query).map(|t| t.word)
|
||||
}
|
||||
|
||||
@ -100,9 +104,10 @@ impl<'a> Tokenizer<'a> {
|
||||
pub fn new(string: &str) -> Tokenizer {
|
||||
// skip every separator and set `char_index`
|
||||
// to the number of char trimmed
|
||||
let (count, index) = string.char_indices()
|
||||
.take_while(|(_, c)| is_separator(*c))
|
||||
.fold((0, 0), chars_count_index);
|
||||
let (count, index) = string
|
||||
.char_indices()
|
||||
.take_while(|(_, c)| is_separator(*c))
|
||||
.fold((0, 0), chars_count_index);
|
||||
|
||||
Tokenizer {
|
||||
inner: &string[index..],
|
||||
@ -122,10 +127,11 @@ impl<'a> Iterator for Tokenizer<'a> {
|
||||
let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
|
||||
|
||||
if !is_str_word(string) {
|
||||
self.word_index += string.chars()
|
||||
.filter_map(classify_separator)
|
||||
.fold(Soft, |a, x| a.merge(x))
|
||||
.to_usize();
|
||||
self.word_index += string
|
||||
.chars()
|
||||
.filter_map(classify_separator)
|
||||
.fold(Soft, |a, x| a.merge(x))
|
||||
.to_usize();
|
||||
self.char_index += count;
|
||||
self.inner = &self.inner[index..];
|
||||
continue;
|
||||
@ -153,7 +159,8 @@ impl<'a> Iterator for Tokenizer<'a> {
|
||||
}
|
||||
|
||||
pub struct SeqTokenizer<'a, I>
|
||||
where I: Iterator<Item=&'a str>,
|
||||
where
|
||||
I: Iterator<Item = &'a str>,
|
||||
{
|
||||
inner: I,
|
||||
current: Option<Peekable<Tokenizer<'a>>>,
|
||||
@ -162,7 +169,8 @@ where I: Iterator<Item=&'a str>,
|
||||
}
|
||||
|
||||
impl<'a, I> SeqTokenizer<'a, I>
|
||||
where I: Iterator<Item=&'a str>,
|
||||
where
|
||||
I: Iterator<Item = &'a str>,
|
||||
{
|
||||
pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
|
||||
let current = iter.next().map(|s| Tokenizer::new(s).peekable());
|
||||
@ -176,7 +184,8 @@ where I: Iterator<Item=&'a str>,
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for SeqTokenizer<'a, I>
|
||||
where I: Iterator<Item=&'a str>,
|
||||
where
|
||||
I: Iterator<Item = &'a str>,
|
||||
{
|
||||
type Item = Token<'a>;
|
||||
|
||||
@ -202,15 +211,15 @@ where I: Iterator<Item=&'a str>,
|
||||
}
|
||||
|
||||
Some(token)
|
||||
},
|
||||
}
|
||||
None => {
|
||||
// no more words in this text we must
|
||||
// start tokenizing the next text
|
||||
self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
|
||||
self.next()
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
// no more texts available
|
||||
None => None,
|
||||
}
|
||||
@ -225,12 +234,26 @@ mod tests {
|
||||
fn easy() {
|
||||
let mut tokenizer = Tokenizer::new("salut");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "salut",
|
||||
word_index: 0,
|
||||
char_index: 0
|
||||
})
|
||||
);
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
|
||||
let mut tokenizer = Tokenizer::new("yo ");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "yo",
|
||||
word_index: 0,
|
||||
char_index: 0
|
||||
})
|
||||
);
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
}
|
||||
|
||||
@ -238,19 +261,82 @@ mod tests {
|
||||
fn hard() {
|
||||
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "yo",
|
||||
word_index: 0,
|
||||
char_index: 4
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "lolo",
|
||||
word_index: 1,
|
||||
char_index: 7
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "aïe",
|
||||
word_index: 9,
|
||||
char_index: 13
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "ouch",
|
||||
word_index: 17,
|
||||
char_index: 18
|
||||
})
|
||||
);
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
|
||||
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 }));
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "yo",
|
||||
word_index: 0,
|
||||
char_index: 0
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "lolo",
|
||||
word_index: 8,
|
||||
char_index: 5
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "wtf",
|
||||
word_index: 16,
|
||||
char_index: 12
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "lol",
|
||||
word_index: 17,
|
||||
char_index: 18
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "aïe",
|
||||
word_index: 25,
|
||||
char_index: 24
|
||||
})
|
||||
);
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
}
|
||||
|
||||
@ -258,18 +344,74 @@ mod tests {
|
||||
fn hard_long_chars() {
|
||||
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "yo",
|
||||
word_index: 0,
|
||||
char_index: 4
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "😂",
|
||||
word_index: 1,
|
||||
char_index: 7
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "aïe",
|
||||
word_index: 9,
|
||||
char_index: 10
|
||||
})
|
||||
);
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
|
||||
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 }));
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "yo",
|
||||
word_index: 0,
|
||||
char_index: 0
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "lolo",
|
||||
word_index: 8,
|
||||
char_index: 5
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "😱",
|
||||
word_index: 16,
|
||||
char_index: 12
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "lol",
|
||||
word_index: 17,
|
||||
char_index: 16
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "😣",
|
||||
word_index: 25,
|
||||
char_index: 22
|
||||
})
|
||||
);
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
}
|
||||
|
||||
@ -277,19 +419,82 @@ mod tests {
|
||||
fn hard_kanjis() {
|
||||
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "\u{2ec4}",
|
||||
word_index: 0,
|
||||
char_index: 0
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "lolilol",
|
||||
word_index: 1,
|
||||
char_index: 1
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "\u{2ec7}",
|
||||
word_index: 2,
|
||||
char_index: 8
|
||||
})
|
||||
);
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
|
||||
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 }));
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "\u{2ec4}",
|
||||
word_index: 0,
|
||||
char_index: 0
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "\u{2ed3}",
|
||||
word_index: 1,
|
||||
char_index: 1
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "\u{2ef2}",
|
||||
word_index: 2,
|
||||
char_index: 2
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "lolilol",
|
||||
word_index: 3,
|
||||
char_index: 4
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "hello",
|
||||
word_index: 4,
|
||||
char_index: 14
|
||||
})
|
||||
);
|
||||
assert_eq!(
|
||||
tokenizer.next(),
|
||||
Some(Token {
|
||||
word: "\u{2ec7}",
|
||||
word_index: 5,
|
||||
char_index: 23
|
||||
})
|
||||
);
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user