Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-26 14:54:27 +01:00)

Commit: ca26a0f2e4 "Cargo fmt pass"
Parent: 47d777c8f7
@@ -4,15 +4,15 @@ use std::error::Error;
 use std::io::Write;
 use std::iter::FromIterator;
 use std::path::{Path, PathBuf};
-use std::time::{Instant, Duration};
+use std::time::{Duration, Instant};
 use std::{fs, io, sync::mpsc};

-use rustyline::{Editor, Config};
-use serde::{Serialize, Deserialize};
+use rustyline::{Config, Editor};
+use serde::{Deserialize, Serialize};
 use structopt::StructOpt;
 use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};

-use meilidb_core::{Highlight, Database, UpdateResult};
+use meilidb_core::{Database, Highlight, UpdateResult};
 use meilidb_schema::SchemaAttr;

 const INDEX_NAME: &str = "default";
@@ -91,7 +91,7 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
 let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap();
 let index = match database.open_index(INDEX_NAME) {
 Some(index) => index,
-None => database.create_index(INDEX_NAME).unwrap()
+None => database.create_index(INDEX_NAME).unwrap(),
 };

 let done = database.set_update_callback(INDEX_NAME, Box::new(update_fn));
@@ -108,14 +108,14 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
 match index.main.schema(&writer)? {
 Some(current_schema) => {
 if current_schema != schema {
-return Err(meilidb_core::Error::SchemaDiffer.into())
+return Err(meilidb_core::Error::SchemaDiffer.into());
 }
 writer.abort();
-},
+}
 None => {
 index.schema_update(&mut writer, schema)?;
 writer.commit().unwrap();
-},
+}
 }

 let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
@@ -131,7 +131,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy

 loop {
 end_of_file = !rdr.read_record(&mut raw_record)?;
-if end_of_file { break }
+if end_of_file {
+break;
+}

 let document: Document = match raw_record.deserialize(Some(&headers)) {
 Ok(document) => document,
@@ -147,7 +149,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
 i += 1;

 if let Some(group_size) = command.update_group_size {
-if i % group_size == 0 { break }
+if i % group_size == 0 {
+break;
+}
 }
 }

@@ -163,15 +167,25 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy

 println!("Waiting for update {}", max_update_id);
 for id in receiver {
-if id == max_update_id { break }
+if id == max_update_id {
+break;
+}
 }

-println!("database created in {:.2?} at: {:?}", start.elapsed(), command.database_path);
+println!(
+"database created in {:.2?} at: {:?}",
+start.elapsed(),
+command.database_path
+);

 if let Some(path) = command.compact_to_path {
 let start = Instant::now();
 let _file = database.copy_and_compact_to_path(&path)?;
-println!("database compacted in {:.2?} at: {:?}", start.elapsed(), path);
+println!(
+"database compacted in {:.2?} at: {:?}",
+start.elapsed(),
+path
+);
 }

 Ok(())
@@ -182,7 +196,10 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
 let mut highlighted = false;

 for range in ranges.windows(2) {
-let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
+let [start, end] = match range {
+[start, end] => [*start, *end],
+_ => unreachable!(),
+};
 if highlighted {
 stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
 }
@@ -221,12 +238,14 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
 let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);

 match byte_indexes.entry(byte_index) {
-Entry::Vacant(entry) => { entry.insert(byte_length); },
+Entry::Vacant(entry) => {
+entry.insert(byte_length);
+}
 Entry::Occupied(mut entry) => {
 if *entry.get() < byte_length {
 entry.insert(byte_length);
 }
-},
+}
 }
 }

@@ -252,22 +271,23 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
 /// ```
 fn crop_text(
 text: &str,
-highlights: impl IntoIterator<Item=Highlight>,
+highlights: impl IntoIterator<Item = Highlight>,
 context: usize,
-) -> (String, Vec<Highlight>)
-{
+) -> (String, Vec<Highlight>) {
 let mut highlights = highlights.into_iter().peekable();

-let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
+let char_index = highlights
+.peek()
+.map(|m| m.char_index as usize)
+.unwrap_or(0);
 let start = char_index.saturating_sub(context);
 let text = text.chars().skip(start).take(context * 2).collect();

 let highlights = highlights
-.take_while(|m| {
-(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
-})
-.map(|highlight| {
-Highlight { char_index: highlight.char_index - start as u16, ..highlight }
+.take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2))
+.map(|highlight| Highlight {
+char_index: highlight.char_index - start as u16,
+..highlight
 })
 .collect();

@@ -276,7 +296,9 @@ fn crop_text(

 fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
 let env = &database.env;
-let index = database.open_index(INDEX_NAME).expect("Could not find index");
+let index = database
+.open_index(INDEX_NAME)
+.expect("Could not find index");

 let reader = env.read_txn().unwrap();
 let schema = index.main.schema(&reader)?;
@@ -312,10 +334,15 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
 (true, filter)
 };

-let attr = schema.attribute(&filter).expect("Could not find filtered attribute");
+let attr = schema
+.attribute(&filter)
+.expect("Could not find filtered attribute");

 builder.with_filter(move |document_id| {
-let string: String = ref_index.document_attribute(ref_reader, document_id, attr).unwrap().unwrap();
+let string: String = ref_index
+.document_attribute(ref_reader, document_id, attr)
+.unwrap()
+.unwrap();
 (string == "true") == positive
 });
 }
@@ -326,8 +353,8 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<

 let number_of_documents = documents.len();
 for mut doc in documents {
-doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
+doc.highlights
+.sort_unstable_by_key(|m| (m.char_index, m.char_length));

 let start_retrieve = Instant::now();
 let result = index.document::<Document>(&reader, Some(&fields), doc.id);
@@ -340,15 +367,18 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
 print!("{}: ", name);

 let attr = schema.attribute(&name).unwrap();
-let highlights = doc.highlights.iter()
-.filter(|m| SchemaAttr::new(m.attribute) == attr)
-.cloned();
-let (text, highlights) = crop_text(&text, highlights, command.char_context);
+let highlights = doc
+.highlights
+.iter()
+.filter(|m| SchemaAttr::new(m.attribute) == attr)
+.cloned();
+let (text, highlights) =
+crop_text(&text, highlights, command.char_context);
 let areas = create_highlight_areas(&text, &highlights);
 display_highlights(&text, &areas)?;
 println!();
 }
-},
+}
 Ok(None) => eprintln!("missing document"),
 Err(e) => eprintln!("{}", e),
 }
@@ -366,12 +396,19 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
 println!();
 }

-eprintln!("whole documents fields retrieve took {:.2?}", retrieve_duration);
-eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
-},
+eprintln!(
+"whole documents fields retrieve took {:.2?}",
+retrieve_duration
+);
+eprintln!(
+"===== Found {} results in {:.2?} =====",
+number_of_documents,
+start_total.elapsed()
+);
+}
 Err(err) => {
 println!("Error: {:?}", err);
-break
+break;
 }
 }
 }
@@ -1,8 +1,5 @@
+use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
 use once_cell::sync::OnceCell;
-use levenshtein_automata::{
-LevenshteinAutomatonBuilder as LevBuilder,
-DFA,
-};

 static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
 static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
@@ -15,30 +12,30 @@ enum PrefixSetting {
 }

 fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
-use PrefixSetting::{Prefix, NoPrefix};
+use PrefixSetting::{NoPrefix, Prefix};

 match query.len() {
-0 ..= 4 => {
+0..=4 => {
 let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
 match setting {
 Prefix => builder.build_prefix_dfa(query),
 NoPrefix => builder.build_dfa(query),
 }
-},
-5 ..= 8 => {
+}
+5..=8 => {
 let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
 match setting {
 Prefix => builder.build_prefix_dfa(query),
 NoPrefix => builder.build_dfa(query),
 }
-},
+}
 _ => {
 let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
 match setting {
 Prefix => builder.build_prefix_dfa(query),
 NoPrefix => builder.build_dfa(query),
 }
-},
+}
 }
 }

@ -6,14 +6,14 @@ use std::vec;
|
|||||||
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use levenshtein_automata::DFA;
|
use levenshtein_automata::DFA;
|
||||||
use meilidb_tokenizer::{split_query_string, is_cjk};
|
use meilidb_tokenizer::{is_cjk, split_query_string};
|
||||||
|
|
||||||
use crate::store;
|
|
||||||
use crate::error::MResult;
|
use crate::error::MResult;
|
||||||
|
use crate::store;
|
||||||
|
|
||||||
use self::dfa::{build_dfa, build_prefix_dfa};
|
use self::dfa::{build_dfa, build_prefix_dfa};
|
||||||
use self::query_enhancer::QueryEnhancerBuilder;
|
|
||||||
pub use self::query_enhancer::QueryEnhancer;
|
pub use self::query_enhancer::QueryEnhancer;
|
||||||
|
use self::query_enhancer::QueryEnhancerBuilder;
|
||||||
|
|
||||||
const NGRAMS: usize = 3;
|
const NGRAMS: usize = 3;
|
||||||
|
|
||||||
@ -27,14 +27,9 @@ impl AutomatonProducer {
|
|||||||
query: &str,
|
query: &str,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
synonyms_store: store::Synonyms,
|
synonyms_store: store::Synonyms,
|
||||||
) -> MResult<(AutomatonProducer, QueryEnhancer)>
|
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
|
||||||
{
|
let (automatons, query_enhancer) =
|
||||||
let (automatons, query_enhancer) = generate_automatons(
|
generate_automatons(reader, query, main_store, synonyms_store)?;
|
||||||
reader,
|
|
||||||
query,
|
|
||||||
main_store,
|
|
||||||
synonyms_store,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok((AutomatonProducer { automatons }, query_enhancer))
|
Ok((AutomatonProducer { automatons }, query_enhancer))
|
||||||
}
|
}
|
||||||
@ -112,8 +107,7 @@ fn generate_automatons(
|
|||||||
query: &str,
|
query: &str,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
synonym_store: store::Synonyms,
|
synonym_store: store::Synonyms,
|
||||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)>
|
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
|
||||||
{
|
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||||
@ -130,7 +124,6 @@ fn generate_automatons(
|
|||||||
let mut original_automatons = Vec::new();
|
let mut original_automatons = Vec::new();
|
||||||
let mut original_words = query_words.iter().peekable();
|
let mut original_words = query_words.iter().peekable();
|
||||||
while let Some(word) = original_words.next() {
|
while let Some(word) = original_words.next() {
|
||||||
|
|
||||||
let has_following_word = original_words.peek().is_some();
|
let has_following_word = original_words.peek().is_some();
|
||||||
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
||||||
|
|
||||||
@ -148,29 +141,33 @@ fn generate_automatons(
|
|||||||
for n in 1..=NGRAMS {
|
for n in 1..=NGRAMS {
|
||||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||||
while let Some((query_index, ngram_slice)) = ngrams.next() {
|
while let Some((query_index, ngram_slice)) = ngrams.next() {
|
||||||
|
|
||||||
let query_range = query_index..query_index + n;
|
let query_range = query_index..query_index + n;
|
||||||
let ngram_nb_words = ngram_slice.len();
|
let ngram_nb_words = ngram_slice.len();
|
||||||
let ngram = ngram_slice.join(" ");
|
let ngram = ngram_slice.join(" ");
|
||||||
|
|
||||||
let has_following_word = ngrams.peek().is_some();
|
let has_following_word = ngrams.peek().is_some();
|
||||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
let not_prefix_dfa =
|
||||||
|
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||||
|
|
||||||
// automaton of synonyms of the ngrams
|
// automaton of synonyms of the ngrams
|
||||||
let normalized = normalize_str(&ngram);
|
let normalized = normalize_str(&ngram);
|
||||||
let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) };
|
let lev = if not_prefix_dfa {
|
||||||
|
build_dfa(&normalized)
|
||||||
|
} else {
|
||||||
|
build_prefix_dfa(&normalized)
|
||||||
|
};
|
||||||
|
|
||||||
let mut stream = synonyms.search(&lev).into_stream();
|
let mut stream = synonyms.search(&lev).into_stream();
|
||||||
while let Some(base) = stream.next() {
|
while let Some(base) = stream.next() {
|
||||||
|
|
||||||
// only trigger alternatives when the last word has been typed
|
// only trigger alternatives when the last word has been typed
|
||||||
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
||||||
let base = std::str::from_utf8(base).unwrap();
|
let base = std::str::from_utf8(base).unwrap();
|
||||||
let base_nb_words = split_query_string(base).count();
|
let base_nb_words = split_query_string(base).count();
|
||||||
if ngram_nb_words != base_nb_words { continue }
|
if ngram_nb_words != base_nb_words {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
|
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
|
||||||
|
|
||||||
let mut stream = synonyms.into_stream();
|
let mut stream = synonyms.into_stream();
|
||||||
while let Some(synonyms) = stream.next() {
|
while let Some(synonyms) = stream.next() {
|
||||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
||||||
@ -178,7 +175,11 @@ fn generate_automatons(
|
|||||||
let nb_synonym_words = synonyms_words.len();
|
let nb_synonym_words = synonyms_words.len();
|
||||||
|
|
||||||
let real_query_index = automaton_index;
|
let real_query_index = automaton_index;
|
||||||
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
|
enhancer_builder.declare(
|
||||||
|
query_range.clone(),
|
||||||
|
real_query_index,
|
||||||
|
&synonyms_words,
|
||||||
|
);
|
||||||
|
|
||||||
for synonym in synonyms_words {
|
for synonym in synonyms_words {
|
||||||
let automaton = if nb_synonym_words == 1 {
|
let automaton = if nb_synonym_words == 1 {
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
|
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::cmp::Ordering::{Less, Greater, Equal};
|
|
||||||
|
|
||||||
/// Return `true` if the specified range can accept the given replacements words.
|
/// Return `true` if the specified range can accept the given replacements words.
|
||||||
/// Returns `false` if the replacements words are already present in the original query
|
/// Returns `false` if the replacements words are already present in the original query
|
||||||
@ -34,13 +34,14 @@ use std::cmp::Ordering::{Less, Greater, Equal};
|
|||||||
// [new york city]
|
// [new york city]
|
||||||
//
|
//
|
||||||
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
||||||
where S: AsRef<str>,
|
where
|
||||||
T: AsRef<str>,
|
S: AsRef<str>,
|
||||||
|
T: AsRef<str>,
|
||||||
{
|
{
|
||||||
if words.len() <= range.len() {
|
if words.len() <= range.len() {
|
||||||
// there is fewer or equal replacement words
|
// there is fewer or equal replacement words
|
||||||
// than there is already in the replaced range
|
// than there is already in the replaced range
|
||||||
return false
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// retrieve the part to rewrite but with the length
|
// retrieve the part to rewrite but with the length
|
||||||
@ -49,7 +50,9 @@ where S: AsRef<str>,
|
|||||||
|
|
||||||
// check if the original query doesn't already contain
|
// check if the original query doesn't already contain
|
||||||
// the replacement words
|
// the replacement words
|
||||||
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
|
!original
|
||||||
|
.map(AsRef::as_ref)
|
||||||
|
.eq(words.iter().map(AsRef::as_ref))
|
||||||
}
|
}
|
||||||
|
|
||||||
type Origin = usize;
|
type Origin = usize;
|
||||||
@ -68,11 +71,20 @@ impl FakeIntervalTree {
|
|||||||
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
||||||
let element = self.intervals.binary_search_by(|(r, _)| {
|
let element = self.intervals.binary_search_by(|(r, _)| {
|
||||||
if point >= r.start {
|
if point >= r.start {
|
||||||
if point < r.end { Equal } else { Less }
|
if point < r.end {
|
||||||
} else { Greater }
|
Equal
|
||||||
|
} else {
|
||||||
|
Less
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Greater
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let n = match element { Ok(n) => n, Err(n) => n };
|
let n = match element {
|
||||||
|
Ok(n) => n,
|
||||||
|
Err(n) => n,
|
||||||
|
};
|
||||||
|
|
||||||
match self.intervals.get(n) {
|
match self.intervals.get(n) {
|
||||||
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
||||||
@ -91,9 +103,13 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
|||||||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
||||||
// we initialize origins query indices based on their positions
|
// we initialize origins query indices based on their positions
|
||||||
let origins: Vec<_> = (0..query.len() + 1).collect();
|
let origins: Vec<_> = (0..query.len() + 1).collect();
|
||||||
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
|
let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
|
||||||
|
|
||||||
QueryEnhancerBuilder { query, origins, real_to_origin }
|
QueryEnhancerBuilder {
|
||||||
|
query,
|
||||||
|
origins,
|
||||||
|
real_to_origin,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update the final real to origin query indices mapping.
|
/// Update the final real to origin query indices mapping.
|
||||||
@ -101,12 +117,12 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
|||||||
/// `range` is the original words range that this `replacement` words replace
|
/// `range` is the original words range that this `replacement` words replace
|
||||||
/// and `real` is the first real query index of these replacement words.
|
/// and `real` is the first real query index of these replacement words.
|
||||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
||||||
where T: AsRef<str>,
|
where
|
||||||
|
T: AsRef<str>,
|
||||||
{
|
{
|
||||||
// check if the range of original words
|
// check if the range of original words
|
||||||
// can be rewritten with the replacement words
|
// can be rewritten with the replacement words
|
||||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
if rewrite_range_with(self.query, range.clone(), replacement) {
|
||||||
|
|
||||||
// this range can be replaced so we need to
|
// this range can be replaced so we need to
|
||||||
// modify the origins accordingly
|
// modify the origins accordingly
|
||||||
let offset = replacement.len() - range.len();
|
let offset = replacement.len() - range.len();
|
||||||
@ -126,7 +142,8 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
|||||||
// we need to pad real query indices
|
// we need to pad real query indices
|
||||||
let real_range = real..real + replacement.len().max(range.len());
|
let real_range = real..real + replacement.len().max(range.len());
|
||||||
let real_length = replacement.len();
|
let real_length = replacement.len();
|
||||||
self.real_to_origin.push((real_range, (range.start, real_length)));
|
self.real_to_origin
|
||||||
|
.push((real_range, (range.start, real_length)));
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn build(self) -> QueryEnhancer {
|
pub fn build(self) -> QueryEnhancer {
|
||||||
@ -148,10 +165,10 @@ impl QueryEnhancer {
|
|||||||
let real = real as usize;
|
let real = real as usize;
|
||||||
|
|
||||||
// query the fake interval tree with the real query index
|
// query the fake interval tree with the real query index
|
||||||
let (range, (origin, real_length)) =
|
let (range, (origin, real_length)) = self
|
||||||
self.real_to_origin
|
.real_to_origin
|
||||||
.query(real)
|
.query(real)
|
||||||
.expect("real has never been declared");
|
.expect("real has never been declared");
|
||||||
|
|
||||||
// if `real` is the end bound of the range
|
// if `real` is the end bound of the range
|
||||||
if (range.start + real_length - 1) == real {
|
if (range.start + real_length - 1) == real {
|
||||||
@ -160,7 +177,10 @@ impl QueryEnhancer {
|
|||||||
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
|
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
|
||||||
let len = slice[1] - slice[0];
|
let len = slice[1] - slice[0];
|
||||||
count = count.saturating_sub(len);
|
count = count.saturating_sub(len);
|
||||||
if count == 0 { new_origin = origin + i; break }
|
if count == 0 {
|
||||||
|
new_origin = origin + i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let n = real - range.start;
|
let n = real - range.start;
|
||||||
@ -168,15 +188,20 @@ impl QueryEnhancer {
|
|||||||
let end = self.origins[new_origin + 1];
|
let end = self.origins[new_origin + 1];
|
||||||
let remaining = (end - start) - n;
|
let remaining = (end - start) - n;
|
||||||
|
|
||||||
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
|
Range {
|
||||||
|
start: (start + n) as u32,
|
||||||
|
end: (start + n + remaining) as u32,
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// just return the origin along with
|
// just return the origin along with
|
||||||
// the real position of the word
|
// the real position of the word
|
||||||
let n = real as usize - range.start;
|
let n = real as usize - range.start;
|
||||||
let origin = self.origins[origin];
|
let origin = self.origins[origin];
|
||||||
|
|
||||||
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
|
Range {
|
||||||
|
start: (origin + n) as u32,
|
||||||
|
end: (origin + n + 1) as u32,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -382,16 +407,16 @@ mod tests {
|
|||||||
|
|
||||||
let enhancer = builder.build();
|
let enhancer = builder.build();
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||||
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
||||||
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::cmp::Ordering;
|
|
||||||
use crate::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::RawDocument;
|
use crate::RawDocument;
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct DocumentId;
|
pub struct DocumentId;
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
|
use meilidb_schema::SchemaAttr;
|
||||||
use sdset::Set;
|
use sdset::Set;
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
use meilidb_schema::SchemaAttr;
|
|
||||||
|
|
||||||
use crate::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::RawDocument;
|
use crate::RawDocument;
|
||||||
@ -13,8 +13,7 @@ fn number_exact_matches(
|
|||||||
attribute: &[u16],
|
attribute: &[u16],
|
||||||
is_exact: &[bool],
|
is_exact: &[bool],
|
||||||
fields_counts: &Set<(SchemaAttr, u64)>,
|
fields_counts: &Set<(SchemaAttr, u64)>,
|
||||||
) -> usize
|
) -> usize {
|
||||||
{
|
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let mut index = 0;
|
let mut index = 0;
|
||||||
|
|
||||||
@ -22,12 +21,16 @@ fn number_exact_matches(
|
|||||||
let len = group.len();
|
let len = group.len();
|
||||||
|
|
||||||
let mut found_exact = false;
|
let mut found_exact = false;
|
||||||
for (pos, _) in is_exact[index..index + len].iter().filter(|x| **x).enumerate() {
|
for (pos, _) in is_exact[index..index + len]
|
||||||
|
.iter()
|
||||||
|
.filter(|x| **x)
|
||||||
|
.enumerate()
|
||||||
|
{
|
||||||
found_exact = true;
|
found_exact = true;
|
||||||
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
|
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
|
||||||
let (_, count) = fields_counts[pos];
|
let (_, count) = fields_counts[pos];
|
||||||
if count == 1 {
|
if count == 1 {
|
||||||
return usize::max_value()
|
return usize::max_value();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -81,18 +84,18 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn easy_case() {
|
fn easy_case() {
|
||||||
let doc0 = {
|
let doc0 = {
|
||||||
let query_index = &[0];
|
let query_index = &[0];
|
||||||
let attribute = &[0];
|
let attribute = &[0];
|
||||||
let is_exact = &[true];
|
let is_exact = &[true];
|
||||||
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
||||||
|
|
||||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
};
|
};
|
||||||
|
|
||||||
let doc1 = {
|
let doc1 = {
|
||||||
let query_index = &[0];
|
let query_index = &[0];
|
||||||
let attribute = &[0];
|
let attribute = &[0];
|
||||||
let is_exact = &[false];
|
let is_exact = &[false];
|
||||||
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
|
||||||
|
|
||||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
@ -108,18 +111,18 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn basic() {
|
fn basic() {
|
||||||
let doc0 = {
|
let doc0 = {
|
||||||
let query_index = &[0];
|
let query_index = &[0];
|
||||||
let attribute = &[0];
|
let attribute = &[0];
|
||||||
let is_exact = &[true];
|
let is_exact = &[true];
|
||||||
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
|
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
|
||||||
|
|
||||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
};
|
};
|
||||||
|
|
||||||
let doc1 = {
|
let doc1 = {
|
||||||
let query_index = &[0];
|
let query_index = &[0];
|
||||||
let attribute = &[0];
|
let attribute = &[0];
|
||||||
let is_exact = &[true];
|
let is_exact = &[true];
|
||||||
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
|
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
|
||||||
|
|
||||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||||
|
@ -1,24 +1,20 @@
|
|||||||
mod sum_of_typos;
|
mod document_id;
|
||||||
|
mod exact;
|
||||||
mod number_of_words;
|
mod number_of_words;
|
||||||
mod words_proximity;
|
mod sort_by_attr;
|
||||||
|
mod sum_of_typos;
|
||||||
mod sum_of_words_attribute;
|
mod sum_of_words_attribute;
|
||||||
mod sum_of_words_position;
|
mod sum_of_words_position;
|
||||||
mod exact;
|
mod words_proximity;
|
||||||
mod sort_by_attr;
|
|
||||||
mod document_id;
|
|
||||||
|
|
||||||
use std::cmp::Ordering;
|
|
||||||
use crate::RawDocument;
|
use crate::RawDocument;
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
pub use self::{
|
pub use self::{
|
||||||
sum_of_typos::SumOfTypos,
|
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
|
||||||
number_of_words::NumberOfWords,
|
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
|
||||||
|
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
|
||||||
words_proximity::WordsProximity,
|
words_proximity::WordsProximity,
|
||||||
sum_of_words_attribute::SumOfWordsAttribute,
|
|
||||||
sum_of_words_position::SumOfWordsPosition,
|
|
||||||
exact::Exact,
|
|
||||||
sort_by_attr::SortByAttr,
|
|
||||||
document_id::DocumentId,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
pub trait Criterion: Send + Sync {
|
pub trait Criterion: Send + Sync {
|
||||||
@ -62,17 +58,18 @@ impl<T: Criterion + ?Sized> Criterion for Box<T> {
|
|||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct CriteriaBuilder<'a> {
|
pub struct CriteriaBuilder<'a> {
|
||||||
inner: Vec<Box<dyn Criterion + 'a>>
|
inner: Vec<Box<dyn Criterion + 'a>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> CriteriaBuilder<'a>
|
impl<'a> CriteriaBuilder<'a> {
|
||||||
{
|
|
||||||
pub fn new() -> CriteriaBuilder<'a> {
|
pub fn new() -> CriteriaBuilder<'a> {
|
||||||
CriteriaBuilder { inner: Vec::new() }
|
CriteriaBuilder { inner: Vec::new() }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
|
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
|
||||||
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
|
CriteriaBuilder {
|
||||||
|
inner: Vec::with_capacity(capacity),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn reserve(&mut self, additional: usize) {
|
pub fn reserve(&mut self, additional: usize) {
|
||||||
@ -80,14 +77,16 @@ impl<'a> CriteriaBuilder<'a>
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
|
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
|
||||||
where C: Criterion,
|
where
|
||||||
|
C: Criterion,
|
||||||
{
|
{
|
||||||
self.push(criterion);
|
self.push(criterion);
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn push<C: 'a>(&mut self, criterion: C)
|
pub fn push<C: 'a>(&mut self, criterion: C)
|
||||||
where C: Criterion,
|
where
|
||||||
|
C: Criterion,
|
||||||
{
|
{
|
||||||
self.inner.push(Box::new(criterion));
|
self.inner.push(Box::new(criterion));
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
|
||||||
use slice_group_by::GroupBy;
|
|
||||||
use crate::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::RawDocument;
|
use crate::RawDocument;
|
||||||
|
use slice_group_by::GroupBy;
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn number_of_query_words(query_index: &[u32]) -> usize {
|
fn number_of_query_words(query_index: &[u32]) -> usize {
|
||||||
|
@ -2,9 +2,9 @@ use std::cmp::Ordering;
|
|||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
use meilidb_schema::{Schema, SchemaAttr};
|
|
||||||
use crate::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::{RawDocument, RankedMap};
|
use crate::{RankedMap, RawDocument};
|
||||||
|
use meilidb_schema::{Schema, SchemaAttr};
|
||||||
|
|
||||||
/// An helper struct that permit to sort documents by
|
/// An helper struct that permit to sort documents by
|
||||||
/// some of their stored attributes.
|
/// some of their stored attributes.
|
||||||
@ -51,8 +51,7 @@ impl<'a> SortByAttr<'a> {
|
|||||||
ranked_map: &'a RankedMap,
|
ranked_map: &'a RankedMap,
|
||||||
schema: &Schema,
|
schema: &Schema,
|
||||||
attr_name: &str,
|
attr_name: &str,
|
||||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||||
{
|
|
||||||
SortByAttr::new(ranked_map, schema, attr_name, false)
|
SortByAttr::new(ranked_map, schema, attr_name, false)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -60,8 +59,7 @@ impl<'a> SortByAttr<'a> {
|
|||||||
ranked_map: &'a RankedMap,
|
ranked_map: &'a RankedMap,
|
||||||
schema: &Schema,
|
schema: &Schema,
|
||||||
attr_name: &str,
|
attr_name: &str,
|
||||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||||
{
|
|
||||||
SortByAttr::new(ranked_map, schema, attr_name, true)
|
SortByAttr::new(ranked_map, schema, attr_name, true)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -70,8 +68,7 @@ impl<'a> SortByAttr<'a> {
|
|||||||
schema: &Schema,
|
schema: &Schema,
|
||||||
attr_name: &str,
|
attr_name: &str,
|
||||||
reversed: bool,
|
reversed: bool,
|
||||||
) -> Result<SortByAttr<'a>, SortByAttrError>
|
) -> Result<SortByAttr<'a>, SortByAttrError> {
|
||||||
{
|
|
||||||
let attr = match schema.attribute(attr_name) {
|
let attr = match schema.attribute(attr_name) {
|
||||||
Some(attr) => attr,
|
Some(attr) => attr,
|
||||||
None => return Err(SortByAttrError::AttributeNotFound),
|
None => return Err(SortByAttrError::AttributeNotFound),
|
||||||
@ -81,7 +78,11 @@ impl<'a> SortByAttr<'a> {
|
|||||||
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
|
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(SortByAttr { ranked_map, attr, reversed })
|
Ok(SortByAttr {
|
||||||
|
ranked_map,
|
||||||
|
attr,
|
||||||
|
reversed,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,11 +94,15 @@ impl<'a> Criterion for SortByAttr<'a> {
|
|||||||
match (lhs, rhs) {
|
match (lhs, rhs) {
|
||||||
(Some(lhs), Some(rhs)) => {
|
(Some(lhs), Some(rhs)) => {
|
||||||
let order = lhs.cmp(&rhs);
|
let order = lhs.cmp(&rhs);
|
||||||
if self.reversed { order.reverse() } else { order }
|
if self.reversed {
|
||||||
},
|
order.reverse()
|
||||||
(None, Some(_)) => Ordering::Greater,
|
} else {
|
||||||
(Some(_), None) => Ordering::Less,
|
order
|
||||||
(None, None) => Ordering::Equal,
|
}
|
||||||
|
}
|
||||||
|
(None, Some(_)) => Ordering::Greater,
|
||||||
|
(Some(_), None) => Ordering::Less,
|
||||||
|
(None, None) => Ordering::Equal,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -122,4 +127,4 @@ impl fmt::Display for SortByAttrError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Error for SortByAttrError { }
|
impl Error for SortByAttrError {}
|
||||||
|
@ -11,10 +11,10 @@ use crate::RawDocument;
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn custom_log10(n: u8) -> f32 {
|
fn custom_log10(n: u8) -> f32 {
|
||||||
match n {
|
match n {
|
||||||
0 => 0.0, // log(1)
|
0 => 0.0, // log(1)
|
||||||
1 => 0.30102, // log(2)
|
1 => 0.30102, // log(2)
|
||||||
2 => 0.47712, // log(3)
|
2 => 0.47712, // log(3)
|
||||||
3 => 0.60205, // log(4)
|
3 => 0.60205, // log(4)
|
||||||
_ => panic!("invalid number"),
|
_ => panic!("invalid number"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
|
||||||
use slice_group_by::GroupBy;
|
|
||||||
use crate::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::RawDocument;
|
use crate::RawDocument;
|
||||||
|
use slice_group_by::GroupBy;
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
|
||||||
use slice_group_by::GroupBy;
|
|
||||||
use crate::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::RawDocument;
|
use crate::RawDocument;
|
||||||
|
use slice_group_by::GroupBy;
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::{self, Ordering};
|
|
||||||
use slice_group_by::GroupBy;
|
|
||||||
use crate::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::RawDocument;
|
use crate::RawDocument;
|
||||||
|
use slice_group_by::GroupBy;
|
||||||
|
use std::cmp::{self, Ordering};
|
||||||
|
|
||||||
const MAX_DISTANCE: u16 = 8;
|
const MAX_DISTANCE: u16 = 8;
|
||||||
|
|
||||||
@ -19,7 +19,9 @@ fn index_proximity(lhs: u16, rhs: u16) -> u16 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
|
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
|
||||||
if lattr != rattr { return MAX_DISTANCE }
|
if lattr != rattr {
|
||||||
|
return MAX_DISTANCE;
|
||||||
|
}
|
||||||
index_proximity(lwi, rwi)
|
index_proximity(lwi, rwi)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -42,15 +44,18 @@ fn matches_proximity(
|
|||||||
distance: &[u8],
|
distance: &[u8],
|
||||||
attribute: &[u16],
|
attribute: &[u16],
|
||||||
word_index: &[u16],
|
word_index: &[u16],
|
||||||
) -> u16
|
) -> u16 {
|
||||||
{
|
|
||||||
let mut query_index_groups = query_index.linear_group();
|
let mut query_index_groups = query_index.linear_group();
|
||||||
let mut proximity = 0;
|
let mut proximity = 0;
|
||||||
let mut index = 0;
|
let mut index = 0;
|
||||||
|
|
||||||
let get_attr_wi = |index: usize, group_len: usize| {
|
let get_attr_wi = |index: usize, group_len: usize| {
|
||||||
// retrieve the first distance group (with the lowest values)
|
// retrieve the first distance group (with the lowest values)
|
||||||
let len = distance[index..index + group_len].linear_group().next().unwrap().len();
|
let len = distance[index..index + group_len]
|
||||||
|
.linear_group()
|
||||||
|
.next()
|
||||||
|
.unwrap()
|
||||||
|
.len();
|
||||||
|
|
||||||
let rattr = &attribute[index..index + len];
|
let rattr = &attribute[index..index + len];
|
||||||
let rwi = &word_index[index..index + len];
|
let rwi = &word_index[index..index + len];
|
||||||
@ -110,7 +115,6 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn three_different_attributes() {
|
fn three_different_attributes() {
|
||||||
|
|
||||||
// "soup" "of the" "the day"
|
// "soup" "of the" "the day"
|
||||||
//
|
//
|
||||||
// { id: 0, attr: 0, attr_index: 0 }
|
// { id: 0, attr: 0, attr_index: 0 }
|
||||||
@ -120,19 +124,21 @@ mod tests {
|
|||||||
// { id: 3, attr: 3, attr_index: 1 }
|
// { id: 3, attr: 3, attr_index: 1 }
|
||||||
|
|
||||||
let query_index = &[0, 1, 2, 2, 3];
|
let query_index = &[0, 1, 2, 2, 3];
|
||||||
let distance = &[0, 0, 0, 0, 0];
|
let distance = &[0, 0, 0, 0, 0];
|
||||||
let attribute = &[0, 1, 1, 2, 3];
|
let attribute = &[0, 1, 1, 2, 3];
|
||||||
let word_index = &[0, 0, 1, 0, 1];
|
let word_index = &[0, 0, 1, 0, 1];
|
||||||
|
|
||||||
// soup -> of = 8
|
// soup -> of = 8
|
||||||
// + of -> the = 1
|
// + of -> the = 1
|
||||||
// + the -> day = 8 (not 1)
|
// + the -> day = 8 (not 1)
|
||||||
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
|
assert_eq!(
|
||||||
|
matches_proximity(query_index, distance, attribute, word_index),
|
||||||
|
17
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn two_different_attributes() {
|
fn two_different_attributes() {
|
||||||
|
|
||||||
// "soup day" "soup of the day"
|
// "soup day" "soup of the day"
|
||||||
//
|
//
|
||||||
// { id: 0, attr: 0, attr_index: 0 }
|
// { id: 0, attr: 0, attr_index: 0 }
|
||||||
@ -143,13 +149,16 @@ mod tests {
|
|||||||
// { id: 3, attr: 1, attr_index: 3 }
|
// { id: 3, attr: 1, attr_index: 3 }
|
||||||
|
|
||||||
let query_index = &[0, 0, 1, 2, 3, 3];
|
let query_index = &[0, 0, 1, 2, 3, 3];
|
||||||
let distance = &[0, 0, 0, 0, 0, 0];
|
let distance = &[0, 0, 0, 0, 0, 0];
|
||||||
let attribute = &[0, 1, 1, 1, 0, 1];
|
let attribute = &[0, 1, 1, 1, 0, 1];
|
||||||
let word_index = &[0, 0, 1, 2, 1, 3];
|
let word_index = &[0, 0, 1, 2, 1, 3];
|
||||||
|
|
||||||
// soup -> of = 1
|
// soup -> of = 1
|
||||||
// + of -> the = 1
|
// + of -> the = 1
|
||||||
// + the -> day = 1
|
// + the -> day = 1
|
||||||
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
|
assert_eq!(
|
||||||
|
matches_proximity(query_index, distance, attribute, word_index),
|
||||||
|
3
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,13 @@
|
|||||||
use std::collections::hash_map::{HashMap, Entry};
|
use std::collections::hash_map::{Entry, HashMap};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use std::{fs, thread};
|
use std::{fs, thread};
|
||||||
|
|
||||||
use zlmdb::{Result as ZResult, CompactionOption};
|
|
||||||
use zlmdb::types::{Str, Unit};
|
|
||||||
use crossbeam_channel::Receiver;
|
use crossbeam_channel::Receiver;
|
||||||
use log::{debug, error};
|
use log::{debug, error};
|
||||||
|
use zlmdb::types::{Str, Unit};
|
||||||
|
use zlmdb::{CompactionOption, Result as ZResult};
|
||||||
|
|
||||||
use crate::{store, update, Index, MResult};
|
use crate::{store, update, Index, MResult};
|
||||||
|
|
||||||
@ -32,20 +32,32 @@ fn update_awaiter(
|
|||||||
loop {
|
loop {
|
||||||
let mut writer = match env.write_txn() {
|
let mut writer = match env.write_txn() {
|
||||||
Ok(writer) => writer,
|
Ok(writer) => writer,
|
||||||
Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break }
|
Err(e) => {
|
||||||
|
error!("LMDB writer transaction begin failed: {}", e);
|
||||||
|
break;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
match update::update_task(&mut writer, index.clone()) {
|
match update::update_task(&mut writer, index.clone()) {
|
||||||
Ok(Some(status)) => {
|
Ok(Some(status)) => {
|
||||||
if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) }
|
if let Err(e) = writer.commit() {
|
||||||
|
error!("update transaction failed: {}", e)
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(ref callback) = *update_fn.load() {
|
if let Some(ref callback) = *update_fn.load() {
|
||||||
(callback)(status);
|
(callback)(status);
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
// no more updates to handle for now
|
// no more updates to handle for now
|
||||||
Ok(None) => { debug!("no more updates"); writer.abort(); break },
|
Ok(None) => {
|
||||||
Err(e) => { error!("update task failed: {}", e); writer.abort() },
|
debug!("no more updates");
|
||||||
|
writer.abort();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("update task failed: {}", e);
|
||||||
|
writer.abort()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -76,14 +88,16 @@ impl Database {
|
|||||||
// open the previously aggregated indexes
|
// open the previously aggregated indexes
|
||||||
let mut indexes = HashMap::new();
|
let mut indexes = HashMap::new();
|
||||||
for index_name in must_open {
|
for index_name in must_open {
|
||||||
|
|
||||||
let (sender, receiver) = crossbeam_channel::bounded(100);
|
let (sender, receiver) = crossbeam_channel::bounded(100);
|
||||||
let index = match store::open(&env, &index_name, sender.clone())? {
|
let index = match store::open(&env, &index_name, sender.clone())? {
|
||||||
Some(index) => index,
|
Some(index) => index,
|
||||||
None => {
|
None => {
|
||||||
log::warn!("the index {} doesn't exist or has not all the databases", index_name);
|
log::warn!(
|
||||||
|
"the index {} doesn't exist or has not all the databases",
|
||||||
|
index_name
|
||||||
|
);
|
||||||
continue;
|
continue;
|
||||||
},
|
}
|
||||||
};
|
};
|
||||||
let update_fn = Arc::new(ArcSwapFn::empty());
|
let update_fn = Arc::new(ArcSwapFn::empty());
|
||||||
|
|
||||||
@ -100,10 +114,18 @@ impl Database {
|
|||||||
sender.send(()).unwrap();
|
sender.send(()).unwrap();
|
||||||
|
|
||||||
let result = indexes.insert(index_name, (index, update_fn, handle));
|
let result = indexes.insert(index_name, (index, update_fn, handle));
|
||||||
assert!(result.is_none(), "The index should not have been already open");
|
assert!(
|
||||||
|
result.is_none(),
|
||||||
|
"The index should not have been already open"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Database { env, common_store, indexes_store, indexes: RwLock::new(indexes) })
|
Ok(Database {
|
||||||
|
env,
|
||||||
|
common_store,
|
||||||
|
indexes_store,
|
||||||
|
indexes: RwLock::new(indexes),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn open_index(&self, name: impl AsRef<str>) -> Option<Index> {
|
pub fn open_index(&self, name: impl AsRef<str>) -> Option<Index> {
|
||||||
@ -152,7 +174,7 @@ impl Database {
|
|||||||
let update_fn = Some(Arc::new(update_fn));
|
let update_fn = Some(Arc::new(update_fn));
|
||||||
current_update_fn.swap(update_fn);
|
current_update_fn.swap(update_fn);
|
||||||
true
|
true
|
||||||
},
|
}
|
||||||
None => false,
|
None => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -160,7 +182,10 @@ impl Database {
|
|||||||
pub fn unset_update_callback(&self, name: impl AsRef<str>) -> bool {
|
pub fn unset_update_callback(&self, name: impl AsRef<str>) -> bool {
|
||||||
let indexes_lock = self.indexes.read().unwrap();
|
let indexes_lock = self.indexes.read().unwrap();
|
||||||
match indexes_lock.get(name.as_ref()) {
|
match indexes_lock.get(name.as_ref()) {
|
||||||
Some((_, current_update_fn, _)) => { current_update_fn.swap(None); true },
|
Some((_, current_update_fn, _)) => {
|
||||||
|
current_update_fn.swap(None);
|
||||||
|
true
|
||||||
|
}
|
||||||
None => false,
|
None => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::hash::Hash;
|
|
||||||
use hashbrown::HashMap;
|
use hashbrown::HashMap;
|
||||||
|
use std::hash::Hash;
|
||||||
|
|
||||||
pub struct DistinctMap<K> {
|
pub struct DistinctMap<K> {
|
||||||
inner: HashMap<K, usize>,
|
inner: HashMap<K, usize>,
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::{error, fmt, io};
|
use crate::serde::{DeserializerError, SerializerError};
|
||||||
use serde_json::Error as SerdeJsonError;
|
use serde_json::Error as SerdeJsonError;
|
||||||
use crate::serde::{SerializerError, DeserializerError};
|
use std::{error, fmt, io};
|
||||||
|
|
||||||
pub type MResult<T> = Result<T, Error>;
|
pub type MResult<T> = Result<T, Error>;
|
||||||
|
|
||||||
@ -90,7 +90,7 @@ impl fmt::Display for Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl error::Error for Error { }
|
impl error::Error for Error {}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum UnsupportedOperation {
|
pub enum UnsupportedOperation {
|
||||||
|
@@ -1,7 +1,9 @@
 #[cfg(test)]
-#[macro_use] extern crate assert_matches;
+#[macro_use]
+extern crate assert_matches;

 mod automaton;
+pub mod criterion;
 mod database;
 mod distinct_map;
 mod error;
@@ -9,31 +11,41 @@ mod number;
 mod query_builder;
 mod ranked_map;
 mod raw_document;
-mod reordered_attrs;
-mod update;
-pub mod criterion;
 pub mod raw_indexer;
+mod reordered_attrs;
 pub mod serde;
 pub mod store;
+mod update;

-pub use self::database::{Database, BoxUpdateFn};
+pub use self::database::{BoxUpdateFn, Database};
 pub use self::error::{Error, MResult};
 pub use self::number::{Number, ParseNumberError};
 pub use self::ranked_map::RankedMap;
 pub use self::raw_document::RawDocument;
 pub use self::store::Index;
-pub use self::update::{UpdateStatus, UpdateResult, UpdateType};
+pub use self::update::{UpdateResult, UpdateStatus, UpdateType};

+use ::serde::{Deserialize, Serialize};
 use zerocopy::{AsBytes, FromBytes};
-use ::serde::{Serialize, Deserialize};

 /// Represent an internally generated document unique identifier.
 ///
 /// It is used to inform the database the document you want to deserialize.
 /// Helpful for custom ranking.
-#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
-#[derive(Serialize, Deserialize)]
-#[derive(AsBytes, FromBytes)]
+#[derive(
+    Debug,
+    Copy,
+    Clone,
+    Eq,
+    PartialEq,
+    PartialOrd,
+    Ord,
+    Hash,
+    Serialize,
+    Deserialize,
+    AsBytes,
+    FromBytes,
+)]
 #[repr(C)]
 pub struct DocumentId(pub u64);

@@ -42,8 +54,7 @@ pub struct DocumentId(pub u64);
 ///
 /// This is stored in the map, generated at index time,
 /// extracted and interpreted at search time.
-#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-#[derive(AsBytes, FromBytes)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, AsBytes, FromBytes)]
 #[repr(C)]
 pub struct DocIndex {
     /// The document identifier where the word was found.
@@ -109,7 +120,10 @@ pub struct Document {
 impl Document {
     #[cfg(not(test))]
     fn from_raw(raw: RawDocument) -> Document {
-        Document { id: raw.id, highlights: raw.highlights }
+        Document {
+            id: raw.id,
+            highlights: raw.highlights,
+        }
     }

     #[cfg(test)]
@@ -134,7 +148,11 @@ impl Document {
             matches.push(match_);
         }

-        Document { id: raw.id, matches, highlights: raw.highlights }
+        Document {
+            id: raw.id,
+            matches,
+            highlights: raw.highlights,
+        }
     }
 }
@@ -1,12 +1,11 @@
-use std::num::{ParseIntError, ParseFloatError};
-use std::str::FromStr;
 use std::fmt;
+use std::num::{ParseFloatError, ParseIntError};
+use std::str::FromStr;

 use ordered_float::OrderedFloat;
-use serde::{Serialize, Deserialize};
+use serde::{Deserialize, Serialize};

-#[derive(Serialize, Deserialize)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum Number {
     Unsigned(u64),
     Signed(i64),
@@ -32,7 +31,11 @@ impl FromStr for Number {
             Err(error) => error,
         };

-        Err(ParseNumberError { uint_error, int_error, float_error })
+        Err(ParseNumberError {
+            uint_error,
+            int_error,
+            float_error,
+        })
     }
 }

@@ -46,10 +49,17 @@ pub struct ParseNumberError {
 impl fmt::Display for ParseNumberError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         if self.uint_error == self.int_error {
-            write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
+            write!(
+                f,
+                "can not parse number: {}, {}",
+                self.uint_error, self.float_error
+            )
         } else {
-            write!(f, "can not parse number: {}, {}, {}",
-                self.uint_error, self.int_error, self.float_error)
+            write!(
+                f,
+                "can not parse number: {}, {}, {}",
+                self.uint_error, self.int_error, self.float_error
+            )
         }
     }
 }
@@ -2,17 +2,17 @@ use hashbrown::HashMap;
 use std::mem;
 use std::ops::Range;
 use std::rc::Rc;
-use std::time::{Instant, Duration};
+use std::time::{Duration, Instant};

 use fst::{IntoStreamer, Streamer};
 use sdset::SetBuf;
 use slice_group_by::{GroupBy, GroupByMut};

 use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
-use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
-use crate::raw_document::{RawDocument, raw_documents_from};
-use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria};
-use crate::{store, MResult, reordered_attrs::ReorderedAttrs};
+use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
+use crate::raw_document::{raw_documents_from, RawDocument};
+use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
+use crate::{reordered_attrs::ReorderedAttrs, store, MResult};

 pub struct QueryBuilder<'c, 'f, 'd> {
     criteria: Criteria<'c>,
@@ -29,8 +29,7 @@ pub struct QueryBuilder<'c, 'f, 'd> {
 fn multiword_rewrite_matches(
     mut matches: Vec<(DocumentId, TmpMatch)>,
     query_enhancer: &QueryEnhancer,
-) -> SetBuf<(DocumentId, TmpMatch)>
-{
+) -> SetBuf<(DocumentId, TmpMatch)> {
     let mut padded_matches = Vec::with_capacity(matches.len());

     // we sort the matches by word index to make them rewritable
@@ -38,7 +37,6 @@ fn multiword_rewrite_matches(

     // for each attribute of each document
     for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
-
         // padding will only be applied
         // to word indices in the same attribute
         let mut padding = 0;
@@ -47,18 +45,20 @@ fn multiword_rewrite_matches(
         // for each match at the same position
         // in this document attribute
         while let Some(same_word_index) = iter.next() {
-
             // find the biggest padding
             let mut biggest = 0;
             for (id, match_) in same_word_index {
-
                 let mut replacement = query_enhancer.replacement(match_.query_index);
                 let replacement_len = replacement.len();
                 let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);

                 if let Some(query_index) = replacement.next() {
                     let word_index = match_.word_index + padding as u16;
-                    let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
+                    let match_ = TmpMatch {
+                        query_index,
+                        word_index,
+                        ..match_.clone()
+                    };
                     padded_matches.push((*id, match_));
                 }

@@ -67,22 +67,30 @@ fn multiword_rewrite_matches(
                 // look ahead and if there already is a match
                 // corresponding to this padding word, abort the padding
                 'padding: for (x, next_group) in nexts.enumerate() {
-
                     for (i, query_index) in replacement.clone().enumerate().skip(x) {
                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
-                        let padmatch = TmpMatch { query_index, word_index, ..match_.clone() };
+                        let padmatch = TmpMatch {
+                            query_index,
+                            word_index,
+                            ..match_.clone()
+                        };

                         for (_, nmatch_) in next_group {
                             let mut rep = query_enhancer.replacement(nmatch_.query_index);
                             let query_index = rep.next().unwrap();
                             if query_index == padmatch.query_index {
-
                                 if !found {
                                     // if we find a corresponding padding for the
                                     // first time we must push preceding paddings
-                                    for (i, query_index) in replacement.clone().enumerate().take(i) {
-                                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
-                                        let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
+                                    for (i, query_index) in replacement.clone().enumerate().take(i)
+                                    {
+                                        let word_index =
+                                            match_.word_index + padding as u16 + (i + 1) as u16;
+                                        let match_ = TmpMatch {
+                                            query_index,
+                                            word_index,
+                                            ..match_.clone()
+                                        };
                                         padded_matches.push((*id, match_));
                                         biggest = biggest.max(i + 1);
                                     }
@@ -97,7 +105,7 @@ fn multiword_rewrite_matches(

                             // if we do not find a corresponding padding in the
                             // next groups so stop here and pad what was found
-                            break
+                            break;
                         }

                         if !found {
@@ -105,7 +113,11 @@ fn multiword_rewrite_matches(
                             // we must insert the entire padding
                             for (i, query_index) in replacement.enumerate() {
                                 let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
-                                let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
+                                let match_ = TmpMatch {
+                                    query_index,
+                                    word_index,
+                                    ..match_.clone()
+                                };
                                 padded_matches.push((*id, match_));
                             }

@@ -132,13 +144,17 @@ fn fetch_raw_documents(
     main_store: &store::Main,
     postings_lists_store: &store::PostingsLists,
     documents_fields_counts_store: &store::DocumentsFieldsCounts,
-) -> MResult<Vec<RawDocument>>
-{
+) -> MResult<Vec<RawDocument>> {
     let mut matches = Vec::new();
     let mut highlights = Vec::new();

     for automaton in automatons {
-        let Automaton { index, is_exact, query_len, .. } = automaton;
+        let Automaton {
+            index,
+            is_exact,
+            query_len,
+            ..
+        } = automaton;
         let dfa = automaton.dfa();

         let words = match main_store.words_fst(reader)? {
@@ -210,8 +226,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         postings_lists: store::PostingsLists,
         documents_fields_counts: store::DocumentsFieldsCounts,
         synonyms: store::Synonyms,
-    ) -> QueryBuilder<'c, 'f, 'd>
-    {
+    ) -> QueryBuilder<'c, 'f, 'd> {
         QueryBuilder::with_criteria(
             main,
             postings_lists,
@@ -227,8 +242,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         documents_fields_counts: store::DocumentsFieldsCounts,
         synonyms: store::Synonyms,
         criteria: Criteria<'c>,
-    ) -> QueryBuilder<'c, 'f, 'd>
-    {
+    ) -> QueryBuilder<'c, 'f, 'd> {
         QueryBuilder {
             criteria,
             searchable_attrs: None,
@@ -245,7 +259,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {

 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
     pub fn with_filter<F>(&mut self, function: F)
-    where F: Fn(DocumentId) -> bool + 'f,
+    where
+        F: Fn(DocumentId) -> bool + 'f,
     {
         self.filter = Some(Box::new(function))
     }
@@ -255,13 +270,16 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
     }

     pub fn with_distinct<F, K>(&mut self, function: F, size: usize)
-    where F: Fn(DocumentId) -> Option<u64> + 'd,
+    where
+        F: Fn(DocumentId) -> Option<u64> + 'd,
     {
         self.distinct = Some((Box::new(function), size))
     }

     pub fn add_searchable_attribute(&mut self, attribute: u16) {
-        let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new);
+        let reorders = self
+            .searchable_attrs
+            .get_or_insert_with(ReorderedAttrs::new);
         reorders.insert_attribute(attribute);
     }

@@ -270,41 +288,36 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         reader: &zlmdb::RoTxn,
         query: &str,
         range: Range<usize>,
-    ) -> MResult<Vec<Document>>
-    {
+    ) -> MResult<Vec<Document>> {
         match self.distinct {
-            Some((distinct, distinct_size)) => {
-                raw_query_with_distinct(
-                    reader,
-                    query,
-                    range,
-                    self.filter,
-                    distinct,
-                    distinct_size,
-                    self.timeout,
-                    self.criteria,
-                    self.searchable_attrs,
-                    self.main_store,
-                    self.postings_lists_store,
-                    self.documents_fields_counts_store,
-                    self.synonyms_store,
-                )
-            },
-            None => {
-                raw_query(
-                    reader,
-                    query,
-                    range,
-                    self.filter,
-                    self.timeout,
-                    self.criteria,
-                    self.searchable_attrs,
-                    self.main_store,
-                    self.postings_lists_store,
-                    self.documents_fields_counts_store,
-                    self.synonyms_store,
-                )
-            }
+            Some((distinct, distinct_size)) => raw_query_with_distinct(
+                reader,
+                query,
+                range,
+                self.filter,
+                distinct,
+                distinct_size,
+                self.timeout,
+                self.criteria,
+                self.searchable_attrs,
+                self.main_store,
+                self.postings_lists_store,
+                self.documents_fields_counts_store,
+                self.synonyms_store,
+            ),
+            None => raw_query(
+                reader,
+                query,
+                range,
+                self.filter,
+                self.timeout,
+                self.criteria,
+                self.searchable_attrs,
+                self.main_store,
+                self.postings_lists_store,
+                self.documents_fields_counts_store,
+                self.synonyms_store,
+            ),
         }
     }
 }
@@ -326,7 +339,8 @@ fn raw_query<'c, FI>(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
-where FI: Fn(DocumentId) -> bool,
+where
+    FI: Fn(DocumentId) -> bool,
 {
     // We delegate the filter work to the distinct query builder,
     // specifying a distinct rule that has no effect.
@@ -347,18 +361,14 @@ where FI: Fn(DocumentId) -> bool,
             postings_lists_store,
             documents_fields_counts_store,
             synonyms_store,
-        )
+        );
     }

     let start_processing = Instant::now();
     let mut raw_documents_processed = Vec::with_capacity(range.len());

-    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
-        reader,
-        query,
-        main_store,
-        synonyms_store,
-    )?;
+    let (automaton_producer, query_enhancer) =
+        AutomatonProducer::new(reader, query, main_store, synonyms_store)?;

     let mut automaton_producer = automaton_producer.into_iter();
     let mut automatons = Vec::new();
@@ -382,7 +392,7 @@ where FI: Fn(DocumentId) -> bool,
         // stop processing when time is running out
         if let Some(timeout) = timeout {
             if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
-                break
+                break;
             }
         }

@@ -409,20 +419,27 @@ where FI: Fn(DocumentId) -> bool,

                     // we have sort enough documents if the last document sorted is after
                     // the end of the requested range, we can continue to the next criterion
-                    if documents_seen >= range.end { continue 'criteria }
+                    if documents_seen >= range.end {
+                        continue 'criteria;
+                    }
                 }
             }
         }

         // once we classified the documents related to the current
         // automatons we save that as the next valid result
-        let iter = raw_documents.into_iter().skip(range.start).take(range.len());
+        let iter = raw_documents
+            .into_iter()
+            .skip(range.start)
+            .take(range.len());
         raw_documents_processed.clear();
         raw_documents_processed.extend(iter);

         // stop processing when time is running out
         if let Some(timeout) = timeout {
-            if start_processing.elapsed() > timeout { break }
+            if start_processing.elapsed() > timeout {
+                break;
+            }
         }
     }

@@ -456,18 +473,15 @@ fn raw_query_with_distinct<'c, FI, FD>(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
-where FI: Fn(DocumentId) -> bool,
-    FD: Fn(DocumentId) -> Option<u64>,
+where
+    FI: Fn(DocumentId) -> bool,
+    FD: Fn(DocumentId) -> Option<u64>,
 {
     let start_processing = Instant::now();
     let mut raw_documents_processed = Vec::new();

-    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
-        reader,
-        query,
-        main_store,
-        synonyms_store,
-    )?;
+    let (automaton_producer, query_enhancer) =
+        AutomatonProducer::new(reader, query, main_store, synonyms_store)?;

     let mut automaton_producer = automaton_producer.into_iter();
     let mut automatons = Vec::new();
@@ -491,7 +505,7 @@ where FI: Fn(DocumentId) -> bool,
         // stop processing when time is running out
         if let Some(timeout) = timeout {
            if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
-                break
+                break;
            }
        }

@@ -528,7 +542,7 @@ where FI: Fn(DocumentId) -> bool,
                 Some(filter) => {
                     let entry = filter_map.entry(document.id);
                     *entry.or_insert_with(|| (filter)(document.id))
-                },
+                }
                 None => true,
             };

@@ -543,7 +557,9 @@ where FI: Fn(DocumentId) -> bool,
                     }

                     // the requested range end is reached: stop computing distinct
-                    if buf_distinct.len() >= range.end { break }
+                    if buf_distinct.len() >= range.end {
+                        break;
+                    }
                 }

                 documents_seen += group.len();
@@ -558,7 +574,9 @@ where FI: Fn(DocumentId) -> bool,

                 // we have sort enough documents if the last document sorted is after
                 // the end of the requested range, we can continue to the next criterion
-                if buf_distinct.len() >= range.end { continue 'criteria }
+                if buf_distinct.len() >= range.end {
+                    continue 'criteria;
+                }
             }
         }
     }
@@ -583,14 +601,18 @@ where FI: Fn(DocumentId) -> bool,

                if distinct_accepted && seen.len() > range.start {
                    raw_documents_processed.push(document);
-                   if raw_documents_processed.len() == range.len() { break }
+                   if raw_documents_processed.len() == range.len() {
+                       break;
+                   }
                }
            }
        }

        // stop processing when time is running out
        if let Some(timeout) = timeout {
-           if start_processing.elapsed() > timeout { break }
+           if start_processing.elapsed() > timeout {
+               break;
+           }
        }
    }

@@ -611,20 +633,20 @@ mod tests {
     use std::collections::{BTreeSet, HashMap};
     use std::iter::FromIterator;

-    use fst::{Set, IntoStreamer};
+    use fst::{IntoStreamer, Set};
+    use meilidb_schema::SchemaAttr;
     use sdset::SetBuf;
     use tempfile::TempDir;
-    use meilidb_schema::SchemaAttr;

     use crate::automaton::normalize_str;
     use crate::database::Database;
-    use crate::DocIndex;
     use crate::store::Index;
+    use crate::DocIndex;

     fn set_from_stream<'f, I, S>(stream: I) -> Set
     where
-        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=&'a [u8]>,
-        S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>,
+        I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>,
+        S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>,
     {
         let mut builder = fst::SetBuilder::memory();
         builder.extend_stream(stream).unwrap();
@@ -687,14 +709,23 @@ mod tests {

            let word = word.to_lowercase();

-           let alternatives = match self.index.synonyms.synonyms(&writer, word.as_bytes()).unwrap() {
+           let alternatives = match self
+               .index
+               .synonyms
+               .synonyms(&writer, word.as_bytes())
+               .unwrap()
+           {
                Some(alternatives) => alternatives,
                None => fst::Set::default(),
            };

            let new = sdset_into_fstset(&new);
-           let new_alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union());
-           self.index.synonyms.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives).unwrap();
+           let new_alternatives =
+               set_from_stream(alternatives.op().add(new.into_stream()).r#union());
+           self.index
+               .synonyms
+               .put_synonyms(&mut writer, word.as_bytes(), &new_alternatives)
+               .unwrap();

            let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() {
                Some(synonyms) => synonyms,
@@ -702,14 +733,17 @@ mod tests {
            };

            let synonyms_fst = insert_key(&synonyms, word.as_bytes());
-           self.index.main.put_synonyms_fst(&mut writer, &synonyms_fst).unwrap();
+           self.index
+               .main
+               .put_synonyms_fst(&mut writer, &synonyms_fst)
+               .unwrap();

            writer.commit().unwrap();
        }
    }

    impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase {
-       fn from_iter<I: IntoIterator<Item=(&'a str, &'a [DocIndex])>>(iter: I) -> Self {
+       fn from_iter<I: IntoIterator<Item = (&'a str, &'a [DocIndex])>>(iter: I) -> Self {
            let tempdir = TempDir::new().unwrap();
            let database = Database::open_or_create(&tempdir).unwrap();
            let index = database.create_index("default").unwrap();
@@ -724,7 +758,10 @@ mod tests {
            for (word, indexes) in iter {
                let word = word.to_lowercase().into_bytes();
                words_fst.insert(word.clone());
-               postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
+               postings_lists
+                   .entry(word)
+                   .or_insert_with(Vec::new)
+                   .extend_from_slice(indexes);
                for idx in indexes {
                    fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
                }
@@ -736,31 +773,33 @@ mod tests {

            for (word, postings_list) in postings_lists {
                let postings_list = SetBuf::from_dirty(postings_list);
-               index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
+               index
+                   .postings_lists
+                   .put_postings_list(&mut writer, &word, &postings_list)
+                   .unwrap();
            }

            for ((docid, attr, _), count) in fields_counts {
-               let prev = index.documents_fields_counts
-                   .document_field_count(
-                       &mut writer,
-                       docid,
-                       SchemaAttr(attr),
-                   ).unwrap();
+               let prev = index
+                   .documents_fields_counts
+                   .document_field_count(&mut writer, docid, SchemaAttr(attr))
+                   .unwrap();

                let prev = prev.unwrap_or(0);

-               index.documents_fields_counts
-                   .put_document_field_count(
-                       &mut writer,
-                       docid,
-                       SchemaAttr(attr),
-                       prev + count,
-                   ).unwrap();
+               index
+                   .documents_fields_counts
+                   .put_document_field_count(&mut writer, docid, SchemaAttr(attr), prev + count)
+                   .unwrap();
            }

            writer.commit().unwrap();

-           TempDatabase { database, index, _tempdir: tempdir }
+           TempDatabase {
+               database,
+               index,
+               _tempdir: tempdir,
+           }
        }
    }

@@ -768,8 +807,8 @@ mod tests {
    fn simple() {
        let store = TempDatabase::from_iter(vec![
            ("iphone", &[doc_char_index(0, 0, 0)][..]),
            ("from", &[doc_char_index(0, 1, 1)][..]),
            ("apple", &[doc_char_index(0, 2, 2)][..]),
        ]);

        let env = &store.database.env;
@@ -791,9 +830,7 @@ mod tests {

    #[test]
    fn simple_synonyms() {
-       let mut store = TempDatabase::from_iter(vec![
-           ("hello", &[doc_index(0, 0)][..]),
-       ]);
+       let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);

        store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));

@@ -825,9 +862,7 @@ mod tests {

    #[test]
    fn prefix_synonyms() {
-       let mut store = TempDatabase::from_iter(vec![
-           ("hello", &[doc_index(0, 0)][..]),
-       ]);
+       let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);

        store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
        store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
@@ -872,9 +907,7 @@ mod tests {

    #[test]
    fn levenshtein_synonyms() {
-       let mut store = TempDatabase::from_iter(vec![
-           ("hello", &[doc_index(0, 0)][..]),
-       ]);
+       let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);

        store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));

@@ -907,9 +940,9 @@ mod tests {
    #[test]
    fn harder_synonyms() {
        let mut store = TempDatabase::from_iter(vec![
            ("hello", &[doc_index(0, 0)][..]),
            ("bonjour", &[doc_index(1, 3)]),
            ("salut", &[doc_index(2, 5)]),
        ]);

        store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"]));
@@ -987,17 +1020,22 @@ mod tests {
    /// Unique word has multi-word synonyms
    fn unique_to_multiword_synonyms() {
        let mut store = TempDatabase::from_iter(vec![
            ("new", &[doc_char_index(0, 0, 0)][..]),
            ("york", &[doc_char_index(0, 1, 1)][..]),
            ("city", &[doc_char_index(0, 2, 2)][..]),
            ("subway", &[doc_char_index(0, 3, 3)][..]),
-
            ("NY", &[doc_char_index(1, 0, 0)][..]),
            ("subway", &[doc_char_index(1, 1, 1)][..]),
        ]);

-       store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
-       store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
+       store.add_synonym(
+           "NY",
+           SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
+       );
+       store.add_synonym(
+           "NYC",
+           SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
+       );

        let env = &store.database.env;
        let reader = env.read_txn().unwrap();
@@ -1056,20 +1094,18 @@ mod tests {
    #[test]
    fn unique_to_multiword_synonyms_words_proximity() {
        let mut store = TempDatabase::from_iter(vec![
            ("new", &[doc_char_index(0, 0, 0)][..]),
            ("york", &[doc_char_index(0, 1, 1)][..]),
            ("city", &[doc_char_index(0, 2, 2)][..]),
            ("subway", &[doc_char_index(0, 3, 3)][..]),
-
            ("york", &[doc_char_index(1, 0, 0)][..]),
            ("new", &[doc_char_index(1, 1, 1)][..]),
            ("subway", &[doc_char_index(1, 2, 2)][..]),
-
            ("NY", &[doc_char_index(2, 0, 0)][..]),
            ("subway", &[doc_char_index(2, 1, 1)][..]),
        ]);

        store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"]));

        let env = &store.database.env;
        let reader = env.read_txn().unwrap();
@@ -1120,11 +1156,10 @@ mod tests {
    #[test]
    fn unique_to_multiword_synonyms_cumulative_word_index() {
        let mut store = TempDatabase::from_iter(vec![
            ("NY", &[doc_char_index(0, 0, 0)][..]),
            ("subway", &[doc_char_index(0, 1, 1)][..]),
-
            ("new", &[doc_char_index(1, 0, 0)][..]),
            ("york", &[doc_char_index(1, 1, 1)][..]),
            ("subway", &[doc_char_index(1, 2, 2)][..]),
        ]);

@@ -1175,20 +1210,25 @@ mod tests {
    /// Unique word has multi-word synonyms
    fn harder_unique_to_multiword_synonyms_one() {
        let mut store = TempDatabase::from_iter(vec![
            ("new", &[doc_char_index(0, 0, 0)][..]),
            ("york", &[doc_char_index(0, 1, 1)][..]),
            ("city", &[doc_char_index(0, 2, 2)][..]),
            ("yellow", &[doc_char_index(0, 3, 3)][..]),
            ("subway", &[doc_char_index(0, 4, 4)][..]),
            ("broken", &[doc_char_index(0, 5, 5)][..]),
-
            ("NY", &[doc_char_index(1, 0, 0)][..]),
            ("blue", &[doc_char_index(1, 1, 1)][..]),
            ("subway", &[doc_char_index(1, 2, 2)][..]),
        ]);

-       store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
-       store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
+       store.add_synonym(
+           "NY",
+           SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
+       );
+       store.add_synonym(
+           "NYC",
+           SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
+       );

        let env = &store.database.env;
        let reader = env.read_txn().unwrap();
@@ -1249,21 +1289,26 @@ mod tests {
    /// Unique word has multi-word synonyms
    fn even_harder_unique_to_multiword_synonyms() {
        let mut store = TempDatabase::from_iter(vec![
            ("new", &[doc_char_index(0, 0, 0)][..]),
            ("york", &[doc_char_index(0, 1, 1)][..]),
            ("city", &[doc_char_index(0, 2, 2)][..]),
            ("yellow", &[doc_char_index(0, 3, 3)][..]),
            ("underground", &[doc_char_index(0, 4, 4)][..]),
            ("train", &[doc_char_index(0, 5, 5)][..]),
            ("broken", &[doc_char_index(0, 6, 6)][..]),
-
            ("NY", &[doc_char_index(1, 0, 0)][..]),
            ("blue", &[doc_char_index(1, 1, 1)][..]),
            ("subway", &[doc_char_index(1, 2, 2)][..]),
        ]);

-       store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
-       store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
+       store.add_synonym(
+           "NY",
+           SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
+       );
+       store.add_synonym(
+           "NYC",
+           SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
+       );
        store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));

        let env = &store.database.env;
@@ -1330,30 +1375,36 @@ mod tests {
    /// Multi-word has multi-word synonyms
    fn multiword_to_multiword_synonyms() {
        let mut store = TempDatabase::from_iter(vec![
            ("NY", &[doc_char_index(0, 0, 0)][..]),
            ("subway", &[doc_char_index(0, 1, 1)][..]),
-
            ("NYC", &[doc_char_index(1, 0, 0)][..]),
            ("blue", &[doc_char_index(1, 1, 1)][..]),
            ("subway", &[doc_char_index(1, 2, 2)][..]),
            ("broken", &[doc_char_index(1, 3, 3)][..]),
-
            ("new", &[doc_char_index(2, 0, 0)][..]),
            ("york", &[doc_char_index(2, 1, 1)][..]),
            ("underground", &[doc_char_index(2, 2, 2)][..]),
            ("train", &[doc_char_index(2, 3, 3)][..]),
            ("broken", &[doc_char_index(2, 4, 4)][..]),
        ]);

-       store.add_synonym("new york", SetBuf::from_dirty(vec![ "NYC", "NY", "new york city" ]));
-       store.add_synonym("new york city", SetBuf::from_dirty(vec![ "NYC", "NY", "new york" ]));
-       store.add_synonym("underground train", SetBuf::from_dirty(vec![ "subway" ]));
+       store.add_synonym(
+           "new york",
+           SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]),
+       );
+       store.add_synonym(
+           "new york city",
+           SetBuf::from_dirty(vec!["NYC", "NY", "new york"]),
+       );
+       store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"]));

        let env = &store.database.env;
        let reader = env.read_txn().unwrap();

        let builder = store.query_builder();
-       let results = builder.query(&reader, "new york underground train broken", 0..20).unwrap();
+       let results = builder
+           .query(&reader, "new york underground train broken", 0..20)
+           .unwrap();
        let mut iter = results.into_iter();

        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
@@ -1390,7 +1441,9 @@ mod tests {
        assert_matches!(iter.next(), None);

        let builder = store.query_builder();
-       let results = builder.query(&reader, "new york city underground train broken", 0..20).unwrap();
+       let results = builder
+           .query(&reader, "new york city underground train broken", 0..20)
+           .unwrap();
        let mut iter = results.into_iter();

        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
@@ -1436,14 +1489,14 @@ mod tests {
    #[test]
    fn intercrossed_multiword_synonyms() {
        let mut store = TempDatabase::from_iter(vec![
            ("new", &[doc_index(0, 0)][..]),
            ("york", &[doc_index(0, 1)][..]),
            ("big", &[doc_index(0, 2)][..]),
            ("city", &[doc_index(0, 3)][..]),
        ]);

-       store.add_synonym("new york", SetBuf::from_dirty(vec![ "new york city" ]));
-       store.add_synonym("new york city", SetBuf::from_dirty(vec![ "new york" ]));
+       store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"]));
+       store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"]));

        let env = &store.database.env;
        let reader = env.read_txn().unwrap();
@@ -1469,16 +1522,14 @@ mod tests {
        assert_matches!(iter.next(), None);

        let mut store = TempDatabase::from_iter(vec![
            ("NY", &[doc_index(0, 0)][..]),
            ("city", &[doc_index(0, 1)][..]),
            ("subway", &[doc_index(0, 2)][..]),
-
            ("NY", &[doc_index(1, 0)][..]),
            ("subway", &[doc_index(1, 1)][..]),
-
            ("NY", &[doc_index(2, 0)][..]),
            ("york", &[doc_index(2, 1)][..]),
            ("city", &[doc_index(2, 2)][..]),
            ("subway", &[doc_index(2, 3)][..]),
        ]);

@@ -1525,20 +1576,22 @@ mod tests {
    #[test]
    fn cumulative_word_indices() {
        let mut store = TempDatabase::from_iter(vec![
            ("NYC", &[doc_index(0, 0)][..]),
            ("long", &[doc_index(0, 1)][..]),
            ("subway", &[doc_index(0, 2)][..]),
            ("cool", &[doc_index(0, 3)][..]),
        ]);

        store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"]));
        store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));

        let env = &store.database.env;
        let reader = env.read_txn().unwrap();

        let builder = store.query_builder();
-       let results = builder.query(&reader, "new york city long subway cool ", 0..20).unwrap();
+       let results = builder
+           .query(&reader, "new york city long subway cool ", 0..20)
+           .unwrap();
        let mut iter = results.into_iter();

        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
@@ -1560,8 +1613,7 @@ mod tests {
        let mut store = TempDatabase::from_iter(vec![
            ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
            ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex
-
            ("iphone", &[doc_index(1, 0)][..]),
        ]);

        store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"]));
@@ -1624,8 +1676,8 @@ mod tests {
    #[test]
    fn simple_concatenation() {
        let store = TempDatabase::from_iter(vec![
            ("iphone", &[doc_index(0, 0)][..]),
            ("case", &[doc_index(0, 1)][..]),
        ]);

        let env = &store.database.env;
@ -2,12 +2,11 @@ use std::io::{Read, Write};
|
|||||||
|
|
||||||
use hashbrown::HashMap;
|
use hashbrown::HashMap;
|
||||||
use meilidb_schema::SchemaAttr;
|
use meilidb_schema::SchemaAttr;
|
||||||
use serde::{Serialize, Deserialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::{DocumentId, Number};
|
use crate::{DocumentId, Number};
|
||||||
|
|
||||||
#[derive(Debug, Default, Clone, PartialEq, Eq)]
|
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
#[serde(transparent)]
|
#[serde(transparent)]
|
||||||
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
|
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
|
||||||
|
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
use meilidb_schema::SchemaAttr;
|
use meilidb_schema::SchemaAttr;
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::{TmpMatch, DocumentId, Highlight};
|
use crate::{DocumentId, Highlight, TmpMatch};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct RawDocument {
|
pub struct RawDocument {
|
||||||
@ -20,7 +20,13 @@ impl RawDocument {
|
|||||||
let r = self.matches.range;
|
let r = self.matches.range;
|
||||||
// it is safe because construction/modifications
|
// it is safe because construction/modifications
|
||||||
// can only be done in this module
|
// can only be done in this module
|
||||||
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
|
unsafe {
|
||||||
|
&self
|
||||||
|
.matches
|
||||||
|
.matches
|
||||||
|
.query_index
|
||||||
|
.get_unchecked(r.start..r.end)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn distance(&self) -> &[u8] {
|
pub fn distance(&self) -> &[u8] {
|
||||||
@ -41,7 +47,13 @@ impl RawDocument {
|
|||||||
let r = self.matches.range;
|
let r = self.matches.range;
|
||||||
// it is safe because construction/modifications
|
// it is safe because construction/modifications
|
||||||
// can only be done in this module
|
// can only be done in this module
|
||||||
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
|
unsafe {
|
||||||
|
&self
|
||||||
|
.matches
|
||||||
|
.matches
|
||||||
|
.word_index
|
||||||
|
.get_unchecked(r.start..r.end)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_exact(&self) -> &[bool] {
|
pub fn is_exact(&self) -> &[bool] {
|
||||||
@ -55,12 +67,32 @@ impl RawDocument {
|
|||||||
impl fmt::Debug for RawDocument {
|
impl fmt::Debug for RawDocument {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
f.write_str("RawDocument {\r\n")?;
|
f.write_str("RawDocument {\r\n")?;
|
||||||
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
|
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
|
||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
|
f.write_fmt(format_args!(
|
||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
|
"{:>15}: {:^5?},\r\n",
|
||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
|
"query_index",
|
||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
|
self.query_index()
|
||||||
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
|
))?;
|
||||||
|
f.write_fmt(format_args!(
|
||||||
|
"{:>15}: {:^5?},\r\n",
|
||||||
|
"distance",
|
||||||
|
self.distance()
|
||||||
|
))?;
|
||||||
|
f.write_fmt(format_args!(
|
||||||
|
"{:>15}: {:^5?},\r\n",
|
||||||
|
"attribute",
|
||||||
|
self.attribute()
|
||||||
|
))?;
|
||||||
|
f.write_fmt(format_args!(
|
||||||
|
"{:>15}: {:^5?},\r\n",
|
||||||
|
"word_index",
|
||||||
|
self.word_index()
|
||||||
|
))?;
|
||||||
|
f.write_fmt(format_args!(
|
||||||
|
"{:>15}: {:^5?},\r\n",
|
||||||
|
"is_exact",
|
||||||
|
self.is_exact()
|
||||||
|
))?;
|
||||||
f.write_str("}")?;
|
f.write_str("}")?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -70,8 +102,7 @@ pub fn raw_documents_from(
|
|||||||
matches: SetBuf<(DocumentId, TmpMatch)>,
|
matches: SetBuf<(DocumentId, TmpMatch)>,
|
||||||
highlights: SetBuf<(DocumentId, Highlight)>,
|
highlights: SetBuf<(DocumentId, Highlight)>,
|
||||||
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
|
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
|
||||||
) -> Vec<RawDocument>
|
) -> Vec<RawDocument> {
|
||||||
{
|
|
||||||
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
|
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
|
||||||
let mut matches2 = Matches::with_capacity(matches.len());
|
let mut matches2 = Matches::with_capacity(matches.len());
|
||||||
|
|
||||||
@ -94,10 +125,21 @@ pub fn raw_documents_from(
|
|||||||
}
|
}
|
||||||
|
|
||||||
let matches = Arc::new(matches2);
|
let matches = Arc::new(matches2);
|
||||||
docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
|
docs_ranges
|
||||||
let matches = SharedMatches { range, matches: matches.clone() };
|
.into_iter()
|
||||||
RawDocument { id, matches, highlights, fields_counts }
|
.map(|(id, range, highlights, fields_counts)| {
|
||||||
}).collect()
|
let matches = SharedMatches {
|
||||||
|
range,
|
||||||
|
matches: matches.clone(),
|
||||||
|
};
|
||||||
|
RawDocument {
|
||||||
|
id,
|
||||||
|
matches,
|
||||||
|
highlights,
|
||||||
|
fields_counts,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone)]
|
#[derive(Debug, Copy, Clone)]
|
||||||
|
@@ -1,10 +1,10 @@
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;

use crate::{DocIndex, DocumentId};
use deunicode::deunicode_with_tofu;
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;

type Word = Vec<u8>; // TODO make it be a SmallVec

@@ -60,7 +60,9 @@ impl RawIndexer {
                &mut self.docs_words,
            );

            if !must_continue {
                break;
            }

            number_of_words += 1;
        }

@@ -70,8 +72,9 @@ impl RawIndexer {
    }

    pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
    where
        I: IntoIterator<Item = &'a str, IntoIter = IT>,
        IT: Iterator<Item = &'a str> + Clone,
    {
        // TODO serialize this to one call to the SeqTokenizer loop

@@ -88,14 +91,25 @@ impl RawIndexer {
                &mut self.docs_words,
            );

            if !must_continue {
                break;
            }
        }

        let deunicoded: Vec<_> = lowercased
            .into_iter()
            .map(|lowercase_text| {
                if lowercase_text.contains(is_cjk) {
                    return lowercase_text;
                }
                let deunicoded = deunicode_with_tofu(&lowercase_text, "");
                if lowercase_text != deunicoded {
                    deunicoded
                } else {
                    lowercase_text
                }
            })
            .collect();
        let iter = deunicoded.iter().map(|t| t.as_str());

        for token in SeqTokenizer::new(iter) {

@@ -108,17 +122,21 @@ impl RawIndexer {
                &mut self.docs_words,
            );

            if !must_continue {
                break;
            }
        }
    }

    pub fn build(self) -> Indexed {
        let words_doc_indexes = self
            .words_doc_indexes
            .into_iter()
            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
            .collect();

        let docs_words = self
            .docs_words
            .into_iter()
            .map(|(id, mut words)| {
                words.sort_unstable();

@@ -127,7 +145,10 @@ impl RawIndexer {
            })
            .collect();

        Indexed {
            words_doc_indexes,
            docs_words,
        }
    }
}

@@ -138,16 +159,20 @@ fn index_token(
    word_limit: usize,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool {
    if token.word_index >= word_limit {
        return false;
    }

    match token_to_docindex(id, attr, token) {
        Some(docindex) => {
            let word = Vec::from(token.word);
            words_doc_indexes
                .entry(word.clone())
                .or_insert_with(Vec::new)
                .push(docindex);
            docs_words.entry(id).or_insert_with(Vec::new).push(word);
        }
        None => return false,
    }

@@ -183,7 +208,9 @@ mod tests {
        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
        indexer.index_text(docid, attr, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());

@@ -191,7 +218,9 @@ mod tests {
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());

        // with the ugly apostrophe...
        assert!(words_doc_indexes
            .get(&"l’éteindre".to_owned().into_bytes())
            .is_some());
    }

    #[test]

@@ -203,7 +232,9 @@ mod tests {
        let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
        indexer.index_text_seq(docid, attr, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());

@@ -211,6 +242,8 @@ mod tests {
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());

        // with the ugly apostrophe...
        assert!(words_doc_indexes
            .get(&"l’éteindre".to_owned().into_bytes())
            .is_some());
    }
}
@@ -6,7 +6,10 @@ pub struct ReorderedAttrs {

impl ReorderedAttrs {
    pub fn new() -> ReorderedAttrs {
        ReorderedAttrs {
            count: 0,
            reorders: Vec::new(),
        }
    }

    pub fn insert_attribute(&mut self, attribute: u16) {
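Aside: two mechanical rewrites dominate the hunk above and the serializer hunks that follow, the struct-literal argument broken over several lines and the `where` clause with each bound on its own line. A hypothetical, compilable sketch of both (the `SketchError` type and `reject` function are not from MeiliDB):

#[derive(Debug)]
enum SketchError {
    Unsupported { type_name: &'static str },
}

fn reject<T>(_value: &T) -> Result<(), SketchError>
where
    T: std::fmt::Debug,
{
    // Mirrors the style applied below: one field per line, trailing comma.
    Err(SketchError::Unsupported {
        type_name: "example",
    })
}

fn main() {
    println!("{:?}", reject(&42));
}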
@@ -77,13 +77,18 @@ impl ser::Serializer for ConvertToNumber {
    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnrankableType {
            type_name: "Option",
        })
    }

    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        Err(SerializerError::UnrankableType {
            type_name: "Option",
        })
    }

    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {

@@ -91,25 +96,29 @@ impl ser::Serializer for ConvertToNumber {
    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnrankableType {
            type_name: "unit struct",
        })
    }

    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
    ) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnrankableType {
            type_name: "unit variant",
        })
    }

    fn serialize_newtype_struct<T: ?Sized>(
        self,
        _name: &'static str,
        value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        value.serialize(self)
    }

@@ -119,15 +128,20 @@ impl ser::Serializer for ConvertToNumber {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        Err(SerializerError::UnrankableType {
            type_name: "newtype variant",
        })
    }

    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
        Err(SerializerError::UnrankableType {
            type_name: "sequence",
        })
    }

@@ -137,10 +151,11 @@ impl ser::Serializer for ConvertToNumber {
    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
        Err(SerializerError::UnrankableType {
            type_name: "tuple struct",
        })
    }

@@ -148,10 +163,11 @@ impl ser::Serializer for ConvertToNumber {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
        Err(SerializerError::UnrankableType {
            type_name: "tuple variant",
        })
    }

@@ -161,10 +177,11 @@ impl ser::Serializer for ConvertToNumber {
    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStruct, Self::Error> {
        Err(SerializerError::UnrankableType {
            type_name: "struct",
        })
    }

@@ -172,9 +189,10 @@ impl ser::Serializer for ConvertToNumber {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStructVariant, Self::Error> {
        Err(SerializerError::UnrankableType {
            type_name: "struct variant",
        })
    }
}
@@ -1,5 +1,5 @@
use serde::ser;
use serde::Serialize;

use super::SerializerError;

@@ -17,7 +17,9 @@ impl ser::Serializer for ConvertToString {
    type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;

    fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "boolean",
        })
    }

    fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {

@@ -73,13 +75,18 @@ impl ser::Serializer for ConvertToString {
    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "Option",
        })
    }

    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        Err(SerializerError::UnserializableType {
            type_name: "Option",
        })
    }

    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {

@@ -87,25 +94,29 @@ impl ser::Serializer for ConvertToString {
    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "unit struct",
        })
    }

    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
    ) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "unit variant",
        })
    }

    fn serialize_newtype_struct<T: ?Sized>(
        self,
        _name: &'static str,
        value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        value.serialize(self)
    }

@@ -115,15 +126,20 @@ impl ser::Serializer for ConvertToString {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        Err(SerializerError::UnserializableType {
            type_name: "newtype variant",
        })
    }

    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "sequence",
        })
    }

@@ -133,10 +149,11 @@ impl ser::Serializer for ConvertToString {
    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "tuple struct",
        })
    }

@@ -144,10 +161,11 @@ impl ser::Serializer for ConvertToString {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "tuple variant",
        })
    }

@@ -157,10 +175,11 @@ impl ser::Serializer for ConvertToString {
    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStruct, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "struct",
        })
    }

@@ -168,9 +187,10 @@ impl ser::Serializer for ConvertToString {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStructVariant, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "struct variant",
        })
    }
}
@@ -1,12 +1,12 @@
use std::collections::HashSet;
use std::io::Cursor;
use std::{error::Error, fmt};

use meilidb_schema::{Schema, SchemaAttr};
use serde::{de, forward_to_deserialize_any};
use serde_json::de::IoRead as SerdeJsonIoRead;
use serde_json::Deserializer as SerdeJsonDeserializer;
use serde_json::Error as SerdeJsonError;

use crate::store::DocumentsFields;
use crate::DocumentId;

@@ -60,7 +60,8 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
    type Error = DeserializerError;

    fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: de::Visitor<'de>,
    {
        self.deserialize_map(visitor)
    }

@@ -72,16 +73,21 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
    fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: de::Visitor<'de>,
    {
        let mut error = None;

        let iter = self
            .documents_fields
            .document_fields(self.reader, self.document_id)?
            .filter_map(|result| {
                let (attr, value) = match result {
                    Ok(value) => value,
                    Err(e) => {
                        error = Some(e);
                        return None;
                    }
                };

                let is_displayed = self.schema.props(attr).is_displayed();

@@ -99,7 +105,9 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
        });

        let map_deserializer = de::value::MapDeserializer::new(iter);
        let result = visitor
            .visit_map(map_deserializer)
            .map_err(DeserializerError::from);

        match error.take() {
            Some(error) => Err(error.into()),

@@ -122,7 +130,8 @@ impl<'de> de::Deserializer<'de> for Value {
    type Error = SerdeJsonError;

    fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: de::Visitor<'de>,
    {
        self.0.deserialize_any(visitor)
    }
@@ -5,13 +5,14 @@ use serde::{ser, Serialize};
use serde_json::Value;
use siphasher::sip::SipHasher;

use super::{ConvertToString, SerializerError};

pub fn extract_document_id<D>(
    identifier: &str,
    document: &D,
) -> Result<Option<DocumentId>, SerializerError>
where
    D: serde::Serialize,
{
    let serializer = ExtractDocumentId { identifier };
    document.serialize(serializer)

@@ -77,13 +78,18 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "Option",
        })
    }

    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        Err(SerializerError::UnserializableType {
            type_name: "Option",
        })
    }

    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {

@@ -91,25 +97,29 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "unit struct",
        })
    }

    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
    ) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "unit variant",
        })
    }

    fn serialize_newtype_struct<T: ?Sized>(
        self,
        _name: &'static str,
        value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        value.serialize(self)
    }

@@ -119,15 +129,20 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        Err(SerializerError::UnserializableType {
            type_name: "newtype variant",
        })
    }

    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "sequence",
        })
    }

@@ -137,10 +152,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "tuple struct",
        })
    }

@@ -148,10 +164,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "tuple variant",
        })
    }

@@ -167,9 +184,8 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStruct, Self::Error> {
        let serializer = ExtractDocumentIdStructSerializer {
            identifier: self.identifier,
            document_id: None,

@@ -183,10 +199,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStructVariant, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "struct variant",
        })
    }
}

@@ -201,7 +218,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
    type Error = SerializerError;

    fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
    where
        T: Serialize,
    {
        let key = key.serialize(ConvertToString)?;
        self.current_key_name = Some(key);

@@ -209,7 +227,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
    fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
    where
        T: Serialize,
    {
        let key = self.current_key_name.take().unwrap();
        self.serialize_entry(&key, value)

@@ -218,9 +237,11 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
    fn serialize_entry<K: ?Sized, V: ?Sized>(
        &mut self,
        key: &K,
        value: &V,
    ) -> Result<(), Self::Error>
    where
        K: Serialize,
        V: Serialize,
    {
        let key = key.serialize(ConvertToString)?;

@@ -252,9 +273,10 @@ impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
    fn serialize_field<T: ?Sized>(
        &mut self,
        key: &'static str,
        value: &T,
    ) -> Result<(), Self::Error>
    where
        T: Serialize,
    {
        if self.identifier == key {
            let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
@@ -2,9 +2,9 @@ use meilidb_schema::SchemaAttr;
use serde::ser;
use serde::Serialize;

use super::{ConvertToString, SerializerError};
use crate::raw_indexer::RawIndexer;
use crate::DocumentId;

pub struct Indexer<'a> {
    pub attribute: SchemaAttr,

@@ -24,7 +24,9 @@ impl<'a> ser::Serializer for Indexer<'a> {
    type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;

    fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnindexableType {
            type_name: "boolean",
        })
    }

    fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {

@@ -83,7 +85,9 @@ impl<'a> ser::Serializer for Indexer<'a> {
    fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
        let number_of_words = self
            .indexer
            .index_text(self.document_id, self.attribute, text);
        Ok(Some(number_of_words))
    }

@@ -92,14 +96,19 @@ impl<'a> ser::Serializer for Indexer<'a> {
    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnindexableType {
            type_name: "Option",
        })
    }

    fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
    where
        T: ser::Serialize,
    {
        let text = value.serialize(ConvertToString)?;
        let number_of_words = self
            .indexer
            .index_text(self.document_id, self.attribute, &text);
        Ok(Some(number_of_words))
    }

@@ -108,25 +117,29 @@ impl<'a> ser::Serializer for Indexer<'a> {
    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnindexableType {
            type_name: "unit struct",
        })
    }

    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
    ) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnindexableType {
            type_name: "unit variant",
        })
    }

    fn serialize_newtype_struct<T: ?Sized>(
        self,
        _name: &'static str,
        value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: ser::Serialize,
    {
        value.serialize(self)
    }

@@ -136,11 +149,14 @@ impl<'a> ser::Serializer for Indexer<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: ser::Serialize,
    {
        Err(SerializerError::UnindexableType {
            type_name: "newtype variant",
        })
    }

    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {

@@ -168,10 +184,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
        Err(SerializerError::UnindexableType {
            type_name: "tuple struct",
        })
    }

@@ -179,10 +196,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
        Err(SerializerError::UnindexableType {
            type_name: "tuple variant",
        })
    }

@@ -199,10 +217,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStruct, Self::Error> {
        Err(SerializerError::UnindexableType {
            type_name: "struct",
        })
    }

@@ -210,10 +229,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStructVariant, Self::Error> {
        Err(SerializerError::UnindexableType {
            type_name: "struct variant",
        })
    }
}

@@ -229,7 +249,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
    type Error = SerializerError;

    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
    where
        T: ser::Serialize,
    {
        let text = value.serialize(ConvertToString)?;
        self.texts.push(text);

@@ -238,7 +259,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
    fn end(self) -> Result<Self::Ok, Self::Error> {
        let texts = self.texts.iter().map(String::as_str);
        self.indexer
            .index_text_seq(self.document_id, self.attribute, texts);
        Ok(None)
    }
}

@@ -255,7 +277,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
    type Error = SerializerError;

    fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
    where
        T: ser::Serialize,
    {
        let text = key.serialize(ConvertToString)?;
        self.texts.push(text);

@@ -263,7 +286,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
    fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
    where
        T: ser::Serialize,
    {
        let text = value.serialize(ConvertToString)?;
        self.texts.push(text);

@@ -272,7 +296,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
    fn end(self) -> Result<Self::Ok, Self::Error> {
        let texts = self.texts.iter().map(String::as_str);
        self.indexer
            .index_text_seq(self.document_id, self.attribute, texts);
        Ok(None)
    }
}

@@ -293,7 +318,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
        key: &'static str,
        value: &T,
    ) -> Result<(), Self::Error>
    where
        T: ser::Serialize,
    {
        let key_text = key.to_owned();
        let value_text = value.serialize(ConvertToString)?;

@@ -304,7 +330,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
    fn end(self) -> Result<Self::Ok, Self::Error> {
        let texts = self.texts.iter().map(String::as_str);
        self.indexer
            .index_text_seq(self.document_id, self.attribute, texts);
        Ok(None)
    }
}

@@ -321,7 +348,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
    type Error = SerializerError;

    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
    where
        T: Serialize,
    {
        let text = value.serialize(ConvertToString)?;
        self.texts.push(text);

@@ -330,7 +358,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
    fn end(self) -> Result<Self::Ok, Self::Error> {
        let texts = self.texts.iter().map(String::as_str);
        self.indexer
            .index_text_seq(self.document_id, self.attribute, texts);
        Ok(None)
    }
}
@@ -15,19 +15,19 @@ mod extract_document_id;
mod indexer;
mod serializer;

pub use self::convert_to_number::ConvertToNumber;
pub use self::convert_to_string::ConvertToString;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;

use std::collections::BTreeMap;
use std::{error::Error, fmt};

use meilidb_schema::SchemaAttr;
use serde::ser;
use serde_json::Error as SerdeJsonError;

use crate::{DocumentId, ParseNumberError};

@@ -55,24 +55,24 @@ impl fmt::Display for SerializerError {
        match self {
            SerializerError::DocumentIdNotFound => {
                f.write_str("serialized document does not have an id according to the schema")
            }
            SerializerError::InvalidDocumentIdType => {
                f.write_str("document identifier can only be of type string or number")
            }
            SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e),
            SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
            SerializerError::ParseNumber(e) => {
                write!(f, "error while trying to parse a number: {}", e)
            }
            SerializerError::UnserializableType { type_name } => {
                write!(f, "{} is not a serializable type", type_name)
            }
            SerializerError::UnindexableType { type_name } => {
                write!(f, "{} is not an indexable type", type_name)
            }
            SerializerError::UnrankableType { type_name } => {
                write!(f, "{} types can not be used for ranking", type_name)
            }
            SerializerError::Custom(s) => f.write_str(s),
        }
    }
}
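Aside: the `Display` hunk above also shows rustfmt dropping the trailing comma after match arms whose body is a block. A small self-contained illustration (the `Token` type is hypothetical, not from this repository):

use std::fmt;

enum Token {
    Word(String),
    Separator,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // A block-bodied arm no longer carries a trailing comma after `}`.
            Token::Word(word) => {
                write!(f, "word: {}", word)
            }
            Token::Separator => f.write_str("separator"),
        }
    }
}

fn main() {
    println!("{}", Token::Word("meili".to_owned()));
    println!("{}", Token::Separator);
}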
@@ -1,12 +1,12 @@
use meilidb_schema::{Schema, SchemaAttr};
use serde::ser;
use std::collections::HashMap;

use crate::raw_indexer::RawIndexer;
use crate::serde::RamDocumentStore;
use crate::{DocumentId, RankedMap};

use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};

pub struct Serializer<'a> {
    pub schema: &'a Schema,

@@ -55,13 +55,18 @@ impl<'a> ser::Serializer for Serializer<'a> {
    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "Option",
        })
    }

    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
    where
        T: ser::Serialize,
    {
        Err(SerializerError::UnserializableType {
            type_name: "Option",
        })
    }

    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {

@@ -69,25 +74,29 @@ impl<'a> ser::Serializer for Serializer<'a> {
    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "unit struct",
        })
    }

    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
    ) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "unit variant",
        })
    }

    fn serialize_newtype_struct<T: ?Sized>(
        self,
        _name: &'static str,
        value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: ser::Serialize,
    {
        value.serialize(self)
    }

@@ -97,15 +106,20 @@ impl<'a> ser::Serializer for Serializer<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: ser::Serialize,
    {
        Err(SerializerError::UnserializableType {
            type_name: "newtype variant",
        })
    }

    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "sequence",
        })
    }

@@ -115,10 +129,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "tuple struct",
        })
    }

@@ -126,10 +141,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "tuple variant",
        })
    }

@@ -147,9 +163,8 @@ impl<'a> ser::Serializer for Serializer<'a> {
    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStruct, Self::Error> {
        Ok(StructSerializer {
            schema: self.schema,
            document_id: self.document_id,

@@ -165,10 +180,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStructVariant, Self::Error> {
        Err(SerializerError::UnserializableType {
            type_name: "struct variant",
        })
    }
}

@@ -187,7 +203,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
    type Error = SerializerError;

    fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
    where
        T: ser::Serialize,
    {
        let key = key.serialize(ConvertToString)?;
        self.current_key_name = Some(key);

@@ -195,7 +212,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
    fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
    where
        T: ser::Serialize,
    {
        let key = self.current_key_name.take().unwrap();
        self.serialize_entry(&key, value)

@@ -206,7 +224,9 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
        key: &K,
        value: &V,
    ) -> Result<(), Self::Error>
    where
        K: ser::Serialize,
        V: ser::Serialize,
    {
        let key = key.serialize(ConvertToString)?;

@@ -245,7 +265,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
        key: &'static str,
        value: &T,
    ) -> Result<(), Self::Error>
    where
        T: ser::Serialize,
    {
        serialize_value(
            self.schema,

@@ -274,7 +295,8 @@ fn serialize_value<T: ?Sized>(
    key: &str,
    value: &T,
) -> Result<(), SerializerError>
where
    T: ser::Serialize,
{
    if let Some(attribute) = schema.attribute(key) {
        let props = schema.props(attribute);

@@ -283,7 +305,11 @@ where T: ser::Serialize,
        document_store.set_document_field(document_id, attribute, serialized);

        if props.is_indexed() {
            let indexer = Indexer {
                attribute,
                indexer,
                document_id,
            };
            if let Some(number_of_words) = value.serialize(indexer)? {
                documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
            }
        }
@@ -1,8 +1,8 @@
-use std::sync::Arc;
-use zlmdb::types::{OwnedType, ByteSlice};
-use zlmdb::Result as ZResult;
-use crate::DocumentId;
 use super::BEU64;
+use crate::DocumentId;
+use std::sync::Arc;
+use zlmdb::types::{ByteSlice, OwnedType};
+use zlmdb::Result as ZResult;
 
 #[derive(Copy, Clone)]
 pub struct DocsWords {
@@ -15,8 +15,7 @@ impl DocsWords {
         writer: &mut zlmdb::RwTxn,
         document_id: DocumentId,
         words: &fst::Set,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         let document_id = BEU64::new(document_id.0);
         let bytes = words.as_fst().as_bytes();
         self.docs_words.put(writer, &document_id, bytes)
@@ -26,8 +25,7 @@ impl DocsWords {
         &self,
         writer: &mut zlmdb::RwTxn,
         document_id: DocumentId,
-    ) -> ZResult<bool>
-    {
+    ) -> ZResult<bool> {
         let document_id = BEU64::new(document_id.0);
         self.docs_words.delete(writer, &document_id)
     }
@@ -36,8 +34,7 @@ impl DocsWords {
         &self,
         reader: &zlmdb::RoTxn,
         document_id: DocumentId,
-    ) -> ZResult<Option<fst::Set>>
-    {
+    ) -> ZResult<Option<fst::Set>> {
         let document_id = BEU64::new(document_id.0);
         match self.docs_words.get(reader, &document_id)? {
             Some(bytes) => {
@@ -45,7 +42,7 @@ impl DocsWords {
                 let bytes = Arc::from(bytes);
                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
                 Ok(Some(fst::Set::from(fst)))
-            },
+            }
             None => Ok(None),
         }
     }
@@ -1,9 +1,9 @@
 use meilidb_schema::SchemaAttr;
-use zlmdb::types::{OwnedType, ByteSlice};
+use zlmdb::types::{ByteSlice, OwnedType};
 use zlmdb::Result as ZResult;
 
-use crate::DocumentId;
 use super::DocumentAttrKey;
+use crate::DocumentId;
 
 #[derive(Copy, Clone)]
 pub struct DocumentsFields {
@@ -17,8 +17,7 @@ impl DocumentsFields {
         document_id: DocumentId,
         attribute: SchemaAttr,
         value: &[u8],
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         let key = DocumentAttrKey::new(document_id, attribute);
         self.documents_fields.put(writer, &key, value)
     }
@@ -27,8 +26,7 @@ impl DocumentsFields {
         &self,
         writer: &mut zlmdb::RwTxn,
         document_id: DocumentId,
-    ) -> ZResult<usize>
-    {
+    ) -> ZResult<usize> {
         let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
         let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
         self.documents_fields.delete_range(writer, start..=end)
@@ -39,8 +37,7 @@ impl DocumentsFields {
         reader: &'txn zlmdb::RoTxn,
         document_id: DocumentId,
         attribute: SchemaAttr,
-    ) -> ZResult<Option<&'txn [u8]>>
-    {
+    ) -> ZResult<Option<&'txn [u8]>> {
         let key = DocumentAttrKey::new(document_id, attribute);
         self.documents_fields.get(reader, &key)
     }
@@ -49,8 +46,7 @@ impl DocumentsFields {
         &self,
         reader: &'txn zlmdb::RoTxn,
         document_id: DocumentId,
-    ) -> ZResult<DocumentFieldsIter<'txn>>
-    {
+    ) -> ZResult<DocumentFieldsIter<'txn>> {
         let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
         let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
         let iter = self.documents_fields.range(reader, start..=end)?;
@@ -70,7 +66,7 @@ impl<'txn> Iterator for DocumentFieldsIter<'txn> {
             Some(Ok((key, bytes))) => {
                 let attr = SchemaAttr(key.attr.get());
                 Some(Ok((attr, bytes)))
-            },
+            }
             Some(Err(e)) => Some(Err(e.into())),
             None => None,
         }
|
@ -1,8 +1,8 @@
|
|||||||
|
use super::DocumentAttrKey;
|
||||||
|
use crate::DocumentId;
|
||||||
use meilidb_schema::SchemaAttr;
|
use meilidb_schema::SchemaAttr;
|
||||||
use zlmdb::types::OwnedType;
|
use zlmdb::types::OwnedType;
|
||||||
use zlmdb::Result as ZResult;
|
use zlmdb::Result as ZResult;
|
||||||
use crate::DocumentId;
|
|
||||||
use super::DocumentAttrKey;
|
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
pub struct DocumentsFieldsCounts {
|
pub struct DocumentsFieldsCounts {
|
||||||
@ -16,8 +16,7 @@ impl DocumentsFieldsCounts {
|
|||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
attribute: SchemaAttr,
|
attribute: SchemaAttr,
|
||||||
value: u64,
|
value: u64,
|
||||||
) -> ZResult<()>
|
) -> ZResult<()> {
|
||||||
{
|
|
||||||
let key = DocumentAttrKey::new(document_id, attribute);
|
let key = DocumentAttrKey::new(document_id, attribute);
|
||||||
self.documents_fields_counts.put(writer, &key, &value)
|
self.documents_fields_counts.put(writer, &key, &value)
|
||||||
}
|
}
|
||||||
@ -26,11 +25,11 @@ impl DocumentsFieldsCounts {
|
|||||||
&self,
|
&self,
|
||||||
writer: &mut zlmdb::RwTxn,
|
writer: &mut zlmdb::RwTxn,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
) -> ZResult<usize>
|
) -> ZResult<usize> {
|
||||||
{
|
|
||||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||||
self.documents_fields_counts.delete_range(writer, start..=end)
|
self.documents_fields_counts
|
||||||
|
.delete_range(writer, start..=end)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn document_field_count(
|
pub fn document_field_count(
|
||||||
@ -38,8 +37,7 @@ impl DocumentsFieldsCounts {
|
|||||||
reader: &zlmdb::RoTxn,
|
reader: &zlmdb::RoTxn,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
attribute: SchemaAttr,
|
attribute: SchemaAttr,
|
||||||
) -> ZResult<Option<u64>>
|
) -> ZResult<Option<u64>> {
|
||||||
{
|
|
||||||
let key = DocumentAttrKey::new(document_id, attribute);
|
let key = DocumentAttrKey::new(document_id, attribute);
|
||||||
match self.documents_fields_counts.get(reader, &key)? {
|
match self.documents_fields_counts.get(reader, &key)? {
|
||||||
Some(count) => Ok(Some(count)),
|
Some(count) => Ok(Some(count)),
|
||||||
@ -51,8 +49,7 @@ impl DocumentsFieldsCounts {
|
|||||||
&self,
|
&self,
|
||||||
reader: &'txn zlmdb::RoTxn,
|
reader: &'txn zlmdb::RoTxn,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
) -> ZResult<DocumentFieldsCountsIter<'txn>>
|
) -> ZResult<DocumentFieldsCountsIter<'txn>> {
|
||||||
{
|
|
||||||
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
|
||||||
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
|
||||||
let iter = self.documents_fields_counts.range(reader, start..=end)?;
|
let iter = self.documents_fields_counts.range(reader, start..=end)?;
|
||||||
@ -62,17 +59,18 @@ impl DocumentsFieldsCounts {
|
|||||||
pub fn documents_ids<'txn>(
|
pub fn documents_ids<'txn>(
|
||||||
&self,
|
&self,
|
||||||
reader: &'txn zlmdb::RoTxn,
|
reader: &'txn zlmdb::RoTxn,
|
||||||
) -> ZResult<DocumentsIdsIter<'txn>>
|
) -> ZResult<DocumentsIdsIter<'txn>> {
|
||||||
{
|
|
||||||
let iter = self.documents_fields_counts.iter(reader)?;
|
let iter = self.documents_fields_counts.iter(reader)?;
|
||||||
Ok(DocumentsIdsIter { last_seen_id: None, iter })
|
Ok(DocumentsIdsIter {
|
||||||
|
last_seen_id: None,
|
||||||
|
iter,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn all_documents_fields_counts<'txn>(
|
pub fn all_documents_fields_counts<'txn>(
|
||||||
&self,
|
&self,
|
||||||
reader: &'txn zlmdb::RoTxn,
|
reader: &'txn zlmdb::RoTxn,
|
||||||
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>>
|
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
|
||||||
{
|
|
||||||
let iter = self.documents_fields_counts.iter(reader)?;
|
let iter = self.documents_fields_counts.iter(reader)?;
|
||||||
Ok(AllDocumentsFieldsCountsIter { iter })
|
Ok(AllDocumentsFieldsCountsIter { iter })
|
||||||
}
|
}
|
||||||
@ -90,7 +88,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
|
|||||||
Some(Ok((key, count))) => {
|
Some(Ok((key, count))) => {
|
||||||
let attr = SchemaAttr(key.attr.get());
|
let attr = SchemaAttr(key.attr.get());
|
||||||
Some(Ok((attr, count)))
|
Some(Ok((attr, count)))
|
||||||
},
|
}
|
||||||
Some(Err(e)) => Some(Err(e.into())),
|
Some(Err(e)) => Some(Err(e.into())),
|
||||||
None => None,
|
None => None,
|
||||||
}
|
}
|
||||||
@ -112,9 +110,9 @@ impl Iterator for DocumentsIdsIter<'_> {
|
|||||||
let document_id = DocumentId(key.docid.get());
|
let document_id = DocumentId(key.docid.get());
|
||||||
if Some(document_id) != self.last_seen_id {
|
if Some(document_id) != self.last_seen_id {
|
||||||
self.last_seen_id = Some(document_id);
|
self.last_seen_id = Some(document_id);
|
||||||
return Some(Ok(document_id))
|
return Some(Ok(document_id));
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
Err(e) => return Some(Err(e.into())),
|
Err(e) => return Some(Err(e.into())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -135,7 +133,7 @@ impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> {
|
|||||||
let docid = DocumentId(key.docid.get());
|
let docid = DocumentId(key.docid.get());
|
||||||
let attr = SchemaAttr(key.attr.get());
|
let attr = SchemaAttr(key.attr.get());
|
||||||
Some(Ok((docid, attr, count)))
|
Some(Ok((docid, attr, count)))
|
||||||
},
|
}
|
||||||
Some(Err(e)) => Some(Err(e.into())),
|
Some(Err(e)) => Some(Err(e.into())),
|
||||||
None => None,
|
None => None,
|
||||||
}
|
}
|
||||||
|
@ -1,15 +1,15 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
use meilidb_schema::Schema;
|
|
||||||
use zlmdb::types::{Str, OwnedType, ByteSlice, Serde};
|
|
||||||
use zlmdb::Result as ZResult;
|
|
||||||
use crate::RankedMap;
|
use crate::RankedMap;
|
||||||
|
use meilidb_schema::Schema;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use zlmdb::types::{ByteSlice, OwnedType, Serde, Str};
|
||||||
|
use zlmdb::Result as ZResult;
|
||||||
|
|
||||||
const CUSTOMS_KEY: &str = "customs-key";
|
const CUSTOMS_KEY: &str = "customs-key";
|
||||||
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
|
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
|
||||||
const RANKED_MAP_KEY: &str = "ranked-map";
|
const RANKED_MAP_KEY: &str = "ranked-map";
|
||||||
const SCHEMA_KEY: &str = "schema";
|
const SCHEMA_KEY: &str = "schema";
|
||||||
const SYNONYMS_KEY: &str = "synonyms";
|
const SYNONYMS_KEY: &str = "synonyms";
|
||||||
const WORDS_KEY: &str = "words";
|
const WORDS_KEY: &str = "words";
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
pub struct Main {
|
pub struct Main {
|
||||||
@ -29,13 +29,14 @@ impl Main {
|
|||||||
let bytes = Arc::from(bytes);
|
let bytes = Arc::from(bytes);
|
||||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||||
Ok(Some(fst::Set::from(fst)))
|
Ok(Some(fst::Set::from(fst)))
|
||||||
},
|
}
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn put_schema(&self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> {
|
pub fn put_schema(&self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> {
|
||||||
self.main.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
|
self.main
|
||||||
|
.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn schema(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> {
|
pub fn schema(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> {
|
||||||
@ -43,11 +44,13 @@ impl Main {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn put_ranked_map(&self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
|
pub fn put_ranked_map(&self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
|
||||||
self.main.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
|
self.main
|
||||||
|
.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn ranked_map(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> {
|
pub fn ranked_map(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> {
|
||||||
self.main.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
|
self.main
|
||||||
|
.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn put_synonyms_fst(&self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
|
pub fn put_synonyms_fst(&self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
|
||||||
@ -62,28 +65,34 @@ impl Main {
|
|||||||
let bytes = Arc::from(bytes);
|
let bytes = Arc::from(bytes);
|
||||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||||
Ok(Some(fst::Set::from(fst)))
|
Ok(Some(fst::Set::from(fst)))
|
||||||
},
|
}
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn put_number_of_documents<F>(&self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
|
pub fn put_number_of_documents<F>(&self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
|
||||||
where F: Fn(u64) -> u64,
|
where
|
||||||
|
F: Fn(u64) -> u64,
|
||||||
{
|
{
|
||||||
let new = self.number_of_documents(writer).map(f)?;
|
let new = self.number_of_documents(writer).map(f)?;
|
||||||
self.main.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
|
self.main
|
||||||
|
.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
|
||||||
Ok(new)
|
Ok(new)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn number_of_documents(&self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
|
pub fn number_of_documents(&self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
|
||||||
match self.main.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)? {
|
match self
|
||||||
|
.main
|
||||||
|
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
|
||||||
|
{
|
||||||
Some(value) => Ok(value),
|
Some(value) => Ok(value),
|
||||||
None => Ok(0),
|
None => Ok(0),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn put_customs(&self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
|
pub fn put_customs(&self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
|
||||||
self.main.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
|
self.main
|
||||||
|
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn customs<'txn>(&self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {
|
pub fn customs<'txn>(&self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {
|
||||||
|
@ -8,8 +8,10 @@ mod updates;
|
|||||||
mod updates_results;
|
mod updates_results;
|
||||||
|
|
||||||
pub use self::docs_words::DocsWords;
|
pub use self::docs_words::DocsWords;
|
||||||
pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter};
|
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
|
||||||
pub use self::documents_fields_counts::{DocumentsFieldsCounts, DocumentFieldsCountsIter, DocumentsIdsIter};
|
pub use self::documents_fields_counts::{
|
||||||
|
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
|
||||||
|
};
|
||||||
pub use self::main::Main;
|
pub use self::main::Main;
|
||||||
pub use self::postings_lists::PostingsLists;
|
pub use self::postings_lists::PostingsLists;
|
||||||
pub use self::synonyms::Synonyms;
|
pub use self::synonyms::Synonyms;
|
||||||
@ -25,19 +27,24 @@ use zlmdb::Result as ZResult;
|
|||||||
|
|
||||||
use crate::criterion::Criteria;
|
use crate::criterion::Criteria;
|
||||||
use crate::serde::Deserializer;
|
use crate::serde::Deserializer;
|
||||||
use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
|
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
|
||||||
|
|
||||||
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
||||||
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone)]
|
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
||||||
#[derive(AsBytes, FromBytes)]
|
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
pub struct DocumentAttrKey { docid: BEU64, attr: BEU16 }
|
pub struct DocumentAttrKey {
|
||||||
|
docid: BEU64,
|
||||||
|
attr: BEU16,
|
||||||
|
}
|
||||||
|
|
||||||
impl DocumentAttrKey {
|
impl DocumentAttrKey {
|
||||||
fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey {
|
fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey {
|
||||||
DocumentAttrKey { docid: BEU64::new(docid.0), attr: BEU16::new(attr.0) }
|
DocumentAttrKey {
|
||||||
|
docid: BEU64::new(docid.0),
|
||||||
|
attr: BEU16::new(attr.0),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,13 +100,15 @@ impl Index {
|
|||||||
reader: &zlmdb::RoTxn,
|
reader: &zlmdb::RoTxn,
|
||||||
attributes: Option<&HashSet<&str>>,
|
attributes: Option<&HashSet<&str>>,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
) -> MResult<Option<T>>
|
) -> MResult<Option<T>> {
|
||||||
{
|
|
||||||
let schema = self.main.schema(reader)?;
|
let schema = self.main.schema(reader)?;
|
||||||
let schema = schema.ok_or(Error::SchemaMissing)?;
|
let schema = schema.ok_or(Error::SchemaMissing)?;
|
||||||
|
|
||||||
let attributes = match attributes {
|
let attributes = match attributes {
|
||||||
Some(attributes) => attributes.into_iter().map(|name| schema.attribute(name)).collect(),
|
Some(attributes) => attributes
|
||||||
|
.into_iter()
|
||||||
|
.map(|name| schema.attribute(name))
|
||||||
|
.collect(),
|
||||||
None => None,
|
None => None,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -121,9 +130,10 @@ impl Index {
|
|||||||
reader: &zlmdb::RoTxn,
|
reader: &zlmdb::RoTxn,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
attribute: SchemaAttr,
|
attribute: SchemaAttr,
|
||||||
) -> MResult<Option<T>>
|
) -> MResult<Option<T>> {
|
||||||
{
|
let bytes = self
|
||||||
let bytes = self.documents_fields.document_attribute(reader, document_id, attribute)?;
|
.documents_fields
|
||||||
|
.document_attribute(reader, document_id, attribute)?;
|
||||||
match bytes {
|
match bytes {
|
||||||
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
|
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
@ -183,14 +193,8 @@ impl Index {
|
|||||||
&self,
|
&self,
|
||||||
reader: &zlmdb::RoTxn,
|
reader: &zlmdb::RoTxn,
|
||||||
update_id: u64,
|
update_id: u64,
|
||||||
) -> MResult<update::UpdateStatus>
|
) -> MResult<update::UpdateStatus> {
|
||||||
{
|
update::update_status(reader, self.updates, self.updates_results, update_id)
|
||||||
update::update_status(
|
|
||||||
reader,
|
|
||||||
self.updates,
|
|
||||||
self.updates_results,
|
|
||||||
update_id,
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn query_builder(&self) -> QueryBuilder {
|
pub fn query_builder(&self) -> QueryBuilder {
|
||||||
@ -205,8 +209,7 @@ impl Index {
|
|||||||
pub fn query_builder_with_criteria<'c, 'f, 'd>(
|
pub fn query_builder_with_criteria<'c, 'f, 'd>(
|
||||||
&self,
|
&self,
|
||||||
criteria: Criteria<'c>,
|
criteria: Criteria<'c>,
|
||||||
) -> QueryBuilder<'c, 'f, 'd>
|
) -> QueryBuilder<'c, 'f, 'd> {
|
||||||
{
|
|
||||||
QueryBuilder::with_criteria(
|
QueryBuilder::with_criteria(
|
||||||
self.main,
|
self.main,
|
||||||
self.postings_lists,
|
self.postings_lists,
|
||||||
@ -221,8 +224,7 @@ pub fn create(
|
|||||||
env: &zlmdb::Env,
|
env: &zlmdb::Env,
|
||||||
name: &str,
|
name: &str,
|
||||||
updates_notifier: crossbeam_channel::Sender<()>,
|
updates_notifier: crossbeam_channel::Sender<()>,
|
||||||
) -> MResult<Index>
|
) -> MResult<Index> {
|
||||||
{
|
|
||||||
// create all the store names
|
// create all the store names
|
||||||
let main_name = main_name(name);
|
let main_name = main_name(name);
|
||||||
let postings_lists_name = postings_lists_name(name);
|
let postings_lists_name = postings_lists_name(name);
|
||||||
@ -247,7 +249,9 @@ pub fn create(
|
|||||||
main: Main { main },
|
main: Main { main },
|
||||||
postings_lists: PostingsLists { postings_lists },
|
postings_lists: PostingsLists { postings_lists },
|
||||||
documents_fields: DocumentsFields { documents_fields },
|
documents_fields: DocumentsFields { documents_fields },
|
||||||
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
documents_fields_counts: DocumentsFieldsCounts {
|
||||||
|
documents_fields_counts,
|
||||||
|
},
|
||||||
synonyms: Synonyms { synonyms },
|
synonyms: Synonyms { synonyms },
|
||||||
docs_words: DocsWords { docs_words },
|
docs_words: DocsWords { docs_words },
|
||||||
updates: Updates { updates },
|
updates: Updates { updates },
|
||||||
@ -260,8 +264,7 @@ pub fn open(
|
|||||||
env: &zlmdb::Env,
|
env: &zlmdb::Env,
|
||||||
name: &str,
|
name: &str,
|
||||||
updates_notifier: crossbeam_channel::Sender<()>,
|
updates_notifier: crossbeam_channel::Sender<()>,
|
||||||
) -> MResult<Option<Index>>
|
) -> MResult<Option<Index>> {
|
||||||
{
|
|
||||||
// create all the store names
|
// create all the store names
|
||||||
let main_name = main_name(name);
|
let main_name = main_name(name);
|
||||||
let postings_lists_name = postings_lists_name(name);
|
let postings_lists_name = postings_lists_name(name);
|
||||||
@ -310,7 +313,9 @@ pub fn open(
|
|||||||
main: Main { main },
|
main: Main { main },
|
||||||
postings_lists: PostingsLists { postings_lists },
|
postings_lists: PostingsLists { postings_lists },
|
||||||
documents_fields: DocumentsFields { documents_fields },
|
documents_fields: DocumentsFields { documents_fields },
|
||||||
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
documents_fields_counts: DocumentsFieldsCounts {
|
||||||
|
documents_fields_counts,
|
||||||
|
},
|
||||||
synonyms: Synonyms { synonyms },
|
synonyms: Synonyms { synonyms },
|
||||||
docs_words: DocsWords { docs_words },
|
docs_words: DocsWords { docs_words },
|
||||||
updates: Updates { updates },
|
updates: Updates { updates },
|
||||||
|
@@ -1,8 +1,8 @@
-use std::borrow::Cow;
+use crate::DocIndex;
 use sdset::{Set, SetBuf};
+use std::borrow::Cow;
 use zlmdb::types::{ByteSlice, CowSlice};
 use zlmdb::Result as ZResult;
-use crate::DocIndex;
 
 #[derive(Copy, Clone)]
 pub struct PostingsLists {
@@ -15,8 +15,7 @@ impl PostingsLists {
         writer: &mut zlmdb::RwTxn,
         word: &[u8],
         words_indexes: &Set<DocIndex>,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         self.postings_lists.put(writer, word, words_indexes)
     }
 
@@ -28,8 +27,7 @@ impl PostingsLists {
         &self,
         reader: &'txn zlmdb::RoTxn,
         word: &[u8],
-    ) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>>
-    {
+    ) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
         match self.postings_lists.get(reader, word)? {
             Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
             Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
@@ -13,8 +13,7 @@ impl Synonyms {
         writer: &mut zlmdb::RwTxn,
         word: &[u8],
         synonyms: &fst::Set,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         let bytes = synonyms.as_fst().as_bytes();
         self.synonyms.put(writer, word, bytes)
     }
@@ -30,7 +29,7 @@ impl Synonyms {
                 let bytes = Arc::from(bytes);
                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
                 Ok(Some(fst::Set::from(fst)))
-            },
+            }
             None => Ok(None),
         }
     }
@@ -1,13 +1,16 @@
+use super::BEU64;
+use crate::update::Update;
+use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use zlmdb::types::OwnedType;
-use zlmdb::{Result as ZResult, BytesEncode, BytesDecode};
-use serde::{Serialize, Deserialize};
-use crate::update::Update;
-use super::BEU64;
+use zlmdb::{BytesDecode, BytesEncode, Result as ZResult};
 
 pub struct SerdeJson<T>(std::marker::PhantomData<T>);
 
-impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
+impl<T> BytesEncode for SerdeJson<T>
+where
+    T: Serialize,
+{
     type EItem = T;
 
     fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
@@ -15,7 +18,10 @@ impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
     }
 }
 
-impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T> where T: Deserialize<'a> + Clone {
+impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T>
+where
+    T: Deserialize<'a> + Clone,
+{
     type DItem = T;
 
     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
@@ -56,8 +62,7 @@ impl Updates {
         writer: &mut zlmdb::RwTxn,
         update_id: u64,
         update: &Update,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         // TODO prefer using serde_json?
         let update_id = BEU64::new(update_id);
         self.updates.put(writer, &update_id, update)
@@ -69,8 +74,8 @@ impl Updates {
                 let key = BEU64::new(update_id);
                 self.updates.delete(writer, &key)?;
                 Ok(Some((update_id, update)))
-            },
-            None => Ok(None)
+            }
+            None => Ok(None),
         }
     }
 }
@@ -1,7 +1,7 @@
+use super::BEU64;
+use crate::update::UpdateResult;
 use zlmdb::types::{OwnedType, Serde};
 use zlmdb::Result as ZResult;
-use crate::update::UpdateResult;
-use super::BEU64;
 
 #[derive(Copy, Clone)]
 pub struct UpdatesResults {
@@ -21,8 +21,7 @@ impl UpdatesResults {
         writer: &mut zlmdb::RwTxn,
         update_id: u64,
         update_result: &UpdateResult,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         let update_id = BEU64::new(update_id);
         self.updates_results.put(writer, &update_id, update_result)
     }
@@ -31,8 +30,7 @@ impl UpdatesResults {
         &self,
         reader: &zlmdb::RoTxn,
         update_id: u64,
-    ) -> ZResult<Option<UpdateResult>>
-    {
+    ) -> ZResult<Option<UpdateResult>> {
         let update_id = BEU64::new(update_id);
         self.updates_results.get(reader, &update_id)
     }
@@ -1,13 +1,12 @@
-use zlmdb::Result as ZResult;
-use crate::update::{Update, next_update_id};
 use crate::store;
+use crate::update::{next_update_id, Update};
+use zlmdb::Result as ZResult;
 
 pub fn apply_customs_update(
     writer: &mut zlmdb::RwTxn,
     main_store: store::Main,
     customs: &[u8],
-) -> ZResult<()>
-{
+) -> ZResult<()> {
     main_store.put_customs(writer, customs)
 }
 
@@ -16,8 +15,7 @@ pub fn push_customs_update(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     customs: Vec<u8>,
-) -> ZResult<u64>
-{
+) -> ZResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::Customs(customs);
|
@ -1,14 +1,14 @@
|
|||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
use fst::{SetBuilder, set::OpBuilder};
|
use fst::{set::OpBuilder, SetBuilder};
|
||||||
use sdset::{SetOperation, duo::Union};
|
use sdset::{duo::Union, SetOperation};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
|
||||||
use crate::raw_indexer::RawIndexer;
|
use crate::raw_indexer::RawIndexer;
|
||||||
use crate::serde::{extract_document_id, Serializer, RamDocumentStore};
|
use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
|
||||||
use crate::store;
|
use crate::store;
|
||||||
use crate::update::{Update, next_update_id, apply_documents_deletion};
|
use crate::update::{apply_documents_deletion, next_update_id, Update};
|
||||||
use crate::{MResult, Error, RankedMap};
|
use crate::{Error, MResult, RankedMap};
|
||||||
|
|
||||||
pub struct DocumentsAddition<D> {
|
pub struct DocumentsAddition<D> {
|
||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
@ -22,8 +22,7 @@ impl<D> DocumentsAddition<D> {
|
|||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
updates_results_store: store::UpdatesResults,
|
updates_results_store: store::UpdatesResults,
|
||||||
updates_notifier: crossbeam_channel::Sender<()>,
|
updates_notifier: crossbeam_channel::Sender<()>,
|
||||||
) -> DocumentsAddition<D>
|
) -> DocumentsAddition<D> {
|
||||||
{
|
|
||||||
DocumentsAddition {
|
DocumentsAddition {
|
||||||
updates_store,
|
updates_store,
|
||||||
updates_results_store,
|
updates_results_store,
|
||||||
@ -37,7 +36,8 @@ impl<D> DocumentsAddition<D> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64>
|
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64>
|
||||||
where D: serde::Serialize
|
where
|
||||||
|
D: serde::Serialize,
|
||||||
{
|
{
|
||||||
let _ = self.updates_notifier.send(());
|
let _ = self.updates_notifier.send(());
|
||||||
let update_id = push_documents_addition(
|
let update_id = push_documents_addition(
|
||||||
@ -51,7 +51,7 @@ impl<D> DocumentsAddition<D> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<D> Extend<D> for DocumentsAddition<D> {
|
impl<D> Extend<D> for DocumentsAddition<D> {
|
||||||
fn extend<T: IntoIterator<Item=D>>(&mut self, iter: T) {
|
fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
|
||||||
self.documents.extend(iter)
|
self.documents.extend(iter)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -61,8 +61,7 @@ pub fn push_documents_addition<D: serde::Serialize>(
|
|||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
updates_results_store: store::UpdatesResults,
|
updates_results_store: store::UpdatesResults,
|
||||||
addition: Vec<D>,
|
addition: Vec<D>,
|
||||||
) -> MResult<u64>
|
) -> MResult<u64> {
|
||||||
{
|
|
||||||
let mut values = Vec::with_capacity(addition.len());
|
let mut values = Vec::with_capacity(addition.len());
|
||||||
for add in addition {
|
for add in addition {
|
||||||
let vec = serde_json::to_vec(&add)?;
|
let vec = serde_json::to_vec(&add)?;
|
||||||
@ -87,8 +86,7 @@ pub fn apply_documents_addition(
|
|||||||
docs_words_store: store::DocsWords,
|
docs_words_store: store::DocsWords,
|
||||||
mut ranked_map: RankedMap,
|
mut ranked_map: RankedMap,
|
||||||
addition: Vec<serde_json::Value>,
|
addition: Vec<serde_json::Value>,
|
||||||
) -> MResult<()>
|
) -> MResult<()> {
|
||||||
{
|
|
||||||
let mut document_ids = HashSet::new();
|
let mut document_ids = HashSet::new();
|
||||||
let mut document_store = RamDocumentStore::new();
|
let mut document_store = RamDocumentStore::new();
|
||||||
let mut document_fields_counts = HashMap::new();
|
let mut document_fields_counts = HashMap::new();
|
||||||
@ -182,7 +180,7 @@ pub fn apply_documents_addition(
|
|||||||
.into_inner()
|
.into_inner()
|
||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
},
|
}
|
||||||
None => delta_words,
|
None => delta_words,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,13 +1,13 @@
|
|||||||
use std::collections::{HashMap, HashSet, BTreeSet};
|
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||||
|
|
||||||
use fst::{SetBuilder, Streamer};
|
use fst::{SetBuilder, Streamer};
|
||||||
use meilidb_schema::Schema;
|
use meilidb_schema::Schema;
|
||||||
use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
|
use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
|
||||||
|
|
||||||
use crate::{DocumentId, RankedMap, MResult, Error};
|
|
||||||
use crate::serde::extract_document_id;
|
use crate::serde::extract_document_id;
|
||||||
use crate::update::{Update, next_update_id};
|
|
||||||
use crate::store;
|
use crate::store;
|
||||||
|
use crate::update::{next_update_id, Update};
|
||||||
|
use crate::{DocumentId, Error, MResult, RankedMap};
|
||||||
|
|
||||||
pub struct DocumentsDeletion {
|
pub struct DocumentsDeletion {
|
||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
@ -21,8 +21,7 @@ impl DocumentsDeletion {
|
|||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
updates_results_store: store::UpdatesResults,
|
updates_results_store: store::UpdatesResults,
|
||||||
updates_notifier: crossbeam_channel::Sender<()>,
|
updates_notifier: crossbeam_channel::Sender<()>,
|
||||||
) -> DocumentsDeletion
|
) -> DocumentsDeletion {
|
||||||
{
|
|
||||||
DocumentsDeletion {
|
DocumentsDeletion {
|
||||||
updates_store,
|
updates_store,
|
||||||
updates_results_store,
|
updates_results_store,
|
||||||
@ -36,7 +35,8 @@ impl DocumentsDeletion {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()>
|
pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()>
|
||||||
where D: serde::Serialize,
|
where
|
||||||
|
D: serde::Serialize,
|
||||||
{
|
{
|
||||||
let identifier = schema.identifier_name();
|
let identifier = schema.identifier_name();
|
||||||
let document_id = match extract_document_id(identifier, &document)? {
|
let document_id = match extract_document_id(identifier, &document)? {
|
||||||
@ -62,7 +62,7 @@ impl DocumentsDeletion {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Extend<DocumentId> for DocumentsDeletion {
|
impl Extend<DocumentId> for DocumentsDeletion {
|
||||||
fn extend<T: IntoIterator<Item=DocumentId>>(&mut self, iter: T) {
|
fn extend<T: IntoIterator<Item = DocumentId>>(&mut self, iter: T) {
|
||||||
self.documents.extend(iter)
|
self.documents.extend(iter)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -72,8 +72,7 @@ pub fn push_documents_deletion(
|
|||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
updates_results_store: store::UpdatesResults,
|
updates_results_store: store::UpdatesResults,
|
||||||
deletion: Vec<DocumentId>,
|
deletion: Vec<DocumentId>,
|
||||||
) -> MResult<u64>
|
) -> MResult<u64> {
|
||||||
{
|
|
||||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||||
|
|
||||||
let update = Update::DocumentsDeletion(deletion);
|
let update = Update::DocumentsDeletion(deletion);
|
||||||
@ -91,8 +90,7 @@ pub fn apply_documents_deletion(
|
|||||||
docs_words_store: store::DocsWords,
|
docs_words_store: store::DocsWords,
|
||||||
mut ranked_map: RankedMap,
|
mut ranked_map: RankedMap,
|
||||||
deletion: Vec<DocumentId>,
|
deletion: Vec<DocumentId>,
|
||||||
) -> MResult<()>
|
) -> MResult<()> {
|
||||||
{
|
|
||||||
let idset = SetBuf::from_dirty(deletion);
|
let idset = SetBuf::from_dirty(deletion);
|
||||||
|
|
||||||
let schema = match main_store.schema(writer)? {
|
let schema = match main_store.schema(writer)? {
|
||||||
@ -101,10 +99,17 @@ pub fn apply_documents_deletion(
|
|||||||
};
|
};
|
||||||
|
|
||||||
// collect the ranked attributes according to the schema
|
// collect the ranked attributes according to the schema
|
||||||
let ranked_attrs: Vec<_> = schema.iter()
|
let ranked_attrs: Vec<_> = schema
|
||||||
.filter_map(|(_, attr, prop)| {
|
.iter()
|
||||||
if prop.is_ranked() { Some(attr) } else { None }
|
.filter_map(
|
||||||
})
|
|(_, attr, prop)| {
|
||||||
|
if prop.is_ranked() {
|
||||||
|
Some(attr)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let mut words_document_ids = HashMap::new();
|
let mut words_document_ids = HashMap::new();
|
||||||
@ -118,7 +123,10 @@ pub fn apply_documents_deletion(
|
|||||||
let mut stream = words.stream();
|
let mut stream = words.stream();
|
||||||
while let Some(word) = stream.next() {
|
while let Some(word) = stream.next() {
|
||||||
let word = word.to_vec();
|
let word = word.to_vec();
|
||||||
words_document_ids.entry(word).or_insert_with(Vec::new).push(id);
|
words_document_ids
|
||||||
|
.entry(word)
|
||||||
|
.or_insert_with(Vec::new)
|
||||||
|
.push(id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -167,7 +175,7 @@ pub fn apply_documents_deletion(
|
|||||||
.into_inner()
|
.into_inner()
|
||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
},
|
}
|
||||||
None => fst::Set::default(),
|
None => fst::Set::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -6,21 +6,21 @@ mod synonyms_addition;
|
|||||||
mod synonyms_deletion;
|
mod synonyms_deletion;
|
||||||
|
|
||||||
pub use self::customs_update::{apply_customs_update, push_customs_update};
|
pub use self::customs_update::{apply_customs_update, push_customs_update};
|
||||||
pub use self::documents_addition::{DocumentsAddition, apply_documents_addition};
|
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
|
||||||
pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion};
|
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
||||||
pub use self::schema_update::{apply_schema_update, push_schema_update};
|
pub use self::schema_update::{apply_schema_update, push_schema_update};
|
||||||
pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition};
|
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
|
||||||
pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion};
|
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
|
||||||
|
|
||||||
use std::time::{Duration, Instant};
|
|
||||||
use std::collections::BTreeMap;
|
|
||||||
use std::cmp;
|
use std::cmp;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use serde::{Serialize, Deserialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use zlmdb::Result as ZResult;
|
use zlmdb::Result as ZResult;
|
||||||
|
|
||||||
use crate::{store, MResult, DocumentId, RankedMap};
|
use crate::{store, DocumentId, MResult, RankedMap};
|
||||||
use meilidb_schema::Schema;
|
use meilidb_schema::Schema;
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@ -68,8 +68,7 @@ pub fn update_status(
|
|||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
updates_results_store: store::UpdatesResults,
|
updates_results_store: store::UpdatesResults,
|
||||||
update_id: u64,
|
update_id: u64,
|
||||||
) -> MResult<UpdateStatus>
|
) -> MResult<UpdateStatus> {
|
||||||
{
|
|
||||||
match updates_results_store.update_result(reader, update_id)? {
|
match updates_results_store.update_result(reader, update_id)? {
|
||||||
Some(result) => Ok(UpdateStatus::Processed(result)),
|
Some(result) => Ok(UpdateStatus::Processed(result)),
|
||||||
None => {
|
None => {
|
||||||
@ -86,8 +85,7 @@ pub fn next_update_id(
|
|||||||
writer: &mut zlmdb::RwTxn,
|
writer: &mut zlmdb::RwTxn,
|
||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
updates_results_store: store::UpdatesResults,
|
updates_results_store: store::UpdatesResults,
|
||||||
) -> ZResult<u64>
|
) -> ZResult<u64> {
|
||||||
{
|
|
||||||
let last_update_id = updates_store.last_update_id(writer)?;
|
let last_update_id = updates_store.last_update_id(writer)?;
|
||||||
let last_update_id = last_update_id.map(|(n, _)| n);
|
let last_update_id = last_update_id.map(|(n, _)| n);
|
||||||
|
|
||||||
@ -100,7 +98,10 @@ pub fn next_update_id(
|
|||||||
Ok(new_update_id)
|
Ok(new_update_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
|
pub fn update_task(
|
||||||
|
writer: &mut zlmdb::RwTxn,
|
||||||
|
index: store::Index,
|
||||||
|
) -> MResult<Option<UpdateResult>> {
|
||||||
let (update_id, update) = match index.updates.pop_front(writer)? {
|
let (update_id, update) = match index.updates.pop_front(writer)? {
|
||||||
Some(value) => value,
|
Some(value) => value,
|
||||||
None => return Ok(None),
|
None => return Ok(None),
|
||||||
@ -112,11 +113,13 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
|||||||
Update::Schema(schema) => {
|
Update::Schema(schema) => {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
let update_type = UpdateType::Schema { schema: schema.clone() };
|
let update_type = UpdateType::Schema {
|
||||||
|
schema: schema.clone(),
|
||||||
|
};
|
||||||
let result = apply_schema_update(writer, index.main, &schema);
|
let result = apply_schema_update(writer, index.main, &schema);
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
},
|
}
|
||||||
Update::Customs(customs) => {
|
Update::Customs(customs) => {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
@ -133,7 +136,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
|||||||
None => RankedMap::default(),
|
None => RankedMap::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let update_type = UpdateType::DocumentsAddition { number: documents.len() };
|
let update_type = UpdateType::DocumentsAddition {
|
||||||
|
number: documents.len(),
|
||||||
|
};
|
||||||
|
|
||||||
let result = apply_documents_addition(
|
let result = apply_documents_addition(
|
||||||
writer,
|
writer,
|
||||||
@ -147,7 +152,7 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
|||||||
);
|
);
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
},
|
}
|
||||||
Update::DocumentsDeletion(documents) => {
|
Update::DocumentsDeletion(documents) => {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
@ -156,7 +161,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
|||||||
None => RankedMap::default(),
|
None => RankedMap::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let update_type = UpdateType::DocumentsDeletion { number: documents.len() };
|
let update_type = UpdateType::DocumentsDeletion {
|
||||||
|
number: documents.len(),
|
||||||
|
};
|
||||||
|
|
||||||
let result = apply_documents_deletion(
|
let result = apply_documents_deletion(
|
||||||
writer,
|
writer,
|
||||||
@ -170,38 +177,35 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
|||||||
);
|
);
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
},
|
}
|
||||||
Update::SynonymsAddition(synonyms) => {
|
Update::SynonymsAddition(synonyms) => {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
let update_type = UpdateType::SynonymsAddition { number: synonyms.len() };
|
let update_type = UpdateType::SynonymsAddition {
|
||||||
|
number: synonyms.len(),
|
||||||
|
};
|
||||||
|
|
||||||
let result = apply_synonyms_addition(
|
let result = apply_synonyms_addition(writer, index.main, index.synonyms, synonyms);
|
||||||
writer,
|
|
||||||
index.main,
|
|
||||||
index.synonyms,
|
|
||||||
synonyms,
|
|
||||||
);
|
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
},
|
}
|
||||||
Update::SynonymsDeletion(synonyms) => {
|
Update::SynonymsDeletion(synonyms) => {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() };
|
let update_type = UpdateType::SynonymsDeletion {
|
||||||
|
number: synonyms.len(),
|
||||||
|
};
|
||||||
|
|
||||||
let result = apply_synonyms_deletion(
|
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
|
||||||
writer,
|
|
||||||
index.main,
|
|
||||||
index.synonyms,
|
|
||||||
synonyms,
|
|
||||||
);
|
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
},
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
debug!("Processed update number {} {:?} {:?}", update_id, update_type, result);
|
debug!(
|
||||||
|
"Processed update number {} {:?} {:?}",
|
||||||
|
update_id, update_type, result
|
||||||
|
);
|
||||||
|
|
||||||
let detailed_duration = DetailedDuration { main: duration };
|
let detailed_duration = DetailedDuration { main: duration };
|
||||||
let status = UpdateResult {
|
let status = UpdateResult {
|
||||||
@ -211,7 +215,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
|
|||||||
detailed_duration,
|
detailed_duration,
|
||||||
};
|
};
|
||||||
|
|
||||||
index.updates_results.put_update_result(writer, update_id, &status)?;
|
index
|
||||||
|
.updates_results
|
||||||
|
.put_update_result(writer, update_id, &status)?;
|
||||||
|
|
||||||
Ok(Some(status))
|
Ok(Some(status))
|
||||||
}
|
}
|
||||||
|
@@ -1,18 +1,19 @@
+use crate::update::{next_update_id, Update};
+use crate::{error::UnsupportedOperation, store, MResult};
 use meilidb_schema::Schema;
-use crate::{store, error::UnsupportedOperation, MResult};
-use crate::update::{Update, next_update_id};
 
 pub fn apply_schema_update(
     writer: &mut zlmdb::RwTxn,
     main_store: store::Main,
     new_schema: &Schema,
-) -> MResult<()>
-{
+) -> MResult<()> {
     if let Some(_) = main_store.schema(writer)? {
-        return Err(UnsupportedOperation::SchemaAlreadyExists.into())
+        return Err(UnsupportedOperation::SchemaAlreadyExists.into());
     }
 
-    main_store.put_schema(writer, new_schema).map_err(Into::into)
+    main_store
+        .put_schema(writer, new_schema)
+        .map_err(Into::into)
 }
 
 pub fn push_schema_update(
@@ -20,8 +21,7 @@ pub fn push_schema_update(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     schema: Schema,
-) -> MResult<u64>
-{
+) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::Schema(schema);
|
@ -1,10 +1,10 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
use fst::{SetBuilder, set::OpBuilder};
|
use fst::{set::OpBuilder, SetBuilder};
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
|
|
||||||
use crate::automaton::normalize_str;
|
use crate::automaton::normalize_str;
|
||||||
use crate::update::{Update, next_update_id};
|
use crate::update::{next_update_id, Update};
|
||||||
use crate::{store, MResult};
|
use crate::{store, MResult};
|
||||||
|
|
||||||
pub struct SynonymsAddition {
|
pub struct SynonymsAddition {
|
||||||
@@ -19,8 +19,7 @@ impl SynonymsAddition {
         updates_store: store::Updates,
         updates_results_store: store::UpdatesResults,
         updates_notifier: crossbeam_channel::Sender<()>,
-    ) -> SynonymsAddition
-    {
+    ) -> SynonymsAddition {
         SynonymsAddition {
             updates_store,
             updates_results_store,
@@ -30,13 +29,17 @@ impl SynonymsAddition {
     }
 
     pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
-    where S: AsRef<str>,
-          T: AsRef<str>,
-          I: IntoIterator<Item=T>,
+    where
+        S: AsRef<str>,
+        T: AsRef<str>,
+        I: IntoIterator<Item = T>,
     {
         let synonym = normalize_str(synonym.as_ref());
         let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
-        self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
+        self.synonyms
+            .entry(synonym)
+            .or_insert_with(Vec::new)
+            .extend(alternatives);
     }
 
     pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
@@ -56,8 +59,7 @@ pub fn push_synonyms_addition(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     addition: BTreeMap<String, Vec<String>>,
-) -> MResult<u64>
-{
+) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::SynonymsAddition(addition);
@@ -71,8 +73,7 @@ pub fn apply_synonyms_addition(
     main_store: store::Main,
     synonyms_store: store::Synonyms,
     addition: BTreeMap<String, Vec<String>>,
-) -> MResult<()>
-{
+) -> MResult<()> {
     let mut synonyms_builder = SetBuilder::memory();
 
     for (word, alternatives) in addition {
@@ -107,7 +108,7 @@ pub fn apply_synonyms_addition(
                 .into_inner()
                 .and_then(fst::Set::from_bytes)
                 .unwrap()
-        },
+        }
         None => delta_synonyms,
     };
 
@@ -1,11 +1,11 @@
 use std::collections::BTreeMap;
 use std::iter::FromIterator;
 
-use fst::{SetBuilder, set::OpBuilder};
+use fst::{set::OpBuilder, SetBuilder};
 use sdset::SetBuf;
 
 use crate::automaton::normalize_str;
-use crate::update::{Update, next_update_id};
+use crate::update::{next_update_id, Update};
 use crate::{store, MResult};
 
 pub struct SynonymsDeletion {
@@ -20,8 +20,7 @@ impl SynonymsDeletion {
         updates_store: store::Updates,
         updates_results_store: store::UpdatesResults,
         updates_notifier: crossbeam_channel::Sender<()>,
-    ) -> SynonymsDeletion
-    {
+    ) -> SynonymsDeletion {
         SynonymsDeletion {
             updates_store,
             updates_results_store,
@@ -36,9 +35,10 @@ impl SynonymsDeletion {
     }
 
     pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
-    where S: AsRef<str>,
-          T: AsRef<str>,
-          I: Iterator<Item=T>,
+    where
+        S: AsRef<str>,
+        T: AsRef<str>,
+        I: Iterator<Item = T>,
     {
         let synonym = normalize_str(synonym.as_ref());
         let value = self.synonyms.entry(synonym).or_insert(None);
@@ -66,8 +66,7 @@ pub fn push_synonyms_deletion(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     deletion: BTreeMap<String, Option<Vec<String>>>,
-) -> MResult<u64>
-{
+) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::SynonymsDeletion(deletion);
@@ -81,8 +80,7 @@ pub fn apply_synonyms_deletion(
     main_store: store::Main,
     synonyms_store: store::Synonyms,
     deletion: BTreeMap<String, Option<Vec<String>>>,
-) -> MResult<()>
-{
+) -> MResult<()> {
     let mut delete_whole_synonym_builder = SetBuilder::memory();
 
     for (synonym, alternatives) in deletion {
@@ -98,9 +96,7 @@ pub fn apply_synonyms_deletion(
                     let alternatives = SetBuf::from_dirty(alternatives);
                     let mut builder = SetBuilder::memory();
                     builder.extend_iter(alternatives).unwrap();
-                    builder.into_inner()
-                        .and_then(fst::Set::from_bytes)
-                        .unwrap()
+                    builder.into_inner().and_then(fst::Set::from_bytes).unwrap()
                 };
 
                 let op = OpBuilder::new()
@@ -124,7 +120,7 @@ pub fn apply_synonyms_deletion(
                 } else {
                     synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?;
                 }
-            },
+            }
             None => {
                 delete_whole_synonym_builder.insert(&synonym).unwrap();
                 synonyms_store.del_synonyms(writer, synonym.as_bytes())?;
@@ -150,7 +146,7 @@ pub fn apply_synonyms_deletion(
                 .into_inner()
                 .and_then(fst::Set::from_bytes)
                 .unwrap()
-        },
+        }
         None => fst::Set::default(),
     };
 
@@ -1,14 +1,26 @@
-use std::collections::{HashMap, BTreeMap};
-use std::{fmt, u16};
+use std::collections::{BTreeMap, HashMap};
 use std::ops::BitOr;
 use std::sync::Arc;
+use std::{fmt, u16};
 
-use serde::{Serialize, Deserialize};
 use indexmap::IndexMap;
+use serde::{Deserialize, Serialize};
 
-pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
-pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
-pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
+pub const DISPLAYED: SchemaProps = SchemaProps {
+    displayed: true,
+    indexed: false,
+    ranked: false,
+};
+pub const INDEXED: SchemaProps = SchemaProps {
+    displayed: false,
+    indexed: true,
+    ranked: false,
+};
+pub const RANKED: SchemaProps = SchemaProps {
+    displayed: false,
+    indexed: false,
+    ranked: true,
+};
 
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct SchemaProps {
@@ -80,7 +92,13 @@ impl SchemaBuilder {
         }
 
         let identifier = self.identifier;
-        Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
+        Schema {
+            inner: Arc::new(InnerSchema {
+                identifier,
+                attrs,
+                props,
+            }),
+        }
     }
 }
 
@@ -100,7 +118,10 @@ impl Schema {
     fn to_builder(&self) -> SchemaBuilder {
         let identifier = self.inner.identifier.clone();
         let attributes = self.attributes_ordered();
-        SchemaBuilder { identifier, attributes }
+        SchemaBuilder {
+            identifier,
+            attributes,
+        }
     }
 
     fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
@@ -136,18 +157,18 @@ impl Schema {
         name
     }
 
-    pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
-        self.inner.props.iter()
-            .map(move |(name, prop)| {
-                let attr = self.inner.attrs.get(name).unwrap();
-                (name.as_str(), *attr, *prop)
-            })
+    pub fn iter<'a>(&'a self) -> impl Iterator<Item = (&str, SchemaAttr, SchemaProps)> + 'a {
+        self.inner.props.iter().map(move |(name, prop)| {
+            let attr = self.inner.attrs.get(name).unwrap();
+            (name.as_str(), *attr, *prop)
+        })
     }
 }
 
 impl Serialize for Schema {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where S: serde::ser::Serializer,
+    where
+        S: serde::ser::Serializer,
     {
         self.to_builder().serialize(serializer)
     }
@@ -155,15 +176,15 @@ impl Serialize for Schema {
 
 impl<'de> Deserialize<'de> for Schema {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where D: serde::de::Deserializer<'de>,
+    where
+        D: serde::de::Deserializer<'de>,
     {
         let builder = SchemaBuilder::deserialize(deserializer)?;
         Ok(builder.build())
     }
 }
 
-#[derive(Serialize, Deserialize)]
-#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
+#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
 pub struct SchemaAttr(pub u16);
 
 impl SchemaAttr {
@@ -1,17 +1,17 @@
-use std::iter::Peekable;
-use slice_group_by::StrGroupBy;
 use self::SeparatorCategory::*;
+use slice_group_by::StrGroupBy;
+use std::iter::Peekable;
 
 pub fn is_cjk(c: char) -> bool {
-    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
-    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
-    (c >= '\u{3040}' && c <= '\u{309f}') ||
-    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
-    (c >= '\u{3100}' && c <= '\u{312f}') ||
-    (c >= '\u{3200}' && c <= '\u{32ff}') ||
-    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
-    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
-    (c >= '\u{f900}' && c <= '\u{faff}')
+    (c >= '\u{2e80}' && c <= '\u{2eff}')
+        || (c >= '\u{2f00}' && c <= '\u{2fdf}')
+        || (c >= '\u{3040}' && c <= '\u{309f}')
+        || (c >= '\u{30a0}' && c <= '\u{30ff}')
+        || (c >= '\u{3100}' && c <= '\u{312f}')
+        || (c >= '\u{3200}' && c <= '\u{32ff}')
+        || (c >= '\u{3400}' && c <= '\u{4dbf}')
+        || (c >= '\u{4e00}' && c <= '\u{9fff}')
+        || (c >= '\u{f900}' && c <= '\u{faff}')
 }
 
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
@@ -22,7 +22,11 @@ enum SeparatorCategory {
 
 impl SeparatorCategory {
     fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
-        if let (Soft, Soft) = (self, other) { Soft } else { Hard }
+        if let (Soft, Soft) = (self, other) {
+            Soft
+        } else {
+            Hard
+        }
     }
 
     fn to_usize(self) -> usize {
@@ -40,7 +44,7 @@ fn is_separator(c: char) -> bool {
 fn classify_separator(c: char) -> Option<SeparatorCategory> {
     match c {
         ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
         '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
         _ => None,
     }
 }
@@ -79,7 +83,7 @@ fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, u
     (n + 1, i + c.len_utf8())
 }
 
-pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
+pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
     Tokenizer::new(query).map(|t| t.word)
 }
 
@@ -100,9 +104,10 @@ impl<'a> Tokenizer<'a> {
     pub fn new(string: &str) -> Tokenizer {
         // skip every separator and set `char_index`
        // to the number of char trimmed
-        let (count, index) = string.char_indices()
-            .take_while(|(_, c)| is_separator(*c))
-            .fold((0, 0), chars_count_index);
+        let (count, index) = string
+            .char_indices()
+            .take_while(|(_, c)| is_separator(*c))
+            .fold((0, 0), chars_count_index);
 
         Tokenizer {
             inner: &string[index..],
@@ -122,10 +127,11 @@ impl<'a> Iterator for Tokenizer<'a> {
             let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
 
             if !is_str_word(string) {
-                self.word_index += string.chars()
-                    .filter_map(classify_separator)
-                    .fold(Soft, |a, x| a.merge(x))
-                    .to_usize();
+                self.word_index += string
+                    .chars()
+                    .filter_map(classify_separator)
+                    .fold(Soft, |a, x| a.merge(x))
+                    .to_usize();
                 self.char_index += count;
                 self.inner = &self.inner[index..];
                 continue;
@@ -153,7 +159,8 @@ impl<'a> Iterator for Tokenizer<'a> {
 }
 
 pub struct SeqTokenizer<'a, I>
-where I: Iterator<Item=&'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
     inner: I,
     current: Option<Peekable<Tokenizer<'a>>>,
@@ -162,7 +169,8 @@ where I: Iterator<Item=&'a str>,
 }
 
 impl<'a, I> SeqTokenizer<'a, I>
-where I: Iterator<Item=&'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
     pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
         let current = iter.next().map(|s| Tokenizer::new(s).peekable());
@@ -176,7 +184,8 @@ where I: Iterator<Item=&'a str>,
 }
 
 impl<'a, I> Iterator for SeqTokenizer<'a, I>
-where I: Iterator<Item=&'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
     type Item = Token<'a>;
 
@@ -202,15 +211,15 @@ where I: Iterator<Item=&'a str>,
                         }
 
                         Some(token)
-                    },
+                    }
                     None => {
                         // no more words in this text we must
                         // start tokenizing the next text
                         self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
                         self.next()
-                    },
+                    }
                 }
-            },
+            }
             // no more texts available
             None => None,
         }
@@ -225,12 +234,26 @@ mod tests {
     fn easy() {
         let mut tokenizer = Tokenizer::new("salut");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "salut",
+                word_index: 0,
+                char_index: 0
+            })
+        );
         assert_eq!(tokenizer.next(), None);
 
         let mut tokenizer = Tokenizer::new("yo ");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 0
+            })
+        );
         assert_eq!(tokenizer.next(), None);
     }
 
@@ -238,19 +261,82 @@ mod tests {
     fn hard() {
         let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 4
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolo",
+                word_index: 1,
+                char_index: 7
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "aïe",
+                word_index: 9,
+                char_index: 13
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "ouch",
+                word_index: 17,
+                char_index: 18
+            })
+        );
         assert_eq!(tokenizer.next(), None);
 
         let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 0
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolo",
+                word_index: 8,
+                char_index: 5
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "wtf",
+                word_index: 16,
+                char_index: 12
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lol",
+                word_index: 17,
+                char_index: 18
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "aïe",
+                word_index: 25,
+                char_index: 24
+            })
+        );
         assert_eq!(tokenizer.next(), None);
     }
 
@@ -258,18 +344,74 @@ mod tests {
     fn hard_long_chars() {
        let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 4
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "😂",
+                word_index: 1,
+                char_index: 7
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "aïe",
+                word_index: 9,
+                char_index: 10
+            })
+        );
        assert_eq!(tokenizer.next(), None);
 
        let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 0
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolo",
+                word_index: 8,
+                char_index: 5
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "😱",
+                word_index: 16,
+                char_index: 12
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lol",
+                word_index: 17,
+                char_index: 16
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "😣",
+                word_index: 25,
+                char_index: 22
+            })
+        );
        assert_eq!(tokenizer.next(), None);
     }
 
@@ -277,19 +419,82 @@ mod tests {
     fn hard_kanjis() {
         let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ec4}",
+                word_index: 0,
+                char_index: 0
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolilol",
+                word_index: 1,
+                char_index: 1
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ec7}",
+                word_index: 2,
+                char_index: 8
+            })
+        );
         assert_eq!(tokenizer.next(), None);
 
         let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ec4}",
+                word_index: 0,
+                char_index: 0
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ed3}",
+                word_index: 1,
+                char_index: 1
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ef2}",
+                word_index: 2,
+                char_index: 2
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolilol",
+                word_index: 3,
+                char_index: 4
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "hello",
+                word_index: 4,
+                char_index: 14
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ec7}",
+                word_index: 5,
+                char_index: 23
+            })
+        );
         assert_eq!(tokenizer.next(), None);
     }
 }