mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-27 15:10:05 +01:00
Merge #236
236: Format the whole project r=Kerollmops a=irevoire I need to add `cargo fmt` in the CI before closing #231 Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
commit
1bcf43baac
5
.rustfmt.toml
Normal file
5
.rustfmt.toml
Normal file
@ -0,0 +1,5 @@
|
||||
unstable_features = true
|
||||
|
||||
use_small_heuristics = "max"
|
||||
imports_granularity = "Module"
|
||||
group_imports = "StdExternalCrate"
|
15
README.md
15
README.md
@ -41,3 +41,18 @@ the `content-type:application/json` and `content-type:application/x-ndjson` head
|
||||
### Querying the engine via the website
|
||||
|
||||
You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700).
|
||||
|
||||
|
||||
## Contributing
|
||||
|
||||
You can set up a `git-hook` to stop you from making a commit too fast. It'll stop you if:
|
||||
- Any of the workspaces does not build
|
||||
- Your code is not well-formatted
|
||||
|
||||
These two things are also checked in the CI, so ignoring the hook won't help you merge your code.
|
||||
But if you need to, you can still add `--no-verify` when creating your commit to ignore the hook.
|
||||
|
||||
To enable the hook, run the following command from the root of the project:
|
||||
```
|
||||
cp script/pre-commit .git/hooks/pre-commit
|
||||
```
|
||||
|
@ -6,33 +6,24 @@ use milli::update::Settings;
|
||||
use utils::Conf;
|
||||
|
||||
fn base_conf(builder: &mut Settings) {
|
||||
let displayed_fields = [
|
||||
"id", "title", "album", "artist", "genre", "country", "released", "duration",
|
||||
]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
let displayed_fields =
|
||||
["id", "title", "album", "artist", "genre", "country", "released", "duration"]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
builder.set_displayed_fields(displayed_fields);
|
||||
|
||||
let searchable_fields = ["title", "album", "artist"]
|
||||
let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
|
||||
let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
|
||||
let faceted_fields = [
|
||||
"released-timestamp",
|
||||
"duration-float",
|
||||
"genre",
|
||||
"country",
|
||||
"artist",
|
||||
]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
builder.set_filterable_fields(faceted_fields);
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
const BASE_CONF: Conf = Conf {
|
||||
dataset: datasets_paths::SMOL_SONGS,
|
||||
queries: &[
|
||||
@ -53,34 +44,25 @@ const BASE_CONF: Conf = Conf {
|
||||
};
|
||||
|
||||
fn bench_songs(c: &mut criterion::Criterion) {
|
||||
let default_criterion: Vec<String> = milli::default_criteria()
|
||||
.iter()
|
||||
.map(|criteria| criteria.to_string())
|
||||
.collect();
|
||||
let default_criterion: Vec<String> =
|
||||
milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect();
|
||||
let default_criterion = default_criterion.iter().map(|s| s.as_str());
|
||||
let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
|
||||
.chain(default_criterion.clone())
|
||||
.collect();
|
||||
let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
|
||||
.chain(default_criterion.clone())
|
||||
.collect();
|
||||
let asc_default: Vec<&str> =
|
||||
std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect();
|
||||
let desc_default: Vec<&str> =
|
||||
std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect();
|
||||
|
||||
let basic_with_quote: Vec<String> = BASE_CONF
|
||||
.queries
|
||||
.iter()
|
||||
.map(|s| {
|
||||
s.trim()
|
||||
.split(' ')
|
||||
.map(|s| format!(r#""{}""#, s))
|
||||
.collect::<Vec<String>>()
|
||||
.join(" ")
|
||||
s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
|
||||
})
|
||||
.collect();
|
||||
let basic_with_quote: &[&str] = &basic_with_quote
|
||||
.iter()
|
||||
.map(|s| s.as_str())
|
||||
.collect::<Vec<&str>>();
|
||||
let basic_with_quote: &[&str] =
|
||||
&basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
|
||||
|
||||
#[rustfmt::skip]
|
||||
let confs = &[
|
||||
/* first we bench each criterion alone */
|
||||
utils::Conf {
|
||||
|
@ -3,10 +3,8 @@ use std::path::Path;
|
||||
|
||||
use criterion::BenchmarkId;
|
||||
use heed::EnvOpenOptions;
|
||||
use milli::{
|
||||
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
|
||||
FilterCondition, Index,
|
||||
};
|
||||
use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat};
|
||||
use milli::{FilterCondition, Index};
|
||||
|
||||
pub struct Conf<'a> {
|
||||
/// where we are going to create our database.mmdb directory
|
||||
|
@ -6,16 +6,14 @@ use milli::update::Settings;
|
||||
use utils::Conf;
|
||||
|
||||
fn base_conf(builder: &mut Settings) {
|
||||
let displayed_fields = ["title", "body", "url"]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_displayed_fields(displayed_fields);
|
||||
|
||||
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
const BASE_CONF: Conf = Conf {
|
||||
dataset: datasets_paths::SMOL_WIKI_ARTICLES,
|
||||
queries: &[
|
||||
@ -37,18 +35,13 @@ fn bench_songs(c: &mut criterion::Criterion) {
|
||||
.queries
|
||||
.iter()
|
||||
.map(|s| {
|
||||
s.trim()
|
||||
.split(' ')
|
||||
.map(|s| format!(r#""{}""#, s))
|
||||
.collect::<Vec<String>>()
|
||||
.join(" ")
|
||||
s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
|
||||
})
|
||||
.collect();
|
||||
let basic_with_quote: &[&str] = &basic_with_quote
|
||||
.iter()
|
||||
.map(|s| s.as_str())
|
||||
.collect::<Vec<&str>>();
|
||||
let basic_with_quote: &[&str] =
|
||||
&basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
|
||||
|
||||
#[rustfmt::skip]
|
||||
let confs = &[
|
||||
/* first we bench each criterion alone */
|
||||
utils::Conf {
|
||||
|
@ -1,9 +1,7 @@
|
||||
use std::fs::File;
|
||||
use std::io::{Cursor, Read, Seek, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{env, fs};
|
||||
use std::{
|
||||
fs::File,
|
||||
io::{Cursor, Read, Seek, Write},
|
||||
};
|
||||
|
||||
use bytes::Bytes;
|
||||
use convert_case::{Case, Casing};
|
||||
@ -45,7 +43,10 @@ fn main() -> anyhow::Result<()> {
|
||||
)?;
|
||||
|
||||
if out_file.exists() {
|
||||
eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
|
||||
eprintln!(
|
||||
"The dataset {} already exists on the file system and will not be downloaded again",
|
||||
dataset
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
|
||||
@ -60,12 +61,8 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
|
||||
let bytes = reqwest::blocking::Client::builder()
|
||||
.timeout(None)
|
||||
.build()?
|
||||
.get(url)
|
||||
.send()?
|
||||
.bytes()?;
|
||||
let bytes =
|
||||
reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?;
|
||||
Ok(Cursor::new(bytes))
|
||||
}
|
||||
|
||||
|
@ -1,9 +1,8 @@
|
||||
use std::path::PathBuf;
|
||||
|
||||
use byte_unit::Byte;
|
||||
use heed::{Env, EnvOpenOptions, CompactionOption};
|
||||
use heed::{CompactionOption, Env, EnvOpenOptions};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use Command::*;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
@ -65,7 +64,7 @@ fn main() -> anyhow::Result<()> {
|
||||
use CompactionOption::*;
|
||||
let compaction = if enable_compaction { Enabled } else { Disabled };
|
||||
copy_main_database_to_stdout(env, compaction)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
mod update_store;
|
||||
|
||||
use std::{io, mem};
|
||||
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||
use std::fmt::Display;
|
||||
use std::fs::{create_dir_all, File};
|
||||
@ -10,16 +9,19 @@ use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use std::{io, mem};
|
||||
|
||||
use askama_warp::Template;
|
||||
use byte_unit::Byte;
|
||||
use either::Either;
|
||||
use flate2::read::GzDecoder;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
use futures::stream;
|
||||
use futures::{stream, FutureExt, StreamExt};
|
||||
use grenad::CompressionType;
|
||||
use heed::EnvOpenOptions;
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use milli::update::UpdateIndexingStep::*;
|
||||
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
|
||||
use milli::{obkv_to_json, FilterCondition, Index, MatchingWords, SearchResult};
|
||||
use once_cell::sync::OnceCell;
|
||||
use rayon::ThreadPool;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@ -28,12 +30,9 @@ use structopt::StructOpt;
|
||||
use tokio::fs::File as TFile;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::sync::broadcast;
|
||||
use warp::{Filter, http::Response};
|
||||
use warp::filters::ws::Message;
|
||||
|
||||
use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult};
|
||||
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
|
||||
use milli::update::UpdateIndexingStep::*;
|
||||
use warp::http::Response;
|
||||
use warp::Filter;
|
||||
|
||||
use self::update_store::UpdateStore;
|
||||
|
||||
@ -149,25 +148,28 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
|
||||
for (word, token) in analyzed.reconstruct() {
|
||||
if token.is_word() {
|
||||
let to_highlight = matching_words.matching_bytes(token.text()).is_some();
|
||||
if to_highlight { string.push_str("<mark>") }
|
||||
if to_highlight {
|
||||
string.push_str("<mark>")
|
||||
}
|
||||
string.push_str(word);
|
||||
if to_highlight { string.push_str("</mark>") }
|
||||
if to_highlight {
|
||||
string.push_str("</mark>")
|
||||
}
|
||||
} else {
|
||||
string.push_str(word);
|
||||
}
|
||||
}
|
||||
Value::String(string)
|
||||
}
|
||||
Value::Array(values) => {
|
||||
Value::Array(values.into_iter()
|
||||
.map(|v| self.highlight_value(v, matching_words))
|
||||
.collect())
|
||||
}
|
||||
Value::Object(object) => {
|
||||
Value::Object(object.into_iter()
|
||||
Value::Array(values) => Value::Array(
|
||||
values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(),
|
||||
),
|
||||
Value::Object(object) => Value::Object(
|
||||
object
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k, self.highlight_value(v, matching_words)))
|
||||
.collect())
|
||||
}
|
||||
.collect(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
@ -236,12 +238,7 @@ enum UpdateMeta {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
enum UpdateMetaProgress {
|
||||
DocumentsAddition {
|
||||
step: usize,
|
||||
total_steps: usize,
|
||||
current: usize,
|
||||
total: Option<usize>,
|
||||
},
|
||||
DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option<usize> },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
@ -342,157 +339,185 @@ async fn main() -> anyhow::Result<()> {
|
||||
update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize);
|
||||
update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size);
|
||||
update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type);
|
||||
update_builder.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes());
|
||||
update_builder
|
||||
.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes());
|
||||
|
||||
let before_update = Instant::now();
|
||||
// we extract the update type and execute the update itself.
|
||||
let result: anyhow::Result<()> = match meta {
|
||||
UpdateMeta::DocumentsAddition { method, format, encoding } => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned);
|
||||
let result: anyhow::Result<()> =
|
||||
match meta {
|
||||
UpdateMeta::DocumentsAddition { method, format, encoding } => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned);
|
||||
|
||||
match format.as_str() {
|
||||
"csv" => builder.update_format(UpdateFormat::Csv),
|
||||
"json" => builder.update_format(UpdateFormat::Json),
|
||||
"json-stream" => builder.update_format(UpdateFormat::JsonStream),
|
||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
||||
};
|
||||
|
||||
match method.as_str() {
|
||||
"replace" => builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments),
|
||||
"update" => builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments),
|
||||
otherwise => panic!("invalid indexing method {:?}", otherwise),
|
||||
};
|
||||
|
||||
let reader = match encoding.as_deref() {
|
||||
Some("gzip") => Box::new(GzDecoder::new(content)),
|
||||
None => Box::new(content) as Box<dyn io::Read>,
|
||||
otherwise => panic!("invalid encoding format {:?}", otherwise),
|
||||
};
|
||||
|
||||
let result = builder.execute(reader, |indexing_step, update_id| {
|
||||
let (current, total) = match indexing_step {
|
||||
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
|
||||
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
|
||||
IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
|
||||
MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
|
||||
match format.as_str() {
|
||||
"csv" => builder.update_format(UpdateFormat::Csv),
|
||||
"json" => builder.update_format(UpdateFormat::Json),
|
||||
"json-stream" => builder.update_format(UpdateFormat::JsonStream),
|
||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
||||
};
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
|
||||
update_id,
|
||||
meta: UpdateMetaProgress::DocumentsAddition {
|
||||
step: indexing_step.step(),
|
||||
total_steps: indexing_step.number_of_steps(),
|
||||
current,
|
||||
total,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
match result {
|
||||
Ok(_) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
UpdateMeta::ClearDocuments => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let builder = update_builder.clear_documents(&mut wtxn, &index_cloned);
|
||||
|
||||
match builder.execute() {
|
||||
Ok(_count) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
UpdateMeta::Settings(settings) => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let mut builder = update_builder.settings(&mut wtxn, &index_cloned);
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.searchable_attributes {
|
||||
Setting::Set(searchable_attributes) => builder.set_searchable_fields(searchable_attributes),
|
||||
Setting::Reset => builder.reset_searchable_fields(),
|
||||
Setting::NotSet => ()
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.displayed_attributes {
|
||||
Setting::Set(displayed_attributes) => builder.set_displayed_fields(displayed_attributes),
|
||||
Setting::Reset => builder.reset_displayed_fields(),
|
||||
Setting::NotSet => ()
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.filterable_attributes {
|
||||
Setting::Set(filterable_attributes) => builder.set_filterable_fields(filterable_attributes),
|
||||
Setting::Reset => builder.reset_filterable_fields(),
|
||||
Setting::NotSet => ()
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.criteria {
|
||||
Setting::Set(criteria) => builder.set_criteria(criteria),
|
||||
Setting::Reset => builder.reset_criteria(),
|
||||
Setting::NotSet => ()
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.stop_words {
|
||||
Setting::Set(stop_words) => builder.set_stop_words(stop_words),
|
||||
Setting::Reset => builder.reset_stop_words(),
|
||||
Setting::NotSet => ()
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.synonyms {
|
||||
Setting::Set(synonyms) => builder.set_synonyms(synonyms),
|
||||
Setting::Reset => builder.reset_synonyms(),
|
||||
Setting::NotSet => ()
|
||||
}
|
||||
|
||||
let result = builder.execute(|indexing_step, update_id| {
|
||||
let (current, total) = match indexing_step {
|
||||
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
|
||||
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
|
||||
IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
|
||||
MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
|
||||
match method.as_str() {
|
||||
"replace" => builder
|
||||
.index_documents_method(IndexDocumentsMethod::ReplaceDocuments),
|
||||
"update" => builder
|
||||
.index_documents_method(IndexDocumentsMethod::UpdateDocuments),
|
||||
otherwise => panic!("invalid indexing method {:?}", otherwise),
|
||||
};
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
|
||||
update_id,
|
||||
meta: UpdateMetaProgress::DocumentsAddition {
|
||||
step: indexing_step.step(),
|
||||
total_steps: indexing_step.number_of_steps(),
|
||||
current,
|
||||
total,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
match result {
|
||||
Ok(_count) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e.into()),
|
||||
let reader = match encoding.as_deref() {
|
||||
Some("gzip") => Box::new(GzDecoder::new(content)),
|
||||
None => Box::new(content) as Box<dyn io::Read>,
|
||||
otherwise => panic!("invalid encoding format {:?}", otherwise),
|
||||
};
|
||||
|
||||
let result = builder.execute(reader, |indexing_step, update_id| {
|
||||
let (current, total) = match indexing_step {
|
||||
TransformFromUserIntoGenericFormat { documents_seen } => {
|
||||
(documents_seen, None)
|
||||
}
|
||||
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
|
||||
(documents_seen, Some(total_documents))
|
||||
}
|
||||
IndexDocuments { documents_seen, total_documents } => {
|
||||
(documents_seen, Some(total_documents))
|
||||
}
|
||||
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
|
||||
(databases_seen, Some(total_databases))
|
||||
}
|
||||
};
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
|
||||
update_id,
|
||||
meta: UpdateMetaProgress::DocumentsAddition {
|
||||
step: indexing_step.step(),
|
||||
total_steps: indexing_step.number_of_steps(),
|
||||
current,
|
||||
total,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
match result {
|
||||
Ok(_) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
UpdateMeta::Facets(levels) => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let mut builder = update_builder.facets(&mut wtxn, &index_cloned);
|
||||
if let Some(value) = levels.level_group_size {
|
||||
builder.level_group_size(value);
|
||||
UpdateMeta::ClearDocuments => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let builder = update_builder.clear_documents(&mut wtxn, &index_cloned);
|
||||
|
||||
match builder.execute() {
|
||||
Ok(_count) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
if let Some(value) = levels.min_level_size {
|
||||
builder.min_level_size(value);
|
||||
UpdateMeta::Settings(settings) => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let mut builder = update_builder.settings(&mut wtxn, &index_cloned);
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.searchable_attributes {
|
||||
Setting::Set(searchable_attributes) => {
|
||||
builder.set_searchable_fields(searchable_attributes)
|
||||
}
|
||||
Setting::Reset => builder.reset_searchable_fields(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.displayed_attributes {
|
||||
Setting::Set(displayed_attributes) => {
|
||||
builder.set_displayed_fields(displayed_attributes)
|
||||
}
|
||||
Setting::Reset => builder.reset_displayed_fields(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.filterable_attributes {
|
||||
Setting::Set(filterable_attributes) => {
|
||||
builder.set_filterable_fields(filterable_attributes)
|
||||
}
|
||||
Setting::Reset => builder.reset_filterable_fields(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.criteria {
|
||||
Setting::Set(criteria) => builder.set_criteria(criteria),
|
||||
Setting::Reset => builder.reset_criteria(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.stop_words {
|
||||
Setting::Set(stop_words) => builder.set_stop_words(stop_words),
|
||||
Setting::Reset => builder.reset_stop_words(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
// We transpose the settings JSON struct into a real setting update.
|
||||
match settings.synonyms {
|
||||
Setting::Set(synonyms) => builder.set_synonyms(synonyms),
|
||||
Setting::Reset => builder.reset_synonyms(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
let result = builder.execute(|indexing_step, update_id| {
|
||||
let (current, total) = match indexing_step {
|
||||
TransformFromUserIntoGenericFormat { documents_seen } => {
|
||||
(documents_seen, None)
|
||||
}
|
||||
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
|
||||
(documents_seen, Some(total_documents))
|
||||
}
|
||||
IndexDocuments { documents_seen, total_documents } => {
|
||||
(documents_seen, Some(total_documents))
|
||||
}
|
||||
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
|
||||
(databases_seen, Some(total_databases))
|
||||
}
|
||||
};
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
|
||||
update_id,
|
||||
meta: UpdateMetaProgress::DocumentsAddition {
|
||||
step: indexing_step.step(),
|
||||
total_steps: indexing_step.number_of_steps(),
|
||||
current,
|
||||
total,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
match result {
|
||||
Ok(_count) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
match builder.execute() {
|
||||
Ok(()) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e.into()),
|
||||
UpdateMeta::Facets(levels) => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let mut builder = update_builder.facets(&mut wtxn, &index_cloned);
|
||||
if let Some(value) = levels.level_group_size {
|
||||
builder.level_group_size(value);
|
||||
}
|
||||
if let Some(value) = levels.min_level_size {
|
||||
builder.min_level_size(value);
|
||||
}
|
||||
match builder.execute() {
|
||||
Ok(()) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
let meta = match result {
|
||||
Ok(()) => format!("valid update content processed in {:.02?}", before_update.elapsed()),
|
||||
Ok(()) => {
|
||||
format!("valid update content processed in {:.02?}", before_update.elapsed())
|
||||
}
|
||||
Err(e) => format!("error while processing update content: {:?}", e),
|
||||
};
|
||||
|
||||
@ -500,7 +525,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
let _ = update_status_sender_cloned.send(processed);
|
||||
|
||||
Ok(meta)
|
||||
})?;
|
||||
},
|
||||
)?;
|
||||
|
||||
// The database name will not change.
|
||||
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
|
||||
@ -512,15 +538,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
let db_name_cloned = db_name.clone();
|
||||
let lmdb_path_cloned = lmdb_path.clone();
|
||||
let index_cloned = index.clone();
|
||||
let dash_html_route = warp::filters::method::get()
|
||||
.and(warp::filters::path::end())
|
||||
.map(move || {
|
||||
let dash_html_route =
|
||||
warp::filters::method::get().and(warp::filters::path::end()).map(move || {
|
||||
// We retrieve the database size.
|
||||
let db_size = File::open(lmdb_path_cloned.clone())
|
||||
.unwrap()
|
||||
.metadata()
|
||||
.unwrap()
|
||||
.len() as usize;
|
||||
let db_size =
|
||||
File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() as usize;
|
||||
|
||||
// And the number of documents in the database.
|
||||
let rtxn = index_cloned.read_txn().unwrap();
|
||||
@ -537,111 +559,105 @@ async fn main() -> anyhow::Result<()> {
|
||||
.and(warp::path!("updates"))
|
||||
.map(move |header: String| {
|
||||
let update_store = update_store_cloned.clone();
|
||||
let mut updates = update_store.iter_metas(|processed, aborted, pending| {
|
||||
let mut updates = Vec::<UpdateStatus<_, UpdateMetaProgress, _>>::new();
|
||||
for result in processed {
|
||||
let (uid, meta) = result?;
|
||||
updates.push(UpdateStatus::Processed { update_id: uid.get(), meta });
|
||||
}
|
||||
for result in aborted {
|
||||
let (uid, meta) = result?;
|
||||
updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta });
|
||||
}
|
||||
for result in pending {
|
||||
let (uid, meta) = result?;
|
||||
updates.push(UpdateStatus::Pending { update_id: uid.get(), meta });
|
||||
}
|
||||
Ok(updates)
|
||||
}).unwrap();
|
||||
let mut updates = update_store
|
||||
.iter_metas(|processed, aborted, pending| {
|
||||
let mut updates = Vec::<UpdateStatus<_, UpdateMetaProgress, _>>::new();
|
||||
for result in processed {
|
||||
let (uid, meta) = result?;
|
||||
updates.push(UpdateStatus::Processed { update_id: uid.get(), meta });
|
||||
}
|
||||
for result in aborted {
|
||||
let (uid, meta) = result?;
|
||||
updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta });
|
||||
}
|
||||
for result in pending {
|
||||
let (uid, meta) = result?;
|
||||
updates.push(UpdateStatus::Pending { update_id: uid.get(), meta });
|
||||
}
|
||||
Ok(updates)
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse());
|
||||
|
||||
if header.contains("text/html") {
|
||||
// We retrieve the database size.
|
||||
let db_size = File::open(lmdb_path_cloned.clone())
|
||||
.unwrap()
|
||||
.metadata()
|
||||
.unwrap()
|
||||
.len() as usize;
|
||||
let db_size =
|
||||
File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len()
|
||||
as usize;
|
||||
|
||||
// And the number of documents in the database.
|
||||
let rtxn = index_cloned.read_txn().unwrap();
|
||||
let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize;
|
||||
|
||||
let template = UpdatesTemplate {
|
||||
db_name: db_name.clone(),
|
||||
db_size,
|
||||
docs_count,
|
||||
updates,
|
||||
};
|
||||
let template =
|
||||
UpdatesTemplate { db_name: db_name.clone(), db_size, docs_count, updates };
|
||||
Box::new(template) as Box<dyn warp::Reply>
|
||||
} else {
|
||||
Box::new(warp::reply::json(&updates))
|
||||
}
|
||||
});
|
||||
|
||||
let dash_bulma_route = warp::filters::method::get()
|
||||
.and(warp::path!("bulma.min.css"))
|
||||
.map(|| Response::builder()
|
||||
.header("content-type", "text/css; charset=utf-8")
|
||||
.body(include_str!("../public/bulma.min.css"))
|
||||
);
|
||||
let dash_bulma_route =
|
||||
warp::filters::method::get().and(warp::path!("bulma.min.css")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "text/css; charset=utf-8")
|
||||
.body(include_str!("../public/bulma.min.css"))
|
||||
});
|
||||
|
||||
let dash_bulma_dark_route = warp::filters::method::get()
|
||||
.and(warp::path!("bulma-prefers-dark.min.css"))
|
||||
.map(|| Response::builder()
|
||||
.header("content-type", "text/css; charset=utf-8")
|
||||
.body(include_str!("../public/bulma-prefers-dark.min.css"))
|
||||
);
|
||||
let dash_bulma_dark_route =
|
||||
warp::filters::method::get().and(warp::path!("bulma-prefers-dark.min.css")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "text/css; charset=utf-8")
|
||||
.body(include_str!("../public/bulma-prefers-dark.min.css"))
|
||||
});
|
||||
|
||||
let dash_style_route = warp::filters::method::get()
|
||||
.and(warp::path!("style.css"))
|
||||
.map(|| Response::builder()
|
||||
let dash_style_route = warp::filters::method::get().and(warp::path!("style.css")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "text/css; charset=utf-8")
|
||||
.body(include_str!("../public/style.css"))
|
||||
);
|
||||
});
|
||||
|
||||
let dash_jquery_route = warp::filters::method::get()
|
||||
.and(warp::path!("jquery-3.4.1.min.js"))
|
||||
.map(|| Response::builder()
|
||||
.header("content-type", "application/javascript; charset=utf-8")
|
||||
.body(include_str!("../public/jquery-3.4.1.min.js"))
|
||||
);
|
||||
let dash_jquery_route =
|
||||
warp::filters::method::get().and(warp::path!("jquery-3.4.1.min.js")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "application/javascript; charset=utf-8")
|
||||
.body(include_str!("../public/jquery-3.4.1.min.js"))
|
||||
});
|
||||
|
||||
let dash_filesize_route = warp::filters::method::get()
|
||||
.and(warp::path!("filesize.min.js"))
|
||||
.map(|| Response::builder()
|
||||
.header("content-type", "application/javascript; charset=utf-8")
|
||||
.body(include_str!("../public/filesize.min.js"))
|
||||
);
|
||||
let dash_filesize_route =
|
||||
warp::filters::method::get().and(warp::path!("filesize.min.js")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "application/javascript; charset=utf-8")
|
||||
.body(include_str!("../public/filesize.min.js"))
|
||||
});
|
||||
|
||||
let dash_script_route = warp::filters::method::get()
|
||||
.and(warp::path!("script.js"))
|
||||
.map(|| Response::builder()
|
||||
let dash_script_route = warp::filters::method::get().and(warp::path!("script.js")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "application/javascript; charset=utf-8")
|
||||
.body(include_str!("../public/script.js"))
|
||||
);
|
||||
});
|
||||
|
||||
let updates_script_route = warp::filters::method::get()
|
||||
.and(warp::path!("updates-script.js"))
|
||||
.map(|| Response::builder()
|
||||
.header("content-type", "application/javascript; charset=utf-8")
|
||||
.body(include_str!("../public/updates-script.js"))
|
||||
);
|
||||
let updates_script_route =
|
||||
warp::filters::method::get().and(warp::path!("updates-script.js")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "application/javascript; charset=utf-8")
|
||||
.body(include_str!("../public/updates-script.js"))
|
||||
});
|
||||
|
||||
let dash_logo_white_route = warp::filters::method::get()
|
||||
.and(warp::path!("logo-white.svg"))
|
||||
.map(|| Response::builder()
|
||||
.header("content-type", "image/svg+xml")
|
||||
.body(include_str!("../public/logo-white.svg"))
|
||||
);
|
||||
let dash_logo_white_route =
|
||||
warp::filters::method::get().and(warp::path!("logo-white.svg")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "image/svg+xml")
|
||||
.body(include_str!("../public/logo-white.svg"))
|
||||
});
|
||||
|
||||
let dash_logo_black_route = warp::filters::method::get()
|
||||
.and(warp::path!("logo-black.svg"))
|
||||
.map(|| Response::builder()
|
||||
.header("content-type", "image/svg+xml")
|
||||
.body(include_str!("../public/logo-black.svg"))
|
||||
);
|
||||
let dash_logo_black_route =
|
||||
warp::filters::method::get().and(warp::path!("logo-black.svg")).map(|| {
|
||||
Response::builder()
|
||||
.header("content-type", "image/svg+xml")
|
||||
.body(include_str!("../public/logo-black.svg"))
|
||||
});
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
@ -719,7 +735,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
search.filter(condition);
|
||||
}
|
||||
|
||||
let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap();
|
||||
let SearchResult { matching_words, candidates, documents_ids } =
|
||||
search.execute().unwrap();
|
||||
|
||||
let number_of_candidates = candidates.len();
|
||||
let facets = if query.facet_distribution == Some(true) {
|
||||
@ -745,17 +762,18 @@ async fn main() -> anyhow::Result<()> {
|
||||
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
|
||||
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
|
||||
if !disable_highlighting {
|
||||
highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight);
|
||||
highlighter.highlight_record(
|
||||
&mut object,
|
||||
&matching_words,
|
||||
&attributes_to_highlight,
|
||||
);
|
||||
}
|
||||
|
||||
documents.push(object);
|
||||
}
|
||||
|
||||
let answer = Answer {
|
||||
documents,
|
||||
number_of_candidates,
|
||||
facets: facets.unwrap_or_default(),
|
||||
};
|
||||
let answer =
|
||||
Answer { documents, number_of_candidates, facets: facets.unwrap_or_default() };
|
||||
|
||||
Response::builder()
|
||||
.header("Content-Type", "application/json")
|
||||
@ -764,9 +782,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
});
|
||||
|
||||
let index_cloned = index.clone();
|
||||
let document_route = warp::filters::method::get()
|
||||
.and(warp::path!("document" / String))
|
||||
.map(move |id: String| {
|
||||
let document_route = warp::filters::method::get().and(warp::path!("document" / String)).map(
|
||||
move |id: String| {
|
||||
let index = index_cloned.clone();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
@ -780,30 +797,31 @@ async fn main() -> anyhow::Result<()> {
|
||||
match external_documents_ids.get(&id) {
|
||||
Some(document_id) => {
|
||||
let document_id = document_id as u32;
|
||||
let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap();
|
||||
let (_, obkv) =
|
||||
index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap();
|
||||
let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
|
||||
|
||||
Response::builder()
|
||||
.header("Content-Type", "application/json")
|
||||
.body(serde_json::to_string(&document).unwrap())
|
||||
}
|
||||
None => {
|
||||
Response::builder()
|
||||
.status(404)
|
||||
.body(format!("Document with id {:?} not found.", id))
|
||||
}
|
||||
None => Response::builder()
|
||||
.status(404)
|
||||
.body(format!("Document with id {:?} not found.", id)),
|
||||
}
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
async fn buf_stream(
|
||||
update_store: Arc<UpdateStore<UpdateMeta, String>>,
|
||||
update_status_sender: broadcast::Sender<UpdateStatus<UpdateMeta, UpdateMetaProgress, String>>,
|
||||
update_status_sender: broadcast::Sender<
|
||||
UpdateStatus<UpdateMeta, UpdateMetaProgress, String>,
|
||||
>,
|
||||
update_method: Option<String>,
|
||||
update_format: UpdateFormat,
|
||||
encoding: Option<String>,
|
||||
mut stream: impl futures::Stream<Item=Result<impl bytes::Buf, warp::Error>> + Unpin,
|
||||
) -> Result<impl warp::Reply, warp::Rejection>
|
||||
{
|
||||
mut stream: impl futures::Stream<Item = Result<impl bytes::Buf, warp::Error>> + Unpin,
|
||||
) -> Result<impl warp::Reply, warp::Rejection> {
|
||||
let file = tokio::task::block_in_place(tempfile::tempfile).unwrap();
|
||||
let mut file = TFile::from_std(file);
|
||||
|
||||
@ -869,9 +887,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let update_store_cloned = update_store.clone();
|
||||
let update_status_sender_cloned = update_status_sender.clone();
|
||||
let clearing_route = warp::filters::method::post()
|
||||
.and(warp::path!("clear-documents"))
|
||||
.map(move || {
|
||||
let clearing_route =
|
||||
warp::filters::method::post().and(warp::path!("clear-documents")).map(move || {
|
||||
let meta = UpdateMeta::ClearDocuments;
|
||||
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
|
||||
@ -919,9 +936,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let update_store_cloned = update_store.clone();
|
||||
let update_status_sender_cloned = update_status_sender.clone();
|
||||
let abort_pending_updates_route = warp::filters::method::delete()
|
||||
.and(warp::path!("updates"))
|
||||
.map(move || {
|
||||
let abort_pending_updates_route =
|
||||
warp::filters::method::delete().and(warp::path!("updates")).map(move || {
|
||||
let updates = update_store_cloned.abort_pendings().unwrap();
|
||||
for (update_id, meta) in updates {
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta });
|
||||
@ -930,25 +946,22 @@ async fn main() -> anyhow::Result<()> {
|
||||
warp::reply()
|
||||
});
|
||||
|
||||
let update_ws_route = warp::ws()
|
||||
.and(warp::path!("updates" / "ws"))
|
||||
.map(move |ws: warp::ws::Ws| {
|
||||
let update_ws_route =
|
||||
warp::ws().and(warp::path!("updates" / "ws")).map(move |ws: warp::ws::Ws| {
|
||||
// And then our closure will be called when it completes...
|
||||
let update_status_receiver = update_status_sender.subscribe();
|
||||
ws.on_upgrade(|websocket| {
|
||||
// Just echo all updates messages...
|
||||
update_status_receiver
|
||||
.into_stream()
|
||||
.flat_map(|result| {
|
||||
match result {
|
||||
Ok(status) => {
|
||||
let msg = serde_json::to_string(&status).unwrap();
|
||||
stream::iter(Some(Ok(Message::text(msg))))
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("channel error: {:?}", e);
|
||||
stream::iter(None)
|
||||
}
|
||||
.flat_map(|result| match result {
|
||||
Ok(status) => {
|
||||
let msg = serde_json::to_string(&status).unwrap();
|
||||
stream::iter(Some(Ok(Message::text(msg))))
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("channel error: {:?}", e);
|
||||
stream::iter(None)
|
||||
}
|
||||
})
|
||||
.forward(websocket)
|
||||
@ -988,10 +1001,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use maplit::{btreeset,hashmap, hashset};
|
||||
use serde_test::{assert_tokens, Token};
|
||||
|
||||
use maplit::{btreeset, hashmap, hashset};
|
||||
use milli::update::Setting;
|
||||
use serde_test::{assert_tokens, Token};
|
||||
|
||||
use crate::Settings;
|
||||
|
||||
@ -1000,50 +1012,53 @@ mod tests {
|
||||
let settings = Settings {
|
||||
displayed_attributes: Setting::Set(vec!["name".to_string()]),
|
||||
searchable_attributes: Setting::Set(vec!["age".to_string()]),
|
||||
filterable_attributes: Setting::Set(hashset!{ "age".to_string() }),
|
||||
filterable_attributes: Setting::Set(hashset! { "age".to_string() }),
|
||||
criteria: Setting::Set(vec!["asc(age)".to_string()]),
|
||||
stop_words: Setting::Set(btreeset! { "and".to_string() }),
|
||||
synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] })
|
||||
synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }),
|
||||
};
|
||||
|
||||
assert_tokens(&settings, &[
|
||||
Token::Struct { name: "Settings", len: 6 },
|
||||
Token::Str("displayedAttributes"),
|
||||
Token::Some,
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("name"),
|
||||
Token::SeqEnd,
|
||||
Token::Str("searchableAttributes"),
|
||||
Token::Some,
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("age"),
|
||||
Token::SeqEnd,
|
||||
Token::Str("facetedAttributes"),
|
||||
Token::Some,
|
||||
Token::Map { len: Some(1) },
|
||||
Token::Str("age"),
|
||||
Token::Str("integer"),
|
||||
Token::MapEnd,
|
||||
Token::Str("criteria"),
|
||||
Token::Some,
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("asc(age)"),
|
||||
Token::SeqEnd,
|
||||
Token::Str("stopWords"),
|
||||
Token::Some,
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("and"),
|
||||
Token::SeqEnd,
|
||||
Token::Str("synonyms"),
|
||||
Token::Some,
|
||||
Token::Map { len: Some(1) },
|
||||
Token::Str("alex"),
|
||||
Token::Seq {len: Some(1) },
|
||||
Token::Str("alexey"),
|
||||
Token::SeqEnd,
|
||||
Token::MapEnd,
|
||||
Token::StructEnd,
|
||||
]);
|
||||
assert_tokens(
|
||||
&settings,
|
||||
&[
|
||||
Token::Struct { name: "Settings", len: 6 },
|
||||
Token::Str("displayedAttributes"),
|
||||
Token::Some,
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("name"),
|
||||
Token::SeqEnd,
|
||||
Token::Str("searchableAttributes"),
|
||||
Token::Some,
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("age"),
|
||||
Token::SeqEnd,
|
||||
Token::Str("facetedAttributes"),
|
||||
Token::Some,
|
||||
Token::Map { len: Some(1) },
|
||||
Token::Str("age"),
|
||||
Token::Str("integer"),
|
||||
Token::MapEnd,
|
||||
Token::Str("criteria"),
|
||||
Token::Some,
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("asc(age)"),
|
||||
Token::SeqEnd,
|
||||
Token::Str("stopWords"),
|
||||
Token::Some,
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("and"),
|
||||
Token::SeqEnd,
|
||||
Token::Str("synonyms"),
|
||||
Token::Some,
|
||||
Token::Map { len: Some(1) },
|
||||
Token::Str("alex"),
|
||||
Token::Seq { len: Some(1) },
|
||||
Token::Str("alexey"),
|
||||
Token::SeqEnd,
|
||||
Token::MapEnd,
|
||||
Token::StructEnd,
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -1057,22 +1072,25 @@ mod tests {
|
||||
synonyms: Setting::Reset,
|
||||
};
|
||||
|
||||
assert_tokens(&settings, &[
|
||||
Token::Struct { name: "Settings", len: 6 },
|
||||
Token::Str("displayedAttributes"),
|
||||
Token::None,
|
||||
Token::Str("searchableAttributes"),
|
||||
Token::None,
|
||||
Token::Str("facetedAttributes"),
|
||||
Token::None,
|
||||
Token::Str("criteria"),
|
||||
Token::None,
|
||||
Token::Str("stopWords"),
|
||||
Token::None,
|
||||
Token::Str("synonyms"),
|
||||
Token::None,
|
||||
Token::StructEnd,
|
||||
]);
|
||||
assert_tokens(
|
||||
&settings,
|
||||
&[
|
||||
Token::Struct { name: "Settings", len: 6 },
|
||||
Token::Str("displayedAttributes"),
|
||||
Token::None,
|
||||
Token::Str("searchableAttributes"),
|
||||
Token::None,
|
||||
Token::Str("facetedAttributes"),
|
||||
Token::None,
|
||||
Token::Str("criteria"),
|
||||
Token::None,
|
||||
Token::Str("stopWords"),
|
||||
Token::None,
|
||||
Token::Str("synonyms"),
|
||||
Token::None,
|
||||
Token::StructEnd,
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -1086,9 +1104,6 @@ mod tests {
|
||||
synonyms: Setting::NotSet,
|
||||
};
|
||||
|
||||
assert_tokens(&settings, &[
|
||||
Token::Struct { name: "Settings", len: 0 },
|
||||
Token::StructEnd,
|
||||
]);
|
||||
assert_tokens(&settings, &[Token::Struct { name: "Settings", len: 0 }, Token::StructEnd]);
|
||||
}
|
||||
}
|
||||
|
@ -4,9 +4,9 @@ use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use heed::types::{OwnedType, DecodeIgnore, SerdeJson, ByteSlice};
|
||||
use heed::{EnvOpenOptions, Env, Database};
|
||||
use serde::{Serialize, Deserialize};
|
||||
use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson};
|
||||
use heed::{Database, Env, EnvOpenOptions};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
||||
|
||||
@ -25,7 +25,9 @@ pub trait UpdateHandler<M, N> {
|
||||
}
|
||||
|
||||
impl<M, N, F> UpdateHandler<M, N> for F
|
||||
where F: FnMut(u64, M, &[u8]) -> heed::Result<N> + Send + 'static {
|
||||
where
|
||||
F: FnMut(u64, M, &[u8]) -> heed::Result<N> + Send + 'static,
|
||||
{
|
||||
fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result<N> {
|
||||
self(update_id, meta, content)
|
||||
}
|
||||
@ -82,26 +84,17 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
|
||||
|
||||
/// Returns the new biggest id to use to store the new update.
|
||||
fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result<u64> {
|
||||
let last_pending = self.pending_meta
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.last(txn)?
|
||||
.map(|(k, _)| k.get());
|
||||
let last_pending =
|
||||
self.pending_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
|
||||
|
||||
let last_processed = self.processed_meta
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.last(txn)?
|
||||
.map(|(k, _)| k.get());
|
||||
let last_processed =
|
||||
self.processed_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
|
||||
|
||||
let last_aborted = self.aborted_meta
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.last(txn)?
|
||||
.map(|(k, _)| k.get());
|
||||
let last_aborted =
|
||||
self.aborted_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
|
||||
|
||||
let last_update_id = [last_pending, last_processed, last_aborted]
|
||||
.iter()
|
||||
.copied()
|
||||
.flatten()
|
||||
.max();
|
||||
let last_update_id =
|
||||
[last_pending, last_processed, last_aborted].iter().copied().flatten().max();
|
||||
|
||||
match last_update_id {
|
||||
Some(last_id) => Ok(last_id + 1),
|
||||
@ -112,7 +105,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
|
||||
/// Registers the update content in the pending store and the meta
|
||||
/// into the pending-meta store. Returns the new unique update id.
|
||||
pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result<u64>
|
||||
where M: Serialize,
|
||||
where
|
||||
M: Serialize,
|
||||
{
|
||||
let mut wtxn = self.env.write_txn()?;
|
||||
|
||||
@ -152,9 +146,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
|
||||
// a reader while processing it, not a writer.
|
||||
match first_meta {
|
||||
Some((first_id, first_meta)) => {
|
||||
let first_content = self.pending
|
||||
.get(&rtxn, &first_id)?
|
||||
.expect("associated update content");
|
||||
let first_content =
|
||||
self.pending.get(&rtxn, &first_id)?.expect("associated update content");
|
||||
|
||||
// Process the pending update using the provided user function.
|
||||
let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?;
|
||||
@ -170,15 +163,16 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
|
||||
wtxn.commit()?;
|
||||
|
||||
Ok(Some((first_id.get(), new_meta)))
|
||||
},
|
||||
None => Ok(None)
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// The id and metadata of the update that is currently being processed,
|
||||
/// `None` if no update is being processed.
|
||||
pub fn processing_update(&self) -> heed::Result<Option<(u64, M)>>
|
||||
where M: for<'a> Deserialize<'a>,
|
||||
where
|
||||
M: for<'a> Deserialize<'a>,
|
||||
{
|
||||
let rtxn = self.env.read_txn()?;
|
||||
match self.pending_meta.first(&rtxn)? {
|
||||
@ -242,7 +236,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
|
||||
/// that has already been processed or which doesn't actually exist, will
|
||||
/// return `None`.
|
||||
pub fn abort_update(&self, update_id: u64) -> heed::Result<Option<M>>
|
||||
where M: Serialize + for<'a> Deserialize<'a>,
|
||||
where
|
||||
M: Serialize + for<'a> Deserialize<'a>,
|
||||
{
|
||||
let mut wtxn = self.env.write_txn()?;
|
||||
let key = BEU64::new(update_id);
|
||||
@ -269,7 +264,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
|
||||
/// Aborts all the pending updates, and not the one being currently processed.
|
||||
/// Returns the update metas and ids that were successfully aborted.
|
||||
pub fn abort_pendings(&self) -> heed::Result<Vec<(u64, M)>>
|
||||
where M: Serialize + for<'a> Deserialize<'a>,
|
||||
where
|
||||
M: Serialize + for<'a> Deserialize<'a>,
|
||||
{
|
||||
let mut wtxn = self.env.write_txn()?;
|
||||
let mut aborted_updates = Vec::new();
|
||||
@ -303,17 +299,19 @@ pub enum UpdateStatusMeta<M, N> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::thread;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let options = EnvOpenOptions::new();
|
||||
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| {
|
||||
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| {
|
||||
Ok(meta + " processed")
|
||||
}).unwrap();
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let meta = String::from("kiki");
|
||||
let update_id = update_store.register_update(&meta, &[]).unwrap();
|
||||
@ -329,10 +327,11 @@ mod tests {
|
||||
fn long_running_update() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let options = EnvOpenOptions::new();
|
||||
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| {
|
||||
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| {
|
||||
thread::sleep(Duration::from_millis(400));
|
||||
Ok(meta + " processed")
|
||||
}).unwrap();
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let before_register = Instant::now();
|
||||
|
||||
|
@ -1,16 +1,14 @@
|
||||
use std::fmt::Write as _;
|
||||
use std::path::PathBuf;
|
||||
use std::{str, io, fmt};
|
||||
use std::{fmt, io, str};
|
||||
|
||||
use anyhow::Context;
|
||||
use byte_unit::Byte;
|
||||
use heed::EnvOpenOptions;
|
||||
use structopt::StructOpt;
|
||||
|
||||
use milli::facet::FacetType;
|
||||
use milli::index::db_name::*;
|
||||
use milli::{Index, TreeLevel};
|
||||
|
||||
use structopt::StructOpt;
|
||||
use Command::*;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
@ -257,53 +255,55 @@ fn main() -> anyhow::Result<()> {
|
||||
WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words),
|
||||
WordsPrefixesDocids { full_display, prefixes } => {
|
||||
words_prefixes_docids(&index, &rtxn, !full_display, prefixes)
|
||||
},
|
||||
}
|
||||
FacetNumbersDocids { full_display, field_name } => {
|
||||
facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name)
|
||||
},
|
||||
}
|
||||
FacetStringsDocids { full_display, field_name } => {
|
||||
facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name)
|
||||
},
|
||||
}
|
||||
WordsLevelPositionsDocids { full_display, words } => {
|
||||
words_level_positions_docids(&index, &rtxn, !full_display, words)
|
||||
},
|
||||
}
|
||||
WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
|
||||
word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
|
||||
},
|
||||
}
|
||||
FieldIdWordCountDocids { full_display, field_name } => {
|
||||
field_id_word_count_docids(&index, &rtxn, !full_display, field_name)
|
||||
},
|
||||
}
|
||||
DocidsWordsPositions { full_display, internal_documents_ids } => {
|
||||
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
|
||||
},
|
||||
}
|
||||
FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name),
|
||||
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
|
||||
AverageNumberOfPositionsByWord => {
|
||||
average_number_of_positions_by_word(&index, &rtxn)
|
||||
},
|
||||
AverageNumberOfPositionsByWord => average_number_of_positions_by_word(&index, &rtxn),
|
||||
SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases),
|
||||
DatabaseStats { database } => database_stats(&index, &rtxn, &database),
|
||||
WordPairProximitiesDocids { full_display, word1, word2 } => {
|
||||
word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
|
||||
},
|
||||
}
|
||||
ExportWordsFst => export_words_fst(&index, &rtxn),
|
||||
ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn),
|
||||
ExportDocuments { internal_documents_ids } => {
|
||||
export_documents(&index, &rtxn, internal_documents_ids)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
|
||||
use std::collections::BinaryHeap;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
let mut heap = BinaryHeap::with_capacity(limit + 1);
|
||||
for result in index.word_docids.iter(rtxn)? {
|
||||
if limit == 0 { break }
|
||||
if limit == 0 {
|
||||
break;
|
||||
}
|
||||
let (word, docids) = result?;
|
||||
heap.push((Reverse(docids.len()), word));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
let stdout = io::stdout();
|
||||
@ -323,7 +323,7 @@ fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>(
|
||||
rtxn: &'txn heed::RoTxn,
|
||||
db: heed::Database<KC, DC>,
|
||||
field_id: u8,
|
||||
) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
|
||||
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
|
||||
where
|
||||
KC: heed::BytesDecode<'txn>,
|
||||
DC: heed::BytesDecode<'txn>,
|
||||
@ -347,7 +347,8 @@ fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) ->
|
||||
fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::BinaryHeap;
|
||||
use heed::types::{Str, ByteSlice};
|
||||
|
||||
use heed::types::{ByteSlice, Str};
|
||||
|
||||
let Index {
|
||||
env: _env,
|
||||
@ -387,71 +388,93 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
let words_fst = index.words_fst(rtxn)?;
|
||||
let length = words_fst.as_fst().as_bytes().len();
|
||||
heap.push(Reverse((length, format!("words-fst"), main_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
|
||||
// Fetch the word prefix FST
|
||||
let words_prefixes_fst = index.words_prefixes_fst(rtxn)?;
|
||||
let length = words_prefixes_fst.as_fst().as_bytes().len();
|
||||
heap.push(Reverse((length, format!("words-prefixes-fst"), main_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
|
||||
if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
|
||||
heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let (word, value) = result?;
|
||||
heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let (word, value) = result?;
|
||||
heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let ((docid, word), value) = result?;
|
||||
let key = format!("{} {}", docid, word);
|
||||
heap.push(Reverse((value.len(), key, docid_word_positions_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let ((word1, word2, prox), value) = result?;
|
||||
let key = format!("{} {} {}", word1, word2, prox);
|
||||
heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
for result in word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let ((word, prefix, prox), value) = result?;
|
||||
let key = format!("{} {} {}", word, prefix, prox);
|
||||
heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let ((word, level, left, right), value) = result?;
|
||||
let key = format!("{} {} {:?}", word, level, left..=right);
|
||||
heap.push(Reverse((value.len(), key, word_level_position_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let ((word, level, left, right), value) = result?;
|
||||
let key = format!("{} {} {:?}", word, level, left..=right);
|
||||
heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let ((field_id, word_count), docids) = result?;
|
||||
let key = format!("{} {}", field_id, word_count);
|
||||
heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
let faceted_fields = index.faceted_fields_ids(rtxn)?;
|
||||
@ -468,7 +491,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
write!(&mut output, " (level {})", level)?;
|
||||
let key = format!("{} {}", facet_name, output);
|
||||
heap.push(Reverse((value.len(), key, facet_id_f64_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
// List the facet strings of this facet id.
|
||||
@ -477,14 +502,18 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
let ((_fid, fvalue), value) = result?;
|
||||
let key = format!("{} {}", facet_name, fvalue);
|
||||
heap.push(Reverse((value.len(), key, facet_id_string_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let (id, value) = result?;
|
||||
heap.push(Reverse((value.len(), id.to_string(), documents_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
if heap.len() > limit {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -499,7 +528,12 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
Ok(wtr.flush()?)
|
||||
}
|
||||
|
||||
fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> {
|
||||
fn words_docids(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
words: Vec<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["word", "documents_ids"])?;
|
||||
@ -523,8 +557,7 @@ fn words_prefixes_docids(
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
prefixes: Vec<String>,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["prefix", "documents_ids"])?;
|
||||
@ -561,12 +594,12 @@ fn facet_values_docids(
|
||||
debug: bool,
|
||||
facet_type: FacetType,
|
||||
field_name: String,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||
let faceted_fields = index.faceted_fields_ids(&rtxn)?;
|
||||
|
||||
let field_id = fields_ids_map.id(&field_name)
|
||||
let field_id = fields_ids_map
|
||||
.id(&field_name)
|
||||
.with_context(|| format!("field {} not found", field_name))?;
|
||||
|
||||
if !faceted_fields.contains(&field_id) {
|
||||
@ -590,7 +623,7 @@ fn facet_values_docids(
|
||||
};
|
||||
wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?;
|
||||
}
|
||||
},
|
||||
}
|
||||
FacetType::String => {
|
||||
wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?;
|
||||
for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? {
|
||||
@ -614,8 +647,7 @@ fn words_level_positions_docids(
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
words: Vec<String>,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?;
|
||||
@ -653,8 +685,7 @@ fn word_prefixes_level_positions_docids(
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
prefixes: Vec<String>,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?;
|
||||
@ -691,21 +722,20 @@ fn field_id_word_count_docids(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
field_name: String
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
field_name: String,
|
||||
) -> anyhow::Result<()> {
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["field_name", "word_count", "docids"])?;
|
||||
|
||||
let field_id = index.fields_ids_map(rtxn)?
|
||||
let field_id = index
|
||||
.fields_ids_map(rtxn)?
|
||||
.id(&field_name)
|
||||
.with_context(|| format!("unknown field name: {}", &field_name))?;
|
||||
|
||||
let left = (field_id, 0);
|
||||
let right = (field_id, u8::max_value());
|
||||
let iter = index.field_id_word_count_docids
|
||||
.range(rtxn, &(left..=right))?;
|
||||
let iter = index.field_id_word_count_docids.range(rtxn, &(left..=right))?;
|
||||
|
||||
for result in iter {
|
||||
let ((_, word_count), docids) = result?;
|
||||
@ -725,8 +755,7 @@ fn docids_words_positions(
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
internal_ids: Vec<u32>,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["document_id", "word", "positions"])?;
|
||||
@ -734,9 +763,10 @@ fn docids_words_positions(
|
||||
let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
|
||||
Box::new(index.docid_word_positions.iter(rtxn)?)
|
||||
} else {
|
||||
let vec: heed::Result<Vec<_>> = internal_ids.into_iter().map(|id| {
|
||||
index.docid_word_positions.prefix_iter(rtxn, &(id, ""))
|
||||
}).collect();
|
||||
let vec: heed::Result<Vec<_>> = internal_ids
|
||||
.into_iter()
|
||||
.map(|id| index.docid_word_positions.prefix_iter(rtxn, &(id, "")))
|
||||
.collect();
|
||||
Box::new(vec?.into_iter().flatten())
|
||||
};
|
||||
|
||||
@ -757,7 +787,8 @@ fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) ->
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||
let faceted_fields = index.faceted_fields_ids(&rtxn)?;
|
||||
|
||||
let field_id = fields_ids_map.id(&field_name)
|
||||
let field_id = fields_ids_map
|
||||
.id(&field_name)
|
||||
.with_context(|| format!("field {} not found", field_name))?;
|
||||
|
||||
if !faceted_fields.contains(&field_id) {
|
||||
@ -808,9 +839,14 @@ fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -> anyhow::Result<()> {
|
||||
fn export_documents(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
internal_ids: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::io::{BufWriter, Write as _};
|
||||
use milli::{BEU32, obkv_to_json};
|
||||
|
||||
use milli::{obkv_to_json, BEU32};
|
||||
|
||||
let stdout = io::stdout();
|
||||
let mut out = BufWriter::new(stdout);
|
||||
@ -819,13 +855,13 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -
|
||||
let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect();
|
||||
|
||||
let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
|
||||
Box::new(index.documents.iter(rtxn)?.map(|result| {
|
||||
result.map(|(_id, obkv)| obkv)
|
||||
}))
|
||||
Box::new(index.documents.iter(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv)))
|
||||
} else {
|
||||
Box::new(internal_ids.into_iter().flat_map(|id| {
|
||||
index.documents.get(rtxn, &BEU32::new(id)).transpose()
|
||||
}))
|
||||
Box::new(
|
||||
internal_ids
|
||||
.into_iter()
|
||||
.flat_map(|id| index.documents.get(rtxn, &BEU32::new(id)).transpose()),
|
||||
)
|
||||
};
|
||||
|
||||
for result in iter {
|
||||
@ -842,26 +878,27 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -
|
||||
|
||||
fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
|
||||
use heed::types::DecodeIgnore;
|
||||
use milli::{DocumentId, BEU32StrCodec};
|
||||
use milli::{BEU32StrCodec, DocumentId};
|
||||
|
||||
let mut words_counts = Vec::new();
|
||||
let mut count = 0;
|
||||
let mut prev = None as Option<(DocumentId, u32)>;
|
||||
|
||||
let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?;
|
||||
let iter =
|
||||
index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?;
|
||||
for result in iter {
|
||||
let ((docid, _word), ()) = result?;
|
||||
|
||||
match prev.as_mut() {
|
||||
Some((prev_docid, prev_count)) if docid == *prev_docid => {
|
||||
*prev_count += 1;
|
||||
},
|
||||
}
|
||||
Some((prev_docid, prev_count)) => {
|
||||
words_counts.push(*prev_count);
|
||||
*prev_docid = docid;
|
||||
*prev_count = 0;
|
||||
count += 1;
|
||||
},
|
||||
}
|
||||
None => prev = Some((docid, 1)),
|
||||
}
|
||||
}
|
||||
@ -970,16 +1007,15 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
|
||||
|
||||
fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> {
|
||||
use heed::types::ByteSlice;
|
||||
use heed::{Error, BytesDecode};
|
||||
use roaring::RoaringBitmap;
|
||||
use heed::{BytesDecode, Error};
|
||||
use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>(
|
||||
db: heed::PolyDatabase,
|
||||
rtxn: &'a heed::RoTxn,
|
||||
name: &str,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let mut key_size = 0u64;
|
||||
let mut val_size = 0u64;
|
||||
let mut values_length = Vec::new();
|
||||
@ -1028,27 +1064,27 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
|
||||
WORD_DOCIDS => {
|
||||
let db = index.word_docids.as_polymorph();
|
||||
compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
|
||||
},
|
||||
}
|
||||
WORD_PREFIX_DOCIDS => {
|
||||
let db = index.word_prefix_docids.as_polymorph();
|
||||
compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
|
||||
},
|
||||
}
|
||||
DOCID_WORD_POSITIONS => {
|
||||
let db = index.docid_word_positions.as_polymorph();
|
||||
compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name)
|
||||
},
|
||||
}
|
||||
WORD_PAIR_PROXIMITY_DOCIDS => {
|
||||
let db = index.word_pair_proximity_docids.as_polymorph();
|
||||
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
|
||||
},
|
||||
}
|
||||
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => {
|
||||
let db = index.word_prefix_pair_proximity_docids.as_polymorph();
|
||||
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
|
||||
},
|
||||
}
|
||||
FIELD_ID_WORD_COUNT_DOCIDS => {
|
||||
let db = index.field_id_word_count_docids.as_polymorph();
|
||||
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
|
||||
},
|
||||
}
|
||||
unknown => anyhow::bail!("unknown database {:?}", unknown),
|
||||
}
|
||||
}
|
||||
@ -1059,8 +1095,7 @@ fn word_pair_proximities_docids(
|
||||
debug: bool,
|
||||
word1: String,
|
||||
word2: String,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
use heed::types::ByteSlice;
|
||||
use milli::RoaringBitmapCodec;
|
||||
|
||||
@ -1081,7 +1116,9 @@ fn word_pair_proximities_docids(
|
||||
|
||||
// Skip keys that are longer than the requested one,
|
||||
// a longer key means that the second word is a prefix of the request word.
|
||||
if key.len() != prefix.len() + 1 { continue; }
|
||||
if key.len() != prefix.len() + 1 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let proximity = key.last().unwrap();
|
||||
let docids = if debug {
|
||||
|
@ -1,15 +1,14 @@
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
|
||||
use regex::Regex;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::error::{Error, UserError};
|
||||
|
||||
static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()
|
||||
});
|
||||
static ASC_DESC_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap());
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||
pub enum Criterion {
|
||||
@ -52,17 +51,21 @@ impl FromStr for Criterion {
|
||||
"attribute" => Ok(Criterion::Attribute),
|
||||
"exactness" => Ok(Criterion::Exactness),
|
||||
text => {
|
||||
let caps = ASC_DESC_REGEX.captures(text).ok_or_else(|| {
|
||||
UserError::InvalidCriterionName { name: text.to_string() }
|
||||
})?;
|
||||
let caps = ASC_DESC_REGEX
|
||||
.captures(text)
|
||||
.ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?;
|
||||
let order = caps.get(1).unwrap().as_str();
|
||||
let field_name = caps.get(2).unwrap().as_str();
|
||||
match order {
|
||||
"asc" => Ok(Criterion::Asc(field_name.to_string())),
|
||||
"desc" => Ok(Criterion::Desc(field_name.to_string())),
|
||||
text => return Err(UserError::InvalidCriterionName { name: text.to_string() }.into()),
|
||||
text => {
|
||||
return Err(
|
||||
UserError::InvalidCriterionName { name: text.to_string() }.into()
|
||||
)
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -82,13 +85,13 @@ impl fmt::Display for Criterion {
|
||||
use Criterion::*;
|
||||
|
||||
match self {
|
||||
Words => f.write_str("words"),
|
||||
Typo => f.write_str("typo"),
|
||||
Proximity => f.write_str("proximity"),
|
||||
Attribute => f.write_str("attribute"),
|
||||
Exactness => f.write_str("exactness"),
|
||||
Asc(attr) => write!(f, "asc({})", attr),
|
||||
Desc(attr) => write!(f, "desc({})", attr),
|
||||
Words => f.write_str("words"),
|
||||
Typo => f.write_str("typo"),
|
||||
Proximity => f.write_str("proximity"),
|
||||
Attribute => f.write_str("attribute"),
|
||||
Exactness => f.write_str("exactness"),
|
||||
Asc(attr) => write!(f, "asc({})", attr),
|
||||
Desc(attr) => write!(f, "desc({})", attr),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,7 @@ use std::convert::Infallible;
|
||||
use std::error::Error as StdError;
|
||||
use std::{fmt, io, str};
|
||||
|
||||
use heed::{MdbError, Error as HeedError};
|
||||
use heed::{Error as HeedError, MdbError};
|
||||
use rayon::ThreadPoolBuildError;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
@ -80,14 +80,17 @@ impl From<fst::Error> for Error {
|
||||
}
|
||||
}
|
||||
|
||||
impl<E> From<grenad::Error<E>> for Error where Error: From<E> {
|
||||
impl<E> From<grenad::Error<E>> for Error
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
fn from(error: grenad::Error<E>) -> Error {
|
||||
match error {
|
||||
grenad::Error::Io(error) => Error::IoError(error),
|
||||
grenad::Error::Merge(error) => Error::from(error),
|
||||
grenad::Error::InvalidCompressionType => {
|
||||
Error::InternalError(InternalError::GrenadInvalidCompressionType)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -171,15 +174,15 @@ impl fmt::Display for InternalError {
|
||||
match self {
|
||||
Self::DatabaseMissingEntry { db_name, key } => {
|
||||
write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name)
|
||||
},
|
||||
}
|
||||
Self::FieldIdMapMissingEntry(error) => error.fmt(f),
|
||||
Self::Fst(error) => error.fmt(f),
|
||||
Self::GrenadInvalidCompressionType => {
|
||||
f.write_str("invalid compression type have been specified to grenad")
|
||||
},
|
||||
}
|
||||
Self::IndexingMergingKeys { process } => {
|
||||
write!(f, "invalid merge while processing {}", process)
|
||||
},
|
||||
}
|
||||
Self::Serialization(error) => error.fmt(f),
|
||||
Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f),
|
||||
Self::RayonThreadPool(error) => error.fmt(f),
|
||||
@ -204,12 +207,12 @@ impl fmt::Display for UserError {
|
||||
Self::InvalidDocumentId { document_id } => {
|
||||
let json = serde_json::to_string(document_id).unwrap();
|
||||
write!(f, "document identifier is invalid {}", json)
|
||||
},
|
||||
}
|
||||
Self::InvalidFilterAttribute(error) => error.fmt(f),
|
||||
Self::MissingDocumentId { document } => {
|
||||
let json = serde_json::to_string(document).unwrap();
|
||||
write!(f, "document doesn't have an identifier {}", json)
|
||||
},
|
||||
}
|
||||
Self::MissingPrimaryKey => f.write_str("missing primary key"),
|
||||
Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"),
|
||||
// TODO where can we find it instead of writing the text ourselves?
|
||||
@ -217,14 +220,14 @@ impl fmt::Display for UserError {
|
||||
Self::InvalidStoreFile => f.write_str("store file is not a valid database file"),
|
||||
Self::PrimaryKeyCannotBeChanged => {
|
||||
f.write_str("primary key cannot be changed if the database contains documents")
|
||||
},
|
||||
}
|
||||
Self::PrimaryKeyCannotBeReset => {
|
||||
f.write_str("primary key cannot be reset if the database contains documents")
|
||||
},
|
||||
}
|
||||
Self::SerdeJson(error) => error.fmt(f),
|
||||
Self::UnknownInternalDocumentId { document_id } => {
|
||||
write!(f, "an unknown internal document id have been used ({})", document_id)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -236,10 +239,10 @@ impl fmt::Display for FieldIdMapMissingEntry {
|
||||
match self {
|
||||
Self::FieldId { field_id, process } => {
|
||||
write!(f, "unknown field id {} coming from the {} process", field_id, process)
|
||||
},
|
||||
}
|
||||
Self::FieldName { field_name, process } => {
|
||||
write!(f, "unknown field name {} coming from the {} process", field_name, process)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -251,11 +254,11 @@ impl fmt::Display for SerializationError {
|
||||
match self {
|
||||
Self::Decoding { db_name: Some(name) } => {
|
||||
write!(f, "decoding from the {} database failed", name)
|
||||
},
|
||||
}
|
||||
Self::Decoding { db_name: None } => f.write_str("decoding failed"),
|
||||
Self::Encoding { db_name: Some(name) } => {
|
||||
write!(f, "encoding into the {} database failed", name)
|
||||
},
|
||||
}
|
||||
Self::Encoding { db_name: None } => f.write_str("encoding failed"),
|
||||
Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"),
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use fst::{Streamer, IntoStreamer};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
|
||||
pub struct ExternalDocumentsIds<'a> {
|
||||
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
|
||||
@ -8,7 +9,10 @@ pub struct ExternalDocumentsIds<'a> {
|
||||
}
|
||||
|
||||
impl<'a> ExternalDocumentsIds<'a> {
|
||||
pub fn new(hard: fst::Map<Cow<'a, [u8]>>, soft: fst::Map<Cow<'a, [u8]>>) -> ExternalDocumentsIds<'a> {
|
||||
pub fn new(
|
||||
hard: fst::Map<Cow<'a, [u8]>>,
|
||||
soft: fst::Map<Cow<'a, [u8]>>,
|
||||
) -> ExternalDocumentsIds<'a> {
|
||||
ExternalDocumentsIds { hard, soft }
|
||||
}
|
||||
|
||||
@ -29,7 +33,7 @@ impl<'a> ExternalDocumentsIds<'a> {
|
||||
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
|
||||
// u64 MAX means deleted in the soft fst map
|
||||
Some(id) if id != u64::MAX => Some(id.try_into().unwrap()),
|
||||
_otherwise => None
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2,10 +2,9 @@ use std::error::Error;
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
|
||||
use serde::{Serialize, Deserialize};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum FacetType {
|
||||
String,
|
||||
Number,
|
||||
@ -43,4 +42,4 @@ impl fmt::Display for InvalidFacetType {
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for InvalidFacetType { }
|
||||
impl Error for InvalidFacetType {}
|
||||
|
@ -50,7 +50,7 @@ impl Serialize for FacetValue {
|
||||
FacetValue::Number(number) => {
|
||||
let string = number.to_string();
|
||||
serializer.serialize_str(&string)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -28,6 +28,7 @@ fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::cmp::Ordering::Less;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn is_sorted<T: Ord>(x: &[T]) -> bool {
|
||||
@ -39,8 +40,8 @@ mod tests {
|
||||
let a = -13_f64;
|
||||
let b = -10.0;
|
||||
let c = -0.0;
|
||||
let d = 1.0;
|
||||
let e = 43.0;
|
||||
let d = 1.0;
|
||||
let e = 43.0;
|
||||
|
||||
let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect();
|
||||
assert!(is_sorted(&vec), "{:?}", vec);
|
||||
|
@ -1,5 +1,7 @@
|
||||
use std::collections::BTreeMap;
|
||||
use serde::{Serialize, Deserialize};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::FieldId;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@ -11,11 +13,7 @@ pub struct FieldsIdsMap {
|
||||
|
||||
impl FieldsIdsMap {
|
||||
pub fn new() -> FieldsIdsMap {
|
||||
FieldsIdsMap {
|
||||
names_ids: BTreeMap::new(),
|
||||
ids_names: BTreeMap::new(),
|
||||
next_id: Some(0),
|
||||
}
|
||||
FieldsIdsMap { names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0) }
|
||||
}
|
||||
|
||||
/// Returns the number of fields ids in the map.
|
||||
@ -62,17 +60,17 @@ impl FieldsIdsMap {
|
||||
}
|
||||
|
||||
/// Iterate over the ids and names in the ids order.
|
||||
pub fn iter(&self) -> impl Iterator<Item=(FieldId, &str)> {
|
||||
pub fn iter(&self) -> impl Iterator<Item = (FieldId, &str)> {
|
||||
self.ids_names.iter().map(|(id, name)| (*id, name.as_str()))
|
||||
}
|
||||
|
||||
/// Iterate over the ids in the order of the ids.
|
||||
pub fn ids<'a>(&'a self) -> impl Iterator<Item=FieldId> + 'a {
|
||||
pub fn ids<'a>(&'a self) -> impl Iterator<Item = FieldId> + 'a {
|
||||
self.ids_names.keys().copied()
|
||||
}
|
||||
|
||||
/// Iterate over the names in the order of the ids.
|
||||
pub fn names(&self) -> impl Iterator<Item=&str> {
|
||||
pub fn names(&self) -> impl Iterator<Item = &str> {
|
||||
self.ids_names.values().map(AsRef::as_ref)
|
||||
}
|
||||
}
|
||||
|
@ -71,7 +71,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use heed::{BytesEncode, BytesDecode};
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
|
@ -1,8 +1,8 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use crate::{FieldId, DocumentId};
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::{DocumentId, FieldId};
|
||||
|
||||
pub struct FieldDocIdFacetF64Codec;
|
||||
|
||||
|
@ -2,12 +2,17 @@ use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::str;
|
||||
|
||||
use crate::{FieldId, DocumentId};
|
||||
use crate::{DocumentId, FieldId};
|
||||
|
||||
pub struct FieldDocIdFacetStringCodec;
|
||||
|
||||
impl FieldDocIdFacetStringCodec {
|
||||
pub fn serialize_into(field_id: FieldId, document_id: DocumentId, value: &str, out: &mut Vec<u8>) {
|
||||
pub fn serialize_into(
|
||||
field_id: FieldId,
|
||||
document_id: DocumentId,
|
||||
value: &str,
|
||||
out: &mut Vec<u8>,
|
||||
) {
|
||||
out.reserve(1 + 4 + value.len());
|
||||
out.push(field_id);
|
||||
out.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::{borrow::Cow, convert::TryInto};
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use crate::FieldId;
|
||||
|
||||
|
@ -1,16 +1,18 @@
|
||||
mod beu32_str_codec;
|
||||
pub mod facet;
|
||||
mod field_id_word_count_codec;
|
||||
mod obkv_codec;
|
||||
mod roaring_bitmap;
|
||||
mod roaring_bitmap_length;
|
||||
mod str_level_position_codec;
|
||||
mod str_str_u8_codec;
|
||||
mod field_id_word_count_codec;
|
||||
pub mod facet;
|
||||
|
||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
|
||||
pub use self::obkv_codec::ObkvCodec;
|
||||
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
|
||||
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
||||
pub use self::roaring_bitmap_length::{
|
||||
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
|
||||
};
|
||||
pub use self::str_level_position_codec::StrLevelPositionCodec;
|
||||
pub use self::str_str_u8_codec::StrStrU8Codec;
|
||||
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use obkv::{KvReader, KvWriter};
|
||||
|
||||
pub struct ObkvCodec;
|
||||
|
@ -75,7 +75,9 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter::FromIterator;
|
||||
use heed::{BytesEncode, BytesDecode};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
pub struct RoaringBitmapCodec;
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::io::{self, Read, BufRead};
|
||||
use std::io::{self, BufRead, Read};
|
||||
use std::mem;
|
||||
|
||||
use byteorder::{ReadBytesExt, LittleEndian};
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
|
||||
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
|
||||
const SERIAL_COOKIE: u16 = 12347;
|
||||
@ -16,20 +16,14 @@ impl RoaringBitmapLenCodec {
|
||||
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
|
||||
(bytes.read_u32::<LittleEndian>()? as usize, true)
|
||||
} else if (cookie as u16) == SERIAL_COOKIE {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"run containers are unsupported",
|
||||
));
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported"));
|
||||
} else {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
|
||||
}
|
||||
};
|
||||
|
||||
if size > u16::max_value() as usize + 1 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"size is greater than supported",
|
||||
));
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
|
||||
}
|
||||
|
||||
let mut description_bytes = vec![0u8; size * 4];
|
||||
@ -67,12 +61,12 @@ impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use crate::heed_codec::RoaringBitmapCodec;
|
||||
use heed::BytesEncode;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::*;
|
||||
use crate::heed_codec::RoaringBitmapCodec;
|
||||
|
||||
#[test]
|
||||
fn deserialize_roaring_bitmap_length() {
|
||||
let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect();
|
||||
|
@ -13,7 +13,9 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let footer_len = size_of::<u8>() + size_of::<u32>() * 2;
|
||||
|
||||
if bytes.len() < footer_len { return None }
|
||||
if bytes.len() < footer_len {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||
let word = str::from_utf8(word).ok()?;
|
||||
|
@ -3,23 +3,22 @@ use std::collections::{HashMap, HashSet};
|
||||
use std::path::Path;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use heed::{Database, PolyDatabase, RoTxn, RwTxn};
|
||||
use heed::types::*;
|
||||
use heed::{Database, PolyDatabase, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::{UserError, FieldIdMapMissingEntry, InternalError};
|
||||
use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search};
|
||||
use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result};
|
||||
use crate::{
|
||||
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
|
||||
FieldIdWordCountCodec,
|
||||
};
|
||||
use crate::heed_codec::facet::{
|
||||
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||
FacetValueStringCodec, FacetLevelValueF64Codec,
|
||||
};
|
||||
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
|
||||
FieldDocIdFacetStringCodec,
|
||||
};
|
||||
use crate::{
|
||||
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
||||
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldId, FieldIdWordCountCodec,
|
||||
FieldsDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search,
|
||||
StrLevelPositionCodec, StrStrU8Codec, BEU32,
|
||||
};
|
||||
|
||||
pub mod main_key {
|
||||
pub const CRITERIA_KEY: &str = "criteria";
|
||||
@ -114,14 +113,17 @@ impl Index {
|
||||
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
|
||||
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
|
||||
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||
let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
|
||||
let word_prefix_pair_proximity_docids =
|
||||
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
|
||||
let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?;
|
||||
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
|
||||
let word_prefix_level_position_docids = env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?;
|
||||
let word_prefix_level_position_docids =
|
||||
env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?;
|
||||
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
|
||||
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
|
||||
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||
let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
||||
let field_id_docid_facet_strings =
|
||||
env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
||||
let documents = env.create_database(Some(DOCUMENTS))?;
|
||||
|
||||
Index::initialize_creation_dates(&env, main)?;
|
||||
@ -184,18 +186,26 @@ impl Index {
|
||||
/* documents ids */
|
||||
|
||||
/// Writes the documents ids that correspond to the user-ids-documents-ids FST.
|
||||
pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> {
|
||||
pub(crate) fn put_documents_ids(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
docids: &RoaringBitmap,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids)
|
||||
}
|
||||
|
||||
/// Returns the internal documents ids.
|
||||
pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
|
||||
Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?.unwrap_or_default())
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
/// Returns the number of documents indexed in the database.
|
||||
pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result<u64> {
|
||||
let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?;
|
||||
let count =
|
||||
self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?;
|
||||
Ok(count.unwrap_or_default())
|
||||
}
|
||||
|
||||
@ -224,21 +234,30 @@ impl Index {
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
external_documents_ids: &ExternalDocumentsIds<'a>,
|
||||
) -> heed::Result<()>
|
||||
{
|
||||
) -> heed::Result<()> {
|
||||
let ExternalDocumentsIds { hard, soft } = external_documents_ids;
|
||||
let hard = hard.as_fst().as_bytes();
|
||||
let soft = soft.as_fst().as_bytes();
|
||||
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?;
|
||||
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?;
|
||||
self.main.put::<_, Str, ByteSlice>(
|
||||
wtxn,
|
||||
main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY,
|
||||
hard,
|
||||
)?;
|
||||
self.main.put::<_, Str, ByteSlice>(
|
||||
wtxn,
|
||||
main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY,
|
||||
soft,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the external documents ids map which associates the external ids
|
||||
/// with the internal ids (i.e. `u32`).
|
||||
pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> {
|
||||
let hard = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?;
|
||||
let soft = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?;
|
||||
let hard =
|
||||
self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?;
|
||||
let soft =
|
||||
self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?;
|
||||
let hard = match hard {
|
||||
Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?,
|
||||
None => fst::Map::default().map_data(Cow::Owned)?,
|
||||
@ -254,42 +273,62 @@ impl Index {
|
||||
|
||||
/// Writes the fields ids map which associates the documents keys with an internal field id
|
||||
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
|
||||
pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> {
|
||||
pub(crate) fn put_fields_ids_map(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
map: &FieldsIdsMap,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map)
|
||||
}
|
||||
|
||||
/// Returns the fields ids map which associates the documents keys with an internal field id
|
||||
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
|
||||
pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result<FieldsIdsMap> {
|
||||
Ok(self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(
|
||||
rtxn,
|
||||
main_key::FIELDS_IDS_MAP_KEY,
|
||||
)?.unwrap_or_default())
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, main_key::FIELDS_IDS_MAP_KEY)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
/* fields distribution */
|
||||
|
||||
/// Writes the fields distribution which associates every field name with
|
||||
/// the number of times it occurs in the documents.
|
||||
pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution)
|
||||
pub(crate) fn put_fields_distribution(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
distribution: &FieldsDistribution,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(
|
||||
wtxn,
|
||||
main_key::FIELDS_DISTRIBUTION_KEY,
|
||||
distribution,
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the fields distribution which associates every field name with
|
||||
/// the number of times it occurs in the documents.
|
||||
pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
|
||||
Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>(
|
||||
rtxn,
|
||||
main_key::FIELDS_DISTRIBUTION_KEY,
|
||||
)?.unwrap_or_default())
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
/* displayed fields */
|
||||
|
||||
/// Writes the fields that must be displayed in the defined order.
|
||||
/// There must not be any duplicate field id.
|
||||
pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields)
|
||||
pub(crate) fn put_displayed_fields(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
fields: &[&str],
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeBincode<&[&str]>>(
|
||||
wtxn,
|
||||
main_key::DISPLAYED_FIELDS_KEY,
|
||||
&fields,
|
||||
)
|
||||
}
|
||||
|
||||
/// Deletes the displayed fields ids, this will make the engine display
|
||||
@ -313,14 +352,17 @@ impl Index {
|
||||
for name in fields.into_iter() {
|
||||
match fields_ids_map.id(name) {
|
||||
Some(field_id) => fields_ids.push(field_id),
|
||||
None => return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name.to_string(),
|
||||
process: "Index::displayed_fields_ids",
|
||||
}.into()),
|
||||
None => {
|
||||
return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name.to_string(),
|
||||
process: "Index::displayed_fields_ids",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Some(fields_ids))
|
||||
},
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
@ -328,8 +370,16 @@ impl Index {
|
||||
/* searchable fields */
|
||||
|
||||
/// Writes the searchable fields, when this list is specified, only these are indexed.
|
||||
pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields)
|
||||
pub(crate) fn put_searchable_fields(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
fields: &[&str],
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeBincode<&[&str]>>(
|
||||
wtxn,
|
||||
main_key::SEARCHABLE_FIELDS_KEY,
|
||||
&fields,
|
||||
)
|
||||
}
|
||||
|
||||
/// Deletes the searchable fields, when no fields are specified, all fields are indexed.
|
||||
@ -352,14 +402,17 @@ impl Index {
|
||||
for name in fields {
|
||||
match fields_ids_map.id(name) {
|
||||
Some(field_id) => fields_ids.push(field_id),
|
||||
None => return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name.to_string(),
|
||||
process: "Index::searchable_fields_ids",
|
||||
}.into()),
|
||||
None => {
|
||||
return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name.to_string(),
|
||||
process: "Index::searchable_fields_ids",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Some(fields_ids))
|
||||
},
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
@ -367,7 +420,11 @@ impl Index {
|
||||
/* filterable fields */
|
||||
|
||||
/// Writes the filterable fields names in the database.
|
||||
pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> {
|
||||
pub(crate) fn put_filterable_fields(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
fields: &HashSet<String>,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields)
|
||||
}
|
||||
|
||||
@ -378,10 +435,10 @@ impl Index {
|
||||
|
||||
/// Returns the filterable fields names.
|
||||
pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
|
||||
Ok(self.main.get::<_, Str, SerdeJson<_>>(
|
||||
rtxn,
|
||||
main_key::FILTERABLE_FIELDS_KEY,
|
||||
)?.unwrap_or_default())
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, SerdeJson<_>>(rtxn, main_key::FILTERABLE_FIELDS_KEY)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
/// Identical to `filterable_fields`, but returns ids instead.
|
||||
@ -394,11 +451,14 @@ impl Index {
|
||||
match fields_ids_map.id(&name) {
|
||||
Some(field_id) => {
|
||||
fields_ids.insert(field_id);
|
||||
},
|
||||
None => return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name,
|
||||
process: "Index::filterable_fields_ids",
|
||||
}.into()),
|
||||
}
|
||||
None => {
|
||||
return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name,
|
||||
process: "Index::filterable_fields_ids",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -413,9 +473,8 @@ impl Index {
|
||||
pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> {
|
||||
let filterable_fields = self.filterable_fields(rtxn)?;
|
||||
let distinct_field = self.distinct_field(rtxn)?;
|
||||
let asc_desc_fields = self.criteria(rtxn)?
|
||||
.into_iter()
|
||||
.filter_map(|criterion| match criterion {
|
||||
let asc_desc_fields =
|
||||
self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion {
|
||||
Criterion::Asc(field) | Criterion::Desc(field) => Some(field),
|
||||
_otherwise => None,
|
||||
});
|
||||
@ -439,11 +498,14 @@ impl Index {
|
||||
match fields_ids_map.id(&name) {
|
||||
Some(field_id) => {
|
||||
fields_ids.insert(field_id);
|
||||
},
|
||||
None => return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name,
|
||||
process: "Index::faceted_fields_ids",
|
||||
}.into()),
|
||||
}
|
||||
None => {
|
||||
return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name,
|
||||
process: "Index::faceted_fields_ids",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -458,8 +520,7 @@ impl Index {
|
||||
wtxn: &mut RwTxn,
|
||||
field_id: FieldId,
|
||||
docids: &RoaringBitmap,
|
||||
) -> heed::Result<()>
|
||||
{
|
||||
) -> heed::Result<()> {
|
||||
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||
@ -472,8 +533,7 @@ impl Index {
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<RoaringBitmap>
|
||||
{
|
||||
) -> heed::Result<RoaringBitmap> {
|
||||
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||
@ -490,8 +550,7 @@ impl Index {
|
||||
wtxn: &mut RwTxn,
|
||||
field_id: FieldId,
|
||||
docids: &RoaringBitmap,
|
||||
) -> heed::Result<()>
|
||||
{
|
||||
) -> heed::Result<()> {
|
||||
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||
@ -504,8 +563,7 @@ impl Index {
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<RoaringBitmap>
|
||||
{
|
||||
) -> heed::Result<RoaringBitmap> {
|
||||
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
|
||||
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
|
||||
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
|
||||
@ -518,7 +576,11 @@ impl Index {
|
||||
|
||||
/* distinct field */
|
||||
|
||||
pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> {
|
||||
pub(crate) fn put_distinct_field(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
distinct_field: &str,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field)
|
||||
}
|
||||
|
||||
@ -532,7 +594,11 @@ impl Index {
|
||||
|
||||
/* criteria */
|
||||
|
||||
pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> {
|
||||
pub(crate) fn put_criteria(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
criteria: &[Criterion],
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria)
|
||||
}
|
||||
|
||||
@ -550,7 +616,11 @@ impl Index {
|
||||
/* words fst */
|
||||
|
||||
/// Writes the FST which is the words dictionary of the engine.
|
||||
pub(crate) fn put_words_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
|
||||
pub(crate) fn put_words_fst<A: AsRef<[u8]>>(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
fst: &fst::Set<A>,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes())
|
||||
}
|
||||
|
||||
@ -564,7 +634,11 @@ impl Index {
|
||||
|
||||
/* stop words */
|
||||
|
||||
pub(crate) fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
|
||||
pub(crate) fn put_stop_words<A: AsRef<[u8]>>(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
fst: &fst::Set<A>,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes())
|
||||
}
|
||||
|
||||
@ -585,8 +659,7 @@ impl Index {
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
|
||||
) -> heed::Result<()>
|
||||
{
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)
|
||||
}
|
||||
|
||||
@ -595,15 +668,17 @@ impl Index {
|
||||
}
|
||||
|
||||
pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
|
||||
Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?.unwrap_or_default())
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
pub fn words_synonyms<S: AsRef<str>>(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
words: &[S],
|
||||
) -> heed::Result<Option<Vec<Vec<String>>>>
|
||||
{
|
||||
) -> heed::Result<Option<Vec<Vec<String>>>> {
|
||||
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
|
||||
Ok(self.synonyms(rtxn)?.remove(&words))
|
||||
}
|
||||
@ -611,8 +686,16 @@ impl Index {
|
||||
/* words prefixes fst */
|
||||
|
||||
/// Writes the FST which is the words prefixes dictionary of the engine.
|
||||
pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes())
|
||||
pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
fst: &fst::Set<A>,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, ByteSlice>(
|
||||
wtxn,
|
||||
main_key::WORDS_PREFIXES_FST_KEY,
|
||||
fst.as_fst().as_bytes(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the FST which is the words prefixes dictionary of the engine.
|
||||
@ -637,13 +720,14 @@ impl Index {
|
||||
pub fn documents<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn,
|
||||
ids: impl IntoIterator<Item=DocumentId>,
|
||||
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>>
|
||||
{
|
||||
ids: impl IntoIterator<Item = DocumentId>,
|
||||
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> {
|
||||
let mut documents = Vec::new();
|
||||
|
||||
for id in ids {
|
||||
let kv = self.documents.get(rtxn, &BEU32::new(id))?
|
||||
let kv = self
|
||||
.documents
|
||||
.get(rtxn, &BEU32::new(id))?
|
||||
.ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?;
|
||||
documents.push((id, kv));
|
||||
}
|
||||
@ -673,7 +757,8 @@ impl Index {
|
||||
|
||||
/// Returns the index creation time.
|
||||
pub fn created_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> {
|
||||
Ok(self.main
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::CREATED_AT_KEY)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::MAIN,
|
||||
@ -683,7 +768,8 @@ impl Index {
|
||||
|
||||
/// Returns the index last updated time.
|
||||
pub fn updated_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> {
|
||||
Ok(self.main
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::UPDATED_AT_KEY)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::MAIN,
|
||||
@ -691,7 +777,11 @@ impl Index {
|
||||
})?)
|
||||
}
|
||||
|
||||
pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime<Utc>) -> heed::Result<()> {
|
||||
pub(crate) fn set_updated_at(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
time: &DateTime<Utc>,
|
||||
) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeJson<DateTime<Utc>>>(wtxn, main_key::UPDATED_AT_KEY, &time)
|
||||
}
|
||||
}
|
||||
@ -704,8 +794,8 @@ pub(crate) mod tests {
|
||||
use maplit::hashmap;
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::Index;
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
use crate::Index;
|
||||
|
||||
pub(crate) struct TempIndex {
|
||||
inner: Index,
|
||||
@ -728,10 +818,7 @@ pub(crate) mod tests {
|
||||
options.map_size(100 * 4096);
|
||||
let _tempdir = TempDir::new_in(".").unwrap();
|
||||
let inner = Index::new(options, _tempdir.path()).unwrap();
|
||||
Self {
|
||||
inner,
|
||||
_tempdir
|
||||
}
|
||||
Self { inner, _tempdir }
|
||||
}
|
||||
}
|
||||
|
||||
@ -756,10 +843,13 @@ pub(crate) mod tests {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let fields_distribution = index.fields_distribution(&rtxn).unwrap();
|
||||
assert_eq!(fields_distribution, hashmap! {
|
||||
"id".to_string() => 2,
|
||||
"name".to_string() => 2,
|
||||
"age".to_string() => 1,
|
||||
});
|
||||
assert_eq!(
|
||||
fields_distribution,
|
||||
hashmap! {
|
||||
"id".to_string() => 2,
|
||||
"name".to_string() => 2,
|
||||
"age".to_string() => 1,
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,15 @@
|
||||
#[macro_use] extern crate pest_derive;
|
||||
#[macro_use]
|
||||
extern crate pest_derive;
|
||||
|
||||
mod criterion;
|
||||
mod error;
|
||||
mod external_documents_ids;
|
||||
mod fields_ids_map;
|
||||
mod search;
|
||||
pub mod facet;
|
||||
mod fields_ids_map;
|
||||
pub mod heed_codec;
|
||||
pub mod index;
|
||||
pub mod proximity;
|
||||
mod search;
|
||||
pub mod tree_level;
|
||||
pub mod update;
|
||||
|
||||
@ -20,15 +21,17 @@ use std::result::Result as StdResult;
|
||||
use fxhash::{FxHasher32, FxHasher64};
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
pub use self::criterion::{Criterion, default_criteria};
|
||||
pub use self::criterion::{default_criteria, Criterion};
|
||||
pub use self::error::Error;
|
||||
pub use self::external_documents_ids::ExternalDocumentsIds;
|
||||
pub use self::fields_ids_map::FieldsIdsMap;
|
||||
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
|
||||
pub use self::heed_codec::{
|
||||
BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
|
||||
CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
|
||||
RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
|
||||
};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords};
|
||||
pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult};
|
||||
pub use self::tree_level::TreeLevel;
|
||||
|
||||
pub type Result<T> = std::result::Result<T, error::Error>;
|
||||
@ -54,9 +57,9 @@ pub fn obkv_to_json(
|
||||
displayed_fields: &[FieldId],
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
obkv: obkv::KvReader,
|
||||
) -> Result<Map<String, Value>>
|
||||
{
|
||||
displayed_fields.iter()
|
||||
) -> Result<Map<String, Value>> {
|
||||
displayed_fields
|
||||
.iter()
|
||||
.copied()
|
||||
.flat_map(|id| obkv.get(id).map(|value| (id, value)))
|
||||
.map(|(id, value)| {
|
||||
@ -72,7 +75,6 @@ pub fn obkv_to_json(
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
pub fn json_to_string(value: &Value) -> Option<String> {
|
||||
|
||||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
@ -90,7 +92,7 @@ pub fn json_to_string(value: &Value) -> Option<String> {
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
},
|
||||
}
|
||||
Value::Object(object) => {
|
||||
let mut buffer = String::new();
|
||||
let mut count = 0;
|
||||
@ -107,7 +109,7 @@ pub fn json_to_string(value: &Value) -> Option<String> {
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -121,9 +123,10 @@ pub fn json_to_string(value: &Value) -> Option<String> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn json_to_string_object() {
|
||||
let value = json!({
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::cmp;
|
||||
|
||||
use crate::{Attribute, Position};
|
||||
|
||||
const ONE_ATTRIBUTE: u32 = 1000;
|
||||
@ -15,8 +16,11 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||
pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
|
||||
let (lhs_attr, lhs_index) = extract_position(lhs);
|
||||
let (rhs_attr, rhs_index) = extract_position(rhs);
|
||||
if lhs_attr != rhs_attr { MAX_DISTANCE }
|
||||
else { index_proximity(lhs_index, rhs_index) }
|
||||
if lhs_attr != rhs_attr {
|
||||
MAX_DISTANCE
|
||||
} else {
|
||||
index_proximity(lhs_index, rhs_index)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn extract_position(position: Position) -> (Attribute, Position) {
|
||||
|
@ -5,12 +5,12 @@ use log::debug;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::error::FieldIdMapMissingEntry;
|
||||
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
|
||||
use crate::search::facet::FacetIter;
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::{FieldId, Index, Result};
|
||||
use super::{Criterion, CriterionParameters, CriterionResult};
|
||||
|
||||
/// Threshold on the number of candidates that will make
|
||||
/// the system choose between one algorithm or another.
|
||||
@ -57,9 +57,8 @@ impl<'t> AscDesc<'t> {
|
||||
ascending: bool,
|
||||
) -> Result<Self> {
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let field_id = fields_ids_map
|
||||
.id(&field_name)
|
||||
.ok_or_else(|| FieldIdMapMissingEntry::FieldName {
|
||||
let field_id =
|
||||
fields_ids_map.id(&field_name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
|
||||
field_name: field_name.clone(),
|
||||
process: "AscDesc::new",
|
||||
})?;
|
||||
@ -101,44 +100,47 @@ impl<'t> Criterion for AscDesc<'t> {
|
||||
filtered_candidates: None,
|
||||
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
||||
}));
|
||||
},
|
||||
None => {
|
||||
match self.parent.next(params)? {
|
||||
Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => {
|
||||
self.query_tree = query_tree;
|
||||
let mut candidates = match (&self.query_tree, candidates) {
|
||||
(_, Some(candidates)) => candidates,
|
||||
(Some(qt), None) => {
|
||||
let context = CriteriaBuilder::new(&self.rtxn, &self.index)?;
|
||||
resolve_query_tree(&context, qt, params.wdcache)?
|
||||
},
|
||||
(None, None) => self.index.documents_ids(self.rtxn)?,
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
self.query_tree = query_tree;
|
||||
let mut candidates = match (&self.query_tree, candidates) {
|
||||
(_, Some(candidates)) => candidates,
|
||||
(Some(qt), None) => {
|
||||
let context = CriteriaBuilder::new(&self.rtxn, &self.index)?;
|
||||
resolve_query_tree(&context, qt, params.wdcache)?
|
||||
}
|
||||
(None, None) => self.index.documents_ids(self.rtxn)?,
|
||||
};
|
||||
|
||||
match bucket_candidates {
|
||||
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
||||
None => self.bucket_candidates |= &candidates,
|
||||
}
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
if candidates.is_empty() {
|
||||
continue;
|
||||
}
|
||||
match bucket_candidates {
|
||||
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
||||
None => self.bucket_candidates |= &candidates,
|
||||
}
|
||||
|
||||
self.allowed_candidates = &candidates - params.excluded_candidates;
|
||||
self.candidates = facet_ordered(
|
||||
self.index,
|
||||
self.rtxn,
|
||||
self.field_id,
|
||||
self.ascending,
|
||||
candidates & &self.faceted_candidates,
|
||||
)?;
|
||||
},
|
||||
None => return Ok(None),
|
||||
if candidates.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
self.allowed_candidates = &candidates - params.excluded_candidates;
|
||||
self.candidates = facet_ordered(
|
||||
self.index,
|
||||
self.rtxn,
|
||||
self.field_id,
|
||||
self.ascending,
|
||||
candidates & &self.faceted_candidates,
|
||||
)?;
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
Some(mut candidates) => {
|
||||
candidates -= params.excluded_candidates;
|
||||
@ -170,11 +172,8 @@ fn facet_ordered<'t>(
|
||||
let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
|
||||
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
|
||||
} else {
|
||||
let facet_fn = if ascending {
|
||||
FacetIter::new_reducing
|
||||
} else {
|
||||
FacetIter::new_reverse_reducing
|
||||
};
|
||||
let facet_fn =
|
||||
if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing };
|
||||
let iter = facet_fn(rtxn, index, field_id, candidates)?;
|
||||
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
|
||||
}
|
||||
@ -194,9 +193,7 @@ fn iterative_facet_ordered_iter<'t>(
|
||||
for docid in candidates.iter() {
|
||||
let left = (field_id, docid, f64::MIN);
|
||||
let right = (field_id, docid, f64::MAX);
|
||||
let mut iter = index
|
||||
.field_id_docid_facet_f64s
|
||||
.range(rtxn, &(left..=right))?;
|
||||
let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?;
|
||||
let entry = if ascending { iter.next() } else { iter.last() };
|
||||
if let Some(((_, _, value), ())) = entry.transpose()? {
|
||||
docids_values.push((docid, OrderedFloat(value)));
|
||||
|
@ -1,15 +1,16 @@
|
||||
use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap};
|
||||
use std::collections::{BTreeMap, HashMap, btree_map};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::{self, Ordering};
|
||||
use std::collections::binary_heap::PeekMut;
|
||||
use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap};
|
||||
use std::mem::take;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{TreeLevel, Result, search::build_dfa};
|
||||
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::criteria::Query;
|
||||
use crate::search::query_tree::{Operation, QueryKind};
|
||||
use crate::search::{word_derivations, WordDerivationsCache};
|
||||
use super::{Criterion, CriterionParameters, CriterionResult, Context, resolve_query_tree};
|
||||
use crate::search::{build_dfa, word_derivations, WordDerivationsCache};
|
||||
use crate::{Result, TreeLevel};
|
||||
|
||||
/// To be able to divide integers by the number of words in the query
|
||||
/// we want to find a multiplier that allows us to divide by any number between 1 and 10.
|
||||
@ -63,15 +64,19 @@ impl<'t> Criterion for Attribute<'t> {
|
||||
filtered_candidates: None,
|
||||
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
||||
}));
|
||||
},
|
||||
}
|
||||
Some((query_tree, flattened_query_tree, mut allowed_candidates)) => {
|
||||
let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD {
|
||||
let current_buckets = match self.current_buckets.as_mut() {
|
||||
Some(current_buckets) => current_buckets,
|
||||
None => {
|
||||
let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?;
|
||||
let new_buckets = linear_compute_candidates(
|
||||
self.ctx,
|
||||
&flattened_query_tree,
|
||||
&allowed_candidates,
|
||||
)?;
|
||||
self.current_buckets.get_or_insert(new_buckets.into_iter())
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
match current_buckets.next() {
|
||||
@ -83,10 +88,15 @@ impl<'t> Criterion for Attribute<'t> {
|
||||
filtered_candidates: None,
|
||||
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
||||
}));
|
||||
},
|
||||
}
|
||||
}
|
||||
} else {
|
||||
match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? {
|
||||
match set_compute_candidates(
|
||||
self.ctx,
|
||||
&flattened_query_tree,
|
||||
&allowed_candidates,
|
||||
params.wdcache,
|
||||
)? {
|
||||
Some(candidates) => candidates,
|
||||
None => {
|
||||
return Ok(Some(CriterionResult {
|
||||
@ -95,13 +105,14 @@ impl<'t> Criterion for Attribute<'t> {
|
||||
filtered_candidates: None,
|
||||
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
||||
}));
|
||||
},
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
allowed_candidates -= &found_candidates;
|
||||
|
||||
self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
|
||||
self.state =
|
||||
Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
@ -109,39 +120,50 @@ impl<'t> Criterion for Attribute<'t> {
|
||||
filtered_candidates: None,
|
||||
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
||||
}));
|
||||
},
|
||||
None => {
|
||||
match self.parent.next(params)? {
|
||||
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => {
|
||||
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
|
||||
- params.excluded_candidates
|
||||
}
|
||||
};
|
||||
|
||||
let flattened_query_tree = flatten_query_tree(&query_tree);
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
match bucket_candidates {
|
||||
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
||||
None => self.bucket_candidates |= &candidates,
|
||||
}
|
||||
let flattened_query_tree = flatten_query_tree(&query_tree);
|
||||
|
||||
self.state = Some((query_tree, flattened_query_tree, candidates));
|
||||
self.current_buckets = None;
|
||||
},
|
||||
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
None => return Ok(None),
|
||||
match bucket_candidates {
|
||||
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
||||
None => self.bucket_candidates |= &candidates,
|
||||
}
|
||||
|
||||
self.state = Some((query_tree, flattened_query_tree, candidates));
|
||||
self.current_buckets = None;
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
@ -152,7 +174,9 @@ impl<'t> Criterion for Attribute<'t> {
|
||||
/// it will begin at the first non-empty interval and will return every interval without
|
||||
/// jumping over empty intervals.
|
||||
struct WordLevelIterator<'t, 'q> {
|
||||
inner: Box<dyn Iterator<Item =heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't>,
|
||||
inner: Box<
|
||||
dyn Iterator<Item = heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't,
|
||||
>,
|
||||
level: TreeLevel,
|
||||
interval_size: u32,
|
||||
word: Cow<'q, str>,
|
||||
@ -162,49 +186,80 @@ struct WordLevelIterator<'t, 'q> {
|
||||
}
|
||||
|
||||
impl<'t, 'q> WordLevelIterator<'t, 'q> {
|
||||
fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> {
|
||||
fn new(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
word: Cow<'q, str>,
|
||||
in_prefix_cache: bool,
|
||||
) -> heed::Result<Option<Self>> {
|
||||
match ctx.word_position_last_level(&word, in_prefix_cache)? {
|
||||
Some(level) => {
|
||||
Some(level) => {
|
||||
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32);
|
||||
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
|
||||
Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None }))
|
||||
},
|
||||
let inner =
|
||||
ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
|
||||
Ok(Some(Self {
|
||||
inner,
|
||||
level,
|
||||
interval_size,
|
||||
word,
|
||||
in_prefix_cache,
|
||||
inner_next: None,
|
||||
current_interval: None,
|
||||
}))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> {
|
||||
fn dig(
|
||||
&self,
|
||||
ctx: &'t dyn Context<'t>,
|
||||
level: &TreeLevel,
|
||||
left_interval: Option<u32>,
|
||||
) -> heed::Result<Self> {
|
||||
let level = *level.min(&self.level);
|
||||
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32);
|
||||
let word = self.word.clone();
|
||||
let in_prefix_cache = self.in_prefix_cache;
|
||||
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
|
||||
let inner =
|
||||
ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
|
||||
|
||||
Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None})
|
||||
Ok(Self {
|
||||
inner,
|
||||
level,
|
||||
interval_size,
|
||||
word,
|
||||
in_prefix_cache,
|
||||
inner_next: None,
|
||||
current_interval: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
|
||||
fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left }
|
||||
fn is_next_interval(last_right: u32, next_left: u32) -> bool {
|
||||
last_right + 1 == next_left
|
||||
}
|
||||
|
||||
let inner_next = match self.inner_next.take() {
|
||||
Some(inner_next) => Some(inner_next),
|
||||
None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)),
|
||||
None => self
|
||||
.inner
|
||||
.next()
|
||||
.transpose()?
|
||||
.map(|((_, _, left, right), docids)| (left, right, docids)),
|
||||
};
|
||||
|
||||
match inner_next {
|
||||
Some((left, right, docids)) => {
|
||||
match self.current_interval {
|
||||
Some((last_left, last_right)) if !is_next_interval(last_right, left) => {
|
||||
let blank_left = last_left + self.interval_size;
|
||||
let blank_right = last_right + self.interval_size;
|
||||
self.current_interval = Some((blank_left, blank_right));
|
||||
self.inner_next = Some((left, right, docids));
|
||||
Ok(Some((blank_left, blank_right, RoaringBitmap::new())))
|
||||
},
|
||||
_ => {
|
||||
self.current_interval = Some((left, right));
|
||||
Ok(Some((left, right, docids)))
|
||||
}
|
||||
Some((left, right, docids)) => match self.current_interval {
|
||||
Some((last_left, last_right)) if !is_next_interval(last_right, left) => {
|
||||
let blank_left = last_left + self.interval_size;
|
||||
let blank_right = last_right + self.interval_size;
|
||||
self.current_interval = Some((blank_left, blank_right));
|
||||
self.inner_next = Some((left, right, docids));
|
||||
Ok(Some((blank_left, blank_right, RoaringBitmap::new())))
|
||||
}
|
||||
_ => {
|
||||
self.current_interval = Some((left, right));
|
||||
Ok(Some((left, right, docids)))
|
||||
}
|
||||
},
|
||||
None => Ok(None),
|
||||
@ -228,30 +283,37 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
queries: &'q [Query],
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Option<Self>>
|
||||
{
|
||||
) -> Result<Option<Self>> {
|
||||
let mut inner = Vec::with_capacity(queries.len());
|
||||
for query in queries {
|
||||
match &query.kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if !query.prefix || ctx.in_prefix_cache(&word) {
|
||||
let word = Cow::Borrowed(query.kind.word());
|
||||
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? {
|
||||
if let Some(word_level_iterator) =
|
||||
WordLevelIterator::new(ctx, word, query.prefix)?
|
||||
{
|
||||
inner.push(word_level_iterator);
|
||||
}
|
||||
} else {
|
||||
for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? {
|
||||
for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?
|
||||
{
|
||||
let word = Cow::Owned(word.to_owned());
|
||||
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
|
||||
if let Some(word_level_iterator) =
|
||||
WordLevelIterator::new(ctx, word, false)?
|
||||
{
|
||||
inner.push(word_level_iterator);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? {
|
||||
for (word, _) in
|
||||
word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?
|
||||
{
|
||||
let word = Cow::Owned(word.to_owned());
|
||||
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
|
||||
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)?
|
||||
{
|
||||
inner.push(word_level_iterator);
|
||||
}
|
||||
}
|
||||
@ -284,17 +346,28 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
|
||||
Some(parent) => {
|
||||
let parent = parent.dig(ctx)?;
|
||||
(parent.level.min(self.level), Some(Box::new(parent)))
|
||||
},
|
||||
}
|
||||
None => (self.level.saturating_sub(1), None),
|
||||
};
|
||||
|
||||
let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten();
|
||||
let left_interval = self
|
||||
.accumulator
|
||||
.get(self.interval_to_skip)
|
||||
.map(|opt| opt.as_ref().map(|(left, _, _)| *left))
|
||||
.flatten();
|
||||
let mut inner = Vec::with_capacity(self.inner.len());
|
||||
for word_level_iterator in self.inner.iter() {
|
||||
inner.push(word_level_iterator.dig(ctx, &level, left_interval)?);
|
||||
}
|
||||
|
||||
Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0})
|
||||
Ok(Self {
|
||||
parent,
|
||||
inner,
|
||||
level,
|
||||
accumulator: vec![],
|
||||
parent_accumulator: vec![],
|
||||
interval_to_skip: 0,
|
||||
})
|
||||
}
|
||||
|
||||
fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
|
||||
@ -305,12 +378,12 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
|
||||
let wli_u8_level = Into::<u8>::into(wli.level);
|
||||
let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32);
|
||||
for _ in 0..accumulated_count {
|
||||
if let Some((next_left, _, next_docids)) = wli.next()? {
|
||||
accumulated = match accumulated.take(){
|
||||
if let Some((next_left, _, next_docids)) = wli.next()? {
|
||||
accumulated = match accumulated.take() {
|
||||
Some((acc_left, acc_right, mut acc_docids)) => {
|
||||
acc_docids |= next_docids;
|
||||
Some((acc_left, acc_right, acc_docids))
|
||||
},
|
||||
}
|
||||
None => Some((next_left, next_left + interval_size, next_docids)),
|
||||
};
|
||||
}
|
||||
@ -322,7 +395,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
|
||||
|
||||
/// return the next meta-interval created from inner WordLevelIterators,
|
||||
/// and from the chained QueryLevelIterator, if any.
|
||||
fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
|
||||
fn next(
|
||||
&mut self,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
tree_level: TreeLevel,
|
||||
) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
|
||||
let parent_result = match self.parent.as_mut() {
|
||||
Some(parent) => Some(parent.next(allowed_candidates, tree_level)?),
|
||||
None => None,
|
||||
@ -335,22 +412,30 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
|
||||
&self.parent_accumulator,
|
||||
&self.accumulator,
|
||||
self.interval_to_skip,
|
||||
allowed_candidates
|
||||
allowed_candidates,
|
||||
);
|
||||
self.accumulator.push(inner_next);
|
||||
self.parent_accumulator.push(parent_next);
|
||||
let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None;
|
||||
|
||||
for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) {
|
||||
for current in self
|
||||
.accumulator
|
||||
.iter()
|
||||
.rev()
|
||||
.zip(self.parent_accumulator.iter())
|
||||
.skip(self.interval_to_skip)
|
||||
{
|
||||
if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current {
|
||||
match merged_interval.as_mut() {
|
||||
Some((_, _, merged_docids)) => *merged_docids |= a & b,
|
||||
None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)),
|
||||
None => {
|
||||
merged_interval = Some((left_a + left_b, right_a + right_b, a & b))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(merged_interval)
|
||||
},
|
||||
}
|
||||
None => {
|
||||
let level = self.level;
|
||||
match self.inner_next(level)? {
|
||||
@ -358,12 +443,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
|
||||
self.accumulator = vec![Some((left, right, RoaringBitmap::new()))];
|
||||
candidates &= allowed_candidates;
|
||||
Ok(Some((left, right, candidates)))
|
||||
|
||||
},
|
||||
}
|
||||
None => {
|
||||
self.accumulator = vec![None];
|
||||
Ok(None)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -379,16 +463,18 @@ fn interval_to_skip(
|
||||
already_skiped: usize,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> usize {
|
||||
parent_accumulator.iter()
|
||||
parent_accumulator
|
||||
.iter()
|
||||
.zip(current_accumulator.iter())
|
||||
.skip(already_skiped)
|
||||
.take_while(|(parent, current)| {
|
||||
let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
|
||||
let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
|
||||
let skip_current = current
|
||||
.as_ref()
|
||||
.map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
|
||||
skip_parent && skip_current
|
||||
})
|
||||
.count()
|
||||
|
||||
}
|
||||
|
||||
/// A Branch represents a possible alternative of the original query and is built from the Query Tree,
|
||||
@ -410,7 +496,7 @@ impl<'t, 'q> Branch<'t, 'q> {
|
||||
self.last_result = last_result;
|
||||
self.tree_level = tree_level;
|
||||
Ok(true)
|
||||
},
|
||||
}
|
||||
None => Ok(false),
|
||||
}
|
||||
}
|
||||
@ -429,7 +515,7 @@ impl<'t, 'q> Branch<'t, 'q> {
|
||||
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
|
||||
let (left, right, _) = self.last_result;
|
||||
|
||||
self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new());
|
||||
self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new());
|
||||
}
|
||||
|
||||
/// return the score of the current inner interval.
|
||||
@ -477,31 +563,31 @@ fn initialize_query_level_iterators<'t, 'q>(
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<BinaryHeap<Branch<'t, 'q>>> {
|
||||
|
||||
let mut positions = BinaryHeap::with_capacity(branches.len());
|
||||
for branch in branches {
|
||||
let mut branch_positions = Vec::with_capacity(branch.len());
|
||||
for queries in branch {
|
||||
for queries in branch {
|
||||
match QueryLevelIterator::new(ctx, queries, wdcache)? {
|
||||
Some(qli) => branch_positions.push(qli),
|
||||
None => {
|
||||
// the branch seems to be invalid, so we skip it.
|
||||
branch_positions.clear();
|
||||
break;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
// QueryLevelIterators need to be sorted by level and folded in descending order.
|
||||
branch_positions.sort_unstable_by_key(|qli| qli.level);
|
||||
let folded_query_level_iterators = branch_positions
|
||||
.into_iter()
|
||||
.fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold {
|
||||
Some(fold) => {
|
||||
qli.parent(fold);
|
||||
Some(qli)
|
||||
},
|
||||
None => Some(qli),
|
||||
});
|
||||
let folded_query_level_iterators =
|
||||
branch_positions.into_iter().fold(None, |fold: Option<QueryLevelIterator>, mut qli| {
|
||||
match fold {
|
||||
Some(fold) => {
|
||||
qli.parent(fold);
|
||||
Some(qli)
|
||||
}
|
||||
None => Some(qli),
|
||||
}
|
||||
});
|
||||
|
||||
if let Some(mut folded_query_level_iterators) = folded_query_level_iterators {
|
||||
let tree_level = folded_query_level_iterators.level;
|
||||
@ -526,9 +612,9 @@ fn set_compute_candidates<'t>(
|
||||
branches: &FlattenedQueryTree,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Option<RoaringBitmap>>
|
||||
{
|
||||
let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?;
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
let mut branches_heap =
|
||||
initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?;
|
||||
let lowest_level = TreeLevel::min_value();
|
||||
let mut final_candidates: Option<(u32, RoaringBitmap)> = None;
|
||||
let mut allowed_candidates = allowed_candidates.clone();
|
||||
@ -539,15 +625,18 @@ fn set_compute_candidates<'t>(
|
||||
// if current is worse than best we break to return
|
||||
// candidates that correspond to the best rank
|
||||
if let Some((best_rank, _)) = final_candidates {
|
||||
if branch_rank > best_rank { break }
|
||||
if branch_rank > best_rank {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let _left = branch.last_result.0;
|
||||
let candidates = take(&mut branch.last_result.2);
|
||||
if candidates.is_empty() {
|
||||
// we don't have candidates, get next interval.
|
||||
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
|
||||
}
|
||||
else if is_lowest_level {
|
||||
if !branch.next(&allowed_candidates)? {
|
||||
PeekMut::pop(branch);
|
||||
}
|
||||
} else if is_lowest_level {
|
||||
// we have candidates, but we can't dig deeper.
|
||||
allowed_candidates -= &candidates;
|
||||
final_candidates = match final_candidates.take() {
|
||||
@ -556,19 +645,20 @@ fn set_compute_candidates<'t>(
|
||||
best_candidates |= candidates;
|
||||
branch.lazy_next();
|
||||
Some((best_rank, best_candidates))
|
||||
},
|
||||
}
|
||||
// we take current candidates as best candidates
|
||||
None => {
|
||||
branch.lazy_next();
|
||||
Some((branch_rank, candidates))
|
||||
},
|
||||
}
|
||||
};
|
||||
} else {
|
||||
// we have candidates, let's dig deeper in levels.
|
||||
branch.dig(ctx)?;
|
||||
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
|
||||
if !branch.next(&allowed_candidates)? {
|
||||
PeekMut::pop(branch);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Ok(final_candidates.map(|(_rank, candidates)| candidates))
|
||||
@ -578,9 +668,11 @@ fn linear_compute_candidates(
|
||||
ctx: &dyn Context,
|
||||
branches: &FlattenedQueryTree,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> Result<BTreeMap<u64, RoaringBitmap>>
|
||||
{
|
||||
fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
|
||||
) -> Result<BTreeMap<u64, RoaringBitmap>> {
|
||||
fn compute_candidate_rank(
|
||||
branches: &FlattenedQueryTree,
|
||||
words_positions: HashMap<String, RoaringBitmap>,
|
||||
) -> u64 {
|
||||
let mut min_rank = u64::max_value();
|
||||
for branch in branches {
|
||||
let branch_len = branch.len();
|
||||
@ -593,17 +685,20 @@ fn linear_compute_candidates(
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if *prefix {
|
||||
word_derivations(word, true, 0, &words_positions)
|
||||
.flat_map(|positions| positions.iter().next()).min()
|
||||
.flat_map(|positions| positions.iter().next())
|
||||
.min()
|
||||
} else {
|
||||
words_positions.get(word)
|
||||
words_positions
|
||||
.get(word)
|
||||
.map(|positions| positions.iter().next())
|
||||
.flatten()
|
||||
}
|
||||
},
|
||||
}
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
word_derivations(word, *prefix, *typo, &words_positions)
|
||||
.flat_map(|positions| positions.iter().next()).min()
|
||||
},
|
||||
.flat_map(|positions| positions.iter().next())
|
||||
.min()
|
||||
}
|
||||
};
|
||||
|
||||
match (position, current_position) {
|
||||
@ -627,9 +722,11 @@ fn linear_compute_candidates(
|
||||
branch_rank.sort_unstable();
|
||||
// because several words in the same query can't all match at position 0,
|
||||
// we subtract the word index from the position.
|
||||
let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
|
||||
let branch_rank: u64 =
|
||||
branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
|
||||
// here we compute the mean over the words of the branch
|
||||
min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
|
||||
min_rank =
|
||||
min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
|
||||
}
|
||||
}
|
||||
|
||||
@ -641,8 +738,7 @@ fn linear_compute_candidates(
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap>
|
||||
{
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap> {
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||
use levenshtein_automata::Distance;
|
||||
@ -680,25 +776,26 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
|
||||
}
|
||||
}
|
||||
out
|
||||
},
|
||||
}
|
||||
None => recurse(head),
|
||||
}
|
||||
}
|
||||
|
||||
fn recurse(op: &Operation) -> FlattenedQueryTree {
|
||||
match op {
|
||||
And(ops) => {
|
||||
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
|
||||
},
|
||||
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
|
||||
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
|
||||
} else {
|
||||
ops.iter().map(recurse).flatten().collect()
|
||||
},
|
||||
And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)),
|
||||
Or(_, ops) => {
|
||||
if ops.iter().all(|op| op.query().is_some()) {
|
||||
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
|
||||
} else {
|
||||
ops.iter().map(recurse).flatten().collect()
|
||||
}
|
||||
}
|
||||
Phrase(words) => {
|
||||
let queries = words.iter().map(|word| {
|
||||
vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}]
|
||||
}).collect();
|
||||
let queries = words
|
||||
.iter()
|
||||
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
|
||||
.collect();
|
||||
vec![queries]
|
||||
}
|
||||
Operation::Query(query) => vec![vec![vec![query.clone()]]],
|
||||
@ -712,28 +809,43 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
|
||||
use crate::search::criteria::QueryKind;
|
||||
use super::*;
|
||||
use crate::search::criteria::QueryKind;
|
||||
|
||||
#[test]
|
||||
fn simple_flatten_query_tree() {
|
||||
let query_tree = Operation::Or(false, vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
|
||||
]),
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }),
|
||||
Operation::Or(false, vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }),
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
|
||||
]),
|
||||
let query_tree = Operation::Or(
|
||||
false,
|
||||
vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
|
||||
]),
|
||||
]),
|
||||
]);
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }),
|
||||
Operation::Or(
|
||||
false,
|
||||
vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact(S("thefish")),
|
||||
}),
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact(S("the")),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact(S("fish")),
|
||||
}),
|
||||
]),
|
||||
],
|
||||
),
|
||||
]),
|
||||
],
|
||||
);
|
||||
|
||||
let expected = vec![
|
||||
vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]],
|
||||
|
@ -2,19 +2,15 @@ use std::convert::TryFrom;
|
||||
use std::mem::take;
|
||||
use std::ops::BitOr;
|
||||
|
||||
use itertools::Itertools;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
|
||||
use crate::search::criteria::{
|
||||
Context,
|
||||
Criterion,
|
||||
CriterionParameters,
|
||||
CriterionResult,
|
||||
resolve_query_tree,
|
||||
resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
|
||||
};
|
||||
use crate::{TreeLevel, Result};
|
||||
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
|
||||
use crate::{Result, TreeLevel};
|
||||
|
||||
pub struct Exactness<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
@ -26,7 +22,11 @@ pub struct Exactness<'t> {
|
||||
}
|
||||
|
||||
impl<'t> Exactness<'t> {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>, primitive_query: &[PrimitiveQueryPart]) -> heed::Result<Self> {
|
||||
pub fn new(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
primitive_query: &[PrimitiveQueryPart],
|
||||
) -> heed::Result<Self> {
|
||||
let mut query: Vec<_> = Vec::with_capacity(primitive_query.len());
|
||||
for part in primitive_query {
|
||||
query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?);
|
||||
@ -59,7 +59,7 @@ impl<'t> Criterion for Exactness<'t> {
|
||||
// reset state
|
||||
self.state = None;
|
||||
self.query_tree = None;
|
||||
},
|
||||
}
|
||||
Some(state) => {
|
||||
let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?;
|
||||
self.state = state;
|
||||
@ -70,40 +70,51 @@ impl<'t> Criterion for Exactness<'t> {
|
||||
filtered_candidates: None,
|
||||
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
||||
}));
|
||||
},
|
||||
None => {
|
||||
match self.parent.next(params)? {
|
||||
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => {
|
||||
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
|
||||
- params.excluded_candidates
|
||||
}
|
||||
};
|
||||
|
||||
match bucket_candidates {
|
||||
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
||||
None => self.bucket_candidates |= &candidates,
|
||||
}
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
self.state = Some(State::new(candidates));
|
||||
self.query_tree = Some(query_tree);
|
||||
},
|
||||
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
None => return Ok(None),
|
||||
match bucket_candidates {
|
||||
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
||||
None => self.bucket_candidates |= &candidates,
|
||||
}
|
||||
|
||||
self.state = Some(State::new(candidates));
|
||||
self.query_tree = Some(query_tree);
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -125,9 +136,9 @@ impl State {
|
||||
|
||||
fn difference_with(&mut self, lhs: &RoaringBitmap) {
|
||||
match self {
|
||||
Self::ExactAttribute(candidates) |
|
||||
Self::AttributeStartsWith(candidates) |
|
||||
Self::ExactWords(candidates) => *candidates -= lhs,
|
||||
Self::ExactAttribute(candidates)
|
||||
| Self::AttributeStartsWith(candidates)
|
||||
| Self::ExactWords(candidates) => *candidates -= lhs,
|
||||
Self::Remainings(candidates_array) => {
|
||||
candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs);
|
||||
candidates_array.retain(|candidates| !candidates.is_empty());
|
||||
@ -137,9 +148,9 @@ impl State {
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
match self {
|
||||
Self::ExactAttribute(candidates) |
|
||||
Self::AttributeStartsWith(candidates) |
|
||||
Self::ExactWords(candidates) => candidates.is_empty(),
|
||||
Self::ExactAttribute(candidates)
|
||||
| Self::AttributeStartsWith(candidates)
|
||||
| Self::ExactWords(candidates) => candidates.is_empty(),
|
||||
Self::Remainings(candidates_array) => {
|
||||
candidates_array.iter().all(RoaringBitmap::is_empty)
|
||||
}
|
||||
@ -158,8 +169,7 @@ fn resolve_state(
|
||||
ctx: &dyn Context,
|
||||
state: State,
|
||||
query: &[ExactQueryPart],
|
||||
) -> Result<(RoaringBitmap, Option<State>)>
|
||||
{
|
||||
) -> Result<(RoaringBitmap, Option<State>)> {
|
||||
use State::*;
|
||||
match state {
|
||||
ExactAttribute(mut allowed_candidates) => {
|
||||
@ -167,8 +177,11 @@ fn resolve_state(
|
||||
if let Ok(query_len) = u8::try_from(query.len()) {
|
||||
let attributes_ids = ctx.searchable_fields_ids()?;
|
||||
for id in attributes_ids {
|
||||
if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? {
|
||||
let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
|
||||
if let Some(attribute_allowed_docids) =
|
||||
ctx.field_id_word_count_docids(id, query_len)?
|
||||
{
|
||||
let mut attribute_candidates_array =
|
||||
attribute_start_with_docids(ctx, id as u32, query)?;
|
||||
attribute_candidates_array.push(attribute_allowed_docids);
|
||||
candidates |= intersection_of(attribute_candidates_array.iter().collect());
|
||||
}
|
||||
@ -181,12 +194,13 @@ fn resolve_state(
|
||||
}
|
||||
|
||||
Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
|
||||
},
|
||||
}
|
||||
AttributeStartsWith(mut allowed_candidates) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let attributes_ids = ctx.searchable_fields_ids()?;
|
||||
for id in attributes_ids {
|
||||
let attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
|
||||
let attribute_candidates_array =
|
||||
attribute_start_with_docids(ctx, id as u32, query)?;
|
||||
candidates |= intersection_of(attribute_candidates_array.iter().collect());
|
||||
}
|
||||
|
||||
@ -195,7 +209,7 @@ fn resolve_state(
|
||||
// remove current candidates from allowed candidates
|
||||
allowed_candidates -= &candidates;
|
||||
Ok((candidates, Some(ExactWords(allowed_candidates))))
|
||||
},
|
||||
}
|
||||
ExactWords(mut allowed_candidates) => {
|
||||
let number_of_part = query.len();
|
||||
let mut parts_candidates_array = Vec::with_capacity(number_of_part);
|
||||
@ -210,7 +224,7 @@ fn resolve_state(
|
||||
candidates |= synonym_candidates;
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
// compute intersection on pair of words with a proximity of 0.
|
||||
Phrase(phrase) => {
|
||||
let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1));
|
||||
@ -220,8 +234,8 @@ fn resolve_state(
|
||||
Some(docids) => bitmaps.push(docids),
|
||||
None => {
|
||||
bitmaps.clear();
|
||||
break
|
||||
},
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -247,7 +261,7 @@ fn resolve_state(
|
||||
// intersect each word candidates in combinations
|
||||
.map(intersection_of)
|
||||
// union combinations of `c_count` exact words
|
||||
.fold(RoaringBitmap::new(), RoaringBitmap::bitor);
|
||||
.fold(RoaringBitmap::new(), RoaringBitmap::bitor);
|
||||
// only keep allowed candidates
|
||||
combinations_candidates &= &allowed_candidates;
|
||||
// remove current candidates from allowed candidates
|
||||
@ -261,7 +275,7 @@ fn resolve_state(
|
||||
candidates_array.reverse();
|
||||
|
||||
Ok((all_exact_candidates, Some(Remainings(candidates_array))))
|
||||
},
|
||||
}
|
||||
// pop the remaining candidates until none are left
|
||||
Remainings(mut candidates_array) => {
|
||||
let candidates = candidates_array.pop().unwrap_or_default();
|
||||
@ -270,12 +284,15 @@ fn resolve_state(
|
||||
} else {
|
||||
Ok((candidates, None))
|
||||
}
|
||||
},
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[ExactQueryPart]) -> heed::Result<Vec<RoaringBitmap>> {
|
||||
fn attribute_start_with_docids(
|
||||
ctx: &dyn Context,
|
||||
attribute_id: u32,
|
||||
query: &[ExactQueryPart],
|
||||
) -> heed::Result<Vec<RoaringBitmap>> {
|
||||
let lowest_level = TreeLevel::min_value();
|
||||
let mut attribute_candidates_array = Vec::new();
|
||||
// start from attribute first position
|
||||
@ -293,7 +310,7 @@ fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[Ex
|
||||
}
|
||||
attribute_candidates_array.push(synonyms_candidates);
|
||||
pos += 1;
|
||||
},
|
||||
}
|
||||
Phrase(phrase) => {
|
||||
for word in phrase {
|
||||
let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?;
|
||||
@ -325,24 +342,30 @@ pub enum ExactQueryPart {
|
||||
}
|
||||
|
||||
impl ExactQueryPart {
|
||||
fn from_primitive_query_part(ctx: &dyn Context, part: &PrimitiveQueryPart) -> heed::Result<Self> {
|
||||
fn from_primitive_query_part(
|
||||
ctx: &dyn Context,
|
||||
part: &PrimitiveQueryPart,
|
||||
) -> heed::Result<Self> {
|
||||
let part = match part {
|
||||
PrimitiveQueryPart::Word(word, _) => {
|
||||
match ctx.synonyms(word)? {
|
||||
Some(synonyms) => {
|
||||
let mut synonyms: Vec<_> = synonyms.into_iter().filter_map(|mut array| {
|
||||
// keep 1 word synonyms only.
|
||||
match array.pop() {
|
||||
Some(word) if array.is_empty() => Some(word),
|
||||
_ => None,
|
||||
}
|
||||
}).collect();
|
||||
let mut synonyms: Vec<_> = synonyms
|
||||
.into_iter()
|
||||
.filter_map(|mut array| {
|
||||
// keep 1 word synonyms only.
|
||||
match array.pop() {
|
||||
Some(word) if array.is_empty() => Some(word),
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
synonyms.push(word.clone());
|
||||
ExactQueryPart::Synonyms(synonyms)
|
||||
},
|
||||
}
|
||||
None => ExactQueryPart::Synonyms(vec![word.clone()]),
|
||||
}
|
||||
},
|
||||
}
|
||||
PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()),
|
||||
};
|
||||
|
||||
|
@ -1,10 +1,10 @@
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::Result;
|
||||
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::search::WordDerivationsCache;
|
||||
use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context};
|
||||
use crate::Result;
|
||||
|
||||
/// The result of a call to the fetcher.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
@ -26,7 +26,12 @@ pub struct Final<'t> {
|
||||
|
||||
impl<'t> Final<'t> {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> {
|
||||
Final { ctx, parent, wdcache: WordDerivationsCache::new(), returned_candidates: RoaringBitmap::new() }
|
||||
Final {
|
||||
ctx,
|
||||
parent,
|
||||
wdcache: WordDerivationsCache::new(),
|
||||
returned_candidates: RoaringBitmap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("Final::{}")]
|
||||
@ -40,10 +45,17 @@ impl<'t> Final<'t> {
|
||||
};
|
||||
|
||||
match self.parent.next(&mut criterion_parameters)? {
|
||||
Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => {
|
||||
Some(CriterionResult {
|
||||
query_tree,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
let mut candidates = match (candidates, query_tree.as_ref()) {
|
||||
(Some(candidates), _) => candidates,
|
||||
(None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates,
|
||||
(None, Some(qt)) => {
|
||||
resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates
|
||||
}
|
||||
(None, None) => self.ctx.documents_ids()? - excluded_candidates,
|
||||
};
|
||||
|
||||
@ -56,7 +68,7 @@ impl<'t> Final<'t> {
|
||||
self.returned_candidates |= &candidates;
|
||||
|
||||
Ok(Some(FinalResult { query_tree, candidates, bucket_candidates }))
|
||||
},
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,18 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::Result;
|
||||
use super::{Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::query_tree::Operation;
|
||||
use super::{Criterion, CriterionResult, CriterionParameters};
|
||||
use crate::Result;
|
||||
|
||||
pub struct Initial {
|
||||
answer: Option<CriterionResult>
|
||||
answer: Option<CriterionResult>,
|
||||
}
|
||||
|
||||
impl Initial {
|
||||
pub fn new(query_tree: Option<Operation>, filtered_candidates: Option<RoaringBitmap>) -> Initial {
|
||||
pub fn new(
|
||||
query_tree: Option<Operation>,
|
||||
filtered_candidates: Option<RoaringBitmap>,
|
||||
) -> Initial {
|
||||
let answer = CriterionResult {
|
||||
query_tree,
|
||||
candidates: None,
|
||||
|
@ -1,29 +1,28 @@
|
||||
use std::collections::HashMap;
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
|
||||
use crate::{Index, DocumentId, Result};
|
||||
|
||||
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
|
||||
use self::asc_desc::AscDesc;
|
||||
use self::attribute::Attribute;
|
||||
use self::exactness::Exactness;
|
||||
use self::r#final::Final;
|
||||
use self::initial::Initial;
|
||||
use self::proximity::Proximity;
|
||||
use self::r#final::Final;
|
||||
use self::typo::Typo;
|
||||
use self::words::Words;
|
||||
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
|
||||
use crate::search::{word_derivations, WordDerivationsCache};
|
||||
use crate::{DocumentId, FieldId, Index, Result, TreeLevel};
|
||||
|
||||
mod asc_desc;
|
||||
mod attribute;
|
||||
mod exactness;
|
||||
pub mod r#final;
|
||||
mod initial;
|
||||
mod proximity;
|
||||
mod typo;
|
||||
mod words;
|
||||
pub mod r#final;
|
||||
|
||||
pub trait Criterion {
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>>;
|
||||
@ -55,7 +54,7 @@ pub struct CriterionParameters<'a> {
|
||||
#[derive(Debug)]
|
||||
enum Candidates {
|
||||
Allowed(RoaringBitmap),
|
||||
Forbidden(RoaringBitmap)
|
||||
Forbidden(RoaringBitmap),
|
||||
}
|
||||
|
||||
impl Default for Candidates {
|
||||
@ -68,17 +67,55 @@ pub trait Context<'c> {
|
||||
fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
|
||||
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_prefix_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
||||
fn in_prefix_cache(&self, word: &str) -> bool;
|
||||
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>;
|
||||
fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option<u32>, right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>;
|
||||
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>;
|
||||
fn docid_words_positions(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
) -> heed::Result<HashMap<String, RoaringBitmap>>;
|
||||
fn word_position_iterator(
|
||||
&self,
|
||||
word: &str,
|
||||
level: TreeLevel,
|
||||
in_prefix_cache: bool,
|
||||
left: Option<u32>,
|
||||
right: Option<u32>,
|
||||
) -> heed::Result<
|
||||
Box<
|
||||
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c,
|
||||
>,
|
||||
>;
|
||||
fn word_position_last_level(
|
||||
&self,
|
||||
word: &str,
|
||||
in_prefix_cache: bool,
|
||||
) -> heed::Result<Option<TreeLevel>>;
|
||||
fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
|
||||
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
|
||||
fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
|
||||
fn field_id_word_count_docids(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
word_count: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_level_position_docids(
|
||||
&self,
|
||||
word: &str,
|
||||
level: TreeLevel,
|
||||
left: u32,
|
||||
right: u32,
|
||||
) -> heed::Result<Option<RoaringBitmap>>;
|
||||
}
|
||||
|
||||
pub struct CriteriaBuilder<'t> {
|
||||
@ -101,12 +138,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
self.index.word_prefix_docids.get(self.rtxn, &word)
|
||||
}
|
||||
|
||||
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
|
||||
fn word_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left, right, proximity);
|
||||
self.index.word_pair_proximity_docids.get(self.rtxn, &key)
|
||||
}
|
||||
|
||||
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
|
||||
fn word_prefix_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left, right, proximity);
|
||||
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)
|
||||
}
|
||||
@ -119,7 +166,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
self.words_prefixes_fst.contains(word)
|
||||
}
|
||||
|
||||
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||
fn docid_words_positions(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||
let mut words_positions = HashMap::new();
|
||||
for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? {
|
||||
let ((_, word), positions) = result?;
|
||||
@ -134,9 +184,12 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
level: TreeLevel,
|
||||
in_prefix_cache: bool,
|
||||
left: Option<u32>,
|
||||
right: Option<u32>
|
||||
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>
|
||||
{
|
||||
right: Option<u32>,
|
||||
) -> heed::Result<
|
||||
Box<
|
||||
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c,
|
||||
>,
|
||||
> {
|
||||
let range = {
|
||||
let left = left.unwrap_or(u32::min_value());
|
||||
let right = right.unwrap_or(u32::max_value());
|
||||
@ -152,7 +205,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
Ok(Box::new(db.range(self.rtxn, &range)?))
|
||||
}
|
||||
|
||||
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> {
|
||||
fn word_position_last_level(
|
||||
&self,
|
||||
word: &str,
|
||||
in_prefix_cache: bool,
|
||||
) -> heed::Result<Option<TreeLevel>> {
|
||||
let range = {
|
||||
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
|
||||
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
|
||||
@ -164,7 +221,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
};
|
||||
let last_level = db
|
||||
.remap_data_type::<heed::types::DecodeIgnore>()
|
||||
.range(self.rtxn, &range)?.last().transpose()?
|
||||
.range(self.rtxn, &range)?
|
||||
.last()
|
||||
.transpose()?
|
||||
.map(|((_, level, _, _), _)| level);
|
||||
|
||||
Ok(last_level)
|
||||
@ -181,12 +240,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
}
|
||||
}
|
||||
|
||||
fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>> {
|
||||
fn field_id_word_count_docids(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
word_count: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (field_id, word_count);
|
||||
self.index.field_id_word_count_docids.get(self.rtxn, &key)
|
||||
}
|
||||
|
||||
fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result<Option<RoaringBitmap>> {
|
||||
fn word_level_position_docids(
|
||||
&self,
|
||||
word: &str,
|
||||
level: TreeLevel,
|
||||
left: u32,
|
||||
right: u32,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (word, level, left, right);
|
||||
self.index.word_level_position_docids.get(self.rtxn, &key)
|
||||
}
|
||||
@ -204,13 +273,13 @@ impl<'t> CriteriaBuilder<'t> {
|
||||
query_tree: Option<Operation>,
|
||||
primitive_query: Option<Vec<PrimitiveQueryPart>>,
|
||||
filtered_candidates: Option<RoaringBitmap>,
|
||||
) -> Result<Final<'t>>
|
||||
{
|
||||
) -> Result<Final<'t>> {
|
||||
use crate::criterion::Criterion as Name;
|
||||
|
||||
let primitive_query = primitive_query.unwrap_or_default();
|
||||
|
||||
let mut criterion = Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>;
|
||||
let mut criterion =
|
||||
Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>;
|
||||
for name in self.index.criteria(&self.rtxn)? {
|
||||
criterion = match name {
|
||||
Name::Typo => Box::new(Typo::new(self, criterion)),
|
||||
@ -218,8 +287,12 @@ impl<'t> CriteriaBuilder<'t> {
|
||||
Name::Proximity => Box::new(Proximity::new(self, criterion)),
|
||||
Name::Attribute => Box::new(Attribute::new(self, criterion)),
|
||||
Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?),
|
||||
Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?),
|
||||
Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?),
|
||||
Name::Asc(field) => {
|
||||
Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?)
|
||||
}
|
||||
Name::Desc(field) => {
|
||||
Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?)
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@ -231,21 +304,20 @@ pub fn resolve_query_tree<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
) -> Result<RoaringBitmap> {
|
||||
fn resolve_operation<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
use Operation::{And, Phrase, Or, Query};
|
||||
) -> Result<RoaringBitmap> {
|
||||
use Operation::{And, Or, Phrase, Query};
|
||||
|
||||
match query_tree {
|
||||
And(ops) => {
|
||||
let mut ops = ops.iter().map(|op| {
|
||||
resolve_operation(ctx, op, wdcache)
|
||||
}).collect::<Result<Vec<_>>>()?;
|
||||
let mut ops = ops
|
||||
.iter()
|
||||
.map(|op| resolve_operation(ctx, op, wdcache))
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
ops.sort_unstable_by_key(|cds| cds.len());
|
||||
|
||||
@ -260,7 +332,7 @@ pub fn resolve_query_tree<'t>(
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
}
|
||||
Phrase(words) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_loop = true;
|
||||
@ -276,12 +348,12 @@ pub fn resolve_query_tree<'t>(
|
||||
} else {
|
||||
candidates &= pair_docids;
|
||||
}
|
||||
},
|
||||
None => return Ok(RoaringBitmap::new())
|
||||
}
|
||||
None => return Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
}
|
||||
Or(_, ops) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
for op in ops {
|
||||
@ -289,7 +361,7 @@ pub fn resolve_query_tree<'t>(
|
||||
candidates.union_with(&docids);
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
}
|
||||
Query(q) => Ok(query_docids(ctx, q, wdcache)?),
|
||||
}
|
||||
}
|
||||
@ -297,18 +369,18 @@ pub fn resolve_query_tree<'t>(
|
||||
resolve_operation(ctx, query_tree, wdcache)
|
||||
}
|
||||
|
||||
|
||||
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
|
||||
ctx: &dyn Context,
|
||||
left_words: &[(T, u8)],
|
||||
right_words: &[(U, u8)],
|
||||
proximity: u8
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
proximity: u8,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (left, _l_typo) in left_words {
|
||||
for (right, _r_typo) in right_words {
|
||||
let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default();
|
||||
let current_docids = ctx
|
||||
.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?
|
||||
.unwrap_or_default();
|
||||
docids.union_with(&current_docids);
|
||||
}
|
||||
}
|
||||
@ -319,8 +391,7 @@ fn query_docids(
|
||||
ctx: &dyn Context,
|
||||
query: &Query,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
) -> Result<RoaringBitmap> {
|
||||
match &query.kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if query.prefix && ctx.in_prefix_cache(&word) {
|
||||
@ -336,7 +407,7 @@ fn query_docids(
|
||||
} else {
|
||||
Ok(ctx.word_docids(&word)?.unwrap_or_default())
|
||||
}
|
||||
},
|
||||
}
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?;
|
||||
let mut docids = RoaringBitmap::new();
|
||||
@ -345,7 +416,7 @@ fn query_docids(
|
||||
docids.union_with(&current_docids);
|
||||
}
|
||||
Ok(docids)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -355,8 +426,7 @@ fn query_pair_proximity_docids(
|
||||
right: &Query,
|
||||
proximity: u8,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
) -> Result<RoaringBitmap> {
|
||||
if proximity >= 8 {
|
||||
let mut candidates = query_docids(ctx, left, wdcache)?;
|
||||
let right_candidates = query_docids(ctx, right, wdcache)?;
|
||||
@ -368,20 +438,31 @@ fn query_pair_proximity_docids(
|
||||
match (&left.kind, &right.kind) {
|
||||
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
|
||||
if prefix && ctx.in_prefix_cache(&right) {
|
||||
Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default())
|
||||
Ok(ctx
|
||||
.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
|
||||
.unwrap_or_default())
|
||||
} else if prefix {
|
||||
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
} else {
|
||||
Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default())
|
||||
Ok(ctx
|
||||
.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
},
|
||||
}
|
||||
(QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => {
|
||||
let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
|
||||
let l_words =
|
||||
word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
|
||||
if prefix && ctx.in_prefix_cache(&right) {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (left, _) in l_words {
|
||||
let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default();
|
||||
let current_docids = ctx
|
||||
.word_prefix_pair_proximity_docids(
|
||||
left.as_ref(),
|
||||
right.as_ref(),
|
||||
proximity,
|
||||
)?
|
||||
.unwrap_or_default();
|
||||
docids.union_with(&current_docids);
|
||||
}
|
||||
Ok(docids)
|
||||
@ -391,28 +472,36 @@ fn query_pair_proximity_docids(
|
||||
} else {
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
||||
}
|
||||
},
|
||||
}
|
||||
(QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
|
||||
let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
},
|
||||
(QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => {
|
||||
let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
|
||||
}
|
||||
(
|
||||
QueryKind::Tolerant { typo: l_typo, word: left },
|
||||
QueryKind::Tolerant { typo: r_typo, word: right },
|
||||
) => {
|
||||
let l_words =
|
||||
word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
|
||||
let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
use maplit::hashmap;
|
||||
use rand::{Rng, SeedableRng, rngs::StdRng};
|
||||
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
fn s(s: &str) -> String { s.to_string() }
|
||||
use maplit::hashmap;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
use super::*;
|
||||
|
||||
fn s(s: &str) -> String {
|
||||
s.to_string()
|
||||
}
|
||||
pub struct TestContext<'t> {
|
||||
words_fst: fst::Set<Cow<'t, [u8]>>,
|
||||
word_docids: HashMap<String, RoaringBitmap>,
|
||||
@ -435,12 +524,22 @@ pub mod test {
|
||||
Ok(self.word_prefix_docids.get(&word.to_string()).cloned())
|
||||
}
|
||||
|
||||
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
|
||||
fn word_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left.to_string(), right.to_string(), proximity.into());
|
||||
Ok(self.word_pair_proximity_docids.get(&key).cloned())
|
||||
}
|
||||
|
||||
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
|
||||
fn word_prefix_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left.to_string(), right.to_string(), proximity.into());
|
||||
Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
|
||||
}
|
||||
@ -453,24 +552,44 @@ pub mod test {
|
||||
self.word_prefix_docids.contains_key(&word.to_string())
|
||||
}
|
||||
|
||||
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||
fn docid_words_positions(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||
if let Some(docid_words) = self.docid_words.get(&docid) {
|
||||
Ok(docid_words
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))))
|
||||
.collect()
|
||||
)
|
||||
.map(|(i, w)| {
|
||||
(w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))
|
||||
})
|
||||
.collect())
|
||||
} else {
|
||||
Ok(HashMap::new())
|
||||
}
|
||||
}
|
||||
|
||||
fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option<u32>, _right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> {
|
||||
fn word_position_iterator(
|
||||
&self,
|
||||
_word: &str,
|
||||
_level: TreeLevel,
|
||||
_in_prefix_cache: bool,
|
||||
_left: Option<u32>,
|
||||
_right: Option<u32>,
|
||||
) -> heed::Result<
|
||||
Box<
|
||||
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>>
|
||||
+ 'c,
|
||||
>,
|
||||
> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> {
|
||||
fn word_position_last_level(
|
||||
&self,
|
||||
_word: &str,
|
||||
_in_prefix_cache: bool,
|
||||
) -> heed::Result<Option<TreeLevel>> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
@ -478,15 +597,25 @@ pub mod test {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>> {
|
||||
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> heed::Result<Option<RoaringBitmap>> {
|
||||
fn word_level_position_docids(
|
||||
&self,
|
||||
_word: &str,
|
||||
_level: TreeLevel,
|
||||
_left: u32,
|
||||
_right: u32,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result<Option<RoaringBitmap>> {
|
||||
fn field_id_word_count_docids(
|
||||
&self,
|
||||
_field_id: FieldId,
|
||||
_word_count: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
@ -506,7 +635,7 @@ pub mod test {
|
||||
RoaringBitmap::from_sorted_iter(values.into_iter())
|
||||
}
|
||||
|
||||
let word_docids = hashmap!{
|
||||
let word_docids = hashmap! {
|
||||
s("hello") => random_postings(rng, 1500),
|
||||
s("hi") => random_postings(rng, 4000),
|
||||
s("word") => random_postings(rng, 2500),
|
||||
@ -530,7 +659,7 @@ pub mod test {
|
||||
}
|
||||
}
|
||||
|
||||
let word_prefix_docids = hashmap!{
|
||||
let word_prefix_docids = hashmap! {
|
||||
s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")],
|
||||
s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")],
|
||||
s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")],
|
||||
@ -540,7 +669,9 @@ pub mod test {
|
||||
let mut word_prefix_pair_proximity_docids = HashMap::new();
|
||||
for (lword, lcandidates) in &word_docids {
|
||||
for (rword, rcandidates) in &word_docids {
|
||||
if lword == rword { continue }
|
||||
if lword == rword {
|
||||
continue;
|
||||
}
|
||||
let candidates = lcandidates & rcandidates;
|
||||
for candidate in candidates {
|
||||
if let Some(docid_words) = docid_words.get(&candidate) {
|
||||
@ -551,24 +682,31 @@ pub mod test {
|
||||
} else {
|
||||
(s(lword), s(rword), (lposition - rposition + 1) as i32)
|
||||
};
|
||||
let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
|
||||
let docids = word_pair_proximity_docids
|
||||
.entry(key)
|
||||
.or_insert(RoaringBitmap::new());
|
||||
docids.push(candidate);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (pword, pcandidates) in &word_prefix_docids {
|
||||
if lword.starts_with(pword) { continue }
|
||||
if lword.starts_with(pword) {
|
||||
continue;
|
||||
}
|
||||
let candidates = lcandidates & pcandidates;
|
||||
for candidate in candidates {
|
||||
if let Some(docid_words) = docid_words.get(&candidate) {
|
||||
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
|
||||
let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
|
||||
let rposition =
|
||||
docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
|
||||
let key = if lposition < rposition {
|
||||
(s(lword), s(pword), (rposition - lposition) as i32)
|
||||
} else {
|
||||
(s(lword), s(pword), (lposition - rposition + 1) as i32)
|
||||
};
|
||||
let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
|
||||
let docids = word_prefix_pair_proximity_docids
|
||||
.entry(key)
|
||||
.or_insert(RoaringBitmap::new());
|
||||
docids.push(candidate);
|
||||
}
|
||||
}
|
||||
|
@ -2,22 +2,16 @@ use std::collections::btree_map::{self, BTreeMap};
|
||||
use std::collections::hash_map::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::query_tree::{maximum_proximity, Operation, Query};
|
||||
use crate::search::{build_dfa, WordDerivationsCache};
|
||||
use crate::search::{query_tree::QueryKind};
|
||||
use crate::{DocumentId, Position, Result};
|
||||
use super::{
|
||||
Context,
|
||||
Criterion,
|
||||
CriterionParameters,
|
||||
CriterionResult,
|
||||
query_docids,
|
||||
query_pair_proximity_docids,
|
||||
resolve_query_tree,
|
||||
query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion,
|
||||
CriterionParameters, CriterionResult,
|
||||
};
|
||||
use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind};
|
||||
use crate::search::{build_dfa, WordDerivationsCache};
|
||||
use crate::{DocumentId, Position, Result};
|
||||
|
||||
type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;
|
||||
|
||||
@ -63,28 +57,33 @@ impl<'t> Criterion for Proximity<'t> {
|
||||
}
|
||||
|
||||
loop {
|
||||
debug!("Proximity at iteration {} (max prox {:?}) ({:?})",
|
||||
debug!(
|
||||
"Proximity at iteration {} (max prox {:?}) ({:?})",
|
||||
self.proximity,
|
||||
self.state.as_ref().map(|(mp, _, _)| mp),
|
||||
self.state.as_ref().map(|(_, _, cd)| cd),
|
||||
);
|
||||
|
||||
match &mut self.state {
|
||||
Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => {
|
||||
Some((max_prox, _, allowed_candidates))
|
||||
if allowed_candidates.is_empty() || self.proximity > *max_prox =>
|
||||
{
|
||||
self.state = None; // reset state
|
||||
},
|
||||
}
|
||||
Some((_, query_tree, allowed_candidates)) => {
|
||||
let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD {
|
||||
let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD
|
||||
&& self.proximity > PROXIMITY_THRESHOLD
|
||||
{
|
||||
if let Some(cache) = self.plane_sweep_cache.as_mut() {
|
||||
match cache.next() {
|
||||
Some((p, candidates)) => {
|
||||
self.proximity = p;
|
||||
candidates
|
||||
},
|
||||
}
|
||||
None => {
|
||||
self.state = None; // reset state
|
||||
continue
|
||||
},
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let cache = resolve_plane_sweep_candidates(
|
||||
@ -95,9 +94,10 @@ impl<'t> Criterion for Proximity<'t> {
|
||||
)?;
|
||||
self.plane_sweep_cache = Some(cache.into_iter());
|
||||
|
||||
continue
|
||||
continue;
|
||||
}
|
||||
} else { // use set theory based algorithm
|
||||
} else {
|
||||
// use set theory based algorithm
|
||||
resolve_candidates(
|
||||
self.ctx,
|
||||
&query_tree,
|
||||
@ -117,39 +117,50 @@ impl<'t> Criterion for Proximity<'t> {
|
||||
filtered_candidates: None,
|
||||
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
||||
}));
|
||||
},
|
||||
None => {
|
||||
match self.parent.next(params)? {
|
||||
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => {
|
||||
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
|
||||
- params.excluded_candidates
|
||||
}
|
||||
};
|
||||
|
||||
match bucket_candidates {
|
||||
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
||||
None => self.bucket_candidates |= &candidates,
|
||||
}
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
let maximum_proximity = maximum_proximity(&query_tree);
|
||||
self.state = Some((maximum_proximity as u8, query_tree, candidates));
|
||||
self.proximity = 0;
|
||||
self.plane_sweep_cache = None;
|
||||
},
|
||||
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
None => return Ok(None),
|
||||
match bucket_candidates {
|
||||
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
||||
None => self.bucket_candidates |= &candidates,
|
||||
}
|
||||
|
||||
let maximum_proximity = maximum_proximity(&query_tree);
|
||||
self.state = Some((maximum_proximity as u8, query_tree, candidates));
|
||||
self.proximity = 0;
|
||||
self.plane_sweep_cache = None;
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
@ -162,46 +173,48 @@ fn resolve_candidates<'t>(
|
||||
proximity: u8,
|
||||
cache: &mut Cache,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
) -> Result<RoaringBitmap> {
|
||||
fn resolve_operation<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
proximity: u8,
|
||||
cache: &mut Cache,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>>
|
||||
{
|
||||
use Operation::{And, Phrase, Or};
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
||||
use Operation::{And, Or, Phrase};
|
||||
|
||||
let result = match query_tree {
|
||||
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
|
||||
Phrase(words) => if proximity == 0 {
|
||||
let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
let mut candidates = None;
|
||||
for slice in words.windows(2) {
|
||||
let (left, right) = (&slice[0], &slice[1]);
|
||||
match ctx.word_pair_proximity_docids(left, right, 1)? {
|
||||
Some(pair_docids) => {
|
||||
match candidates.as_mut() {
|
||||
Phrase(words) => {
|
||||
if proximity == 0 {
|
||||
let most_left = words
|
||||
.first()
|
||||
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
let most_right = words
|
||||
.last()
|
||||
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
let mut candidates = None;
|
||||
for slice in words.windows(2) {
|
||||
let (left, right) = (&slice[0], &slice[1]);
|
||||
match ctx.word_pair_proximity_docids(left, right, 1)? {
|
||||
Some(pair_docids) => match candidates.as_mut() {
|
||||
Some(candidates) => *candidates &= pair_docids,
|
||||
None => candidates = Some(pair_docids),
|
||||
},
|
||||
None => {
|
||||
candidates = None;
|
||||
break;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
candidates = None;
|
||||
break;
|
||||
}
|
||||
}
|
||||
match (most_left, most_right, candidates) {
|
||||
(Some(l), Some(r), Some(c)) => vec![(l, r, c)],
|
||||
_otherwise => Default::default(),
|
||||
}
|
||||
} else {
|
||||
Default::default()
|
||||
}
|
||||
match (most_left, most_right, candidates) {
|
||||
(Some(l), Some(r), Some(c)) => vec![(l, r, c)],
|
||||
_otherwise => Default::default(),
|
||||
}
|
||||
} else {
|
||||
Default::default()
|
||||
},
|
||||
}
|
||||
Or(_, ops) => {
|
||||
let mut output = Vec::new();
|
||||
for op in ops {
|
||||
@ -209,13 +222,15 @@ fn resolve_candidates<'t>(
|
||||
output.extend(result);
|
||||
}
|
||||
output
|
||||
},
|
||||
Operation::Query(q) => if proximity == 0 {
|
||||
let candidates = query_docids(ctx, q, wdcache)?;
|
||||
vec![(q.clone(), q.clone(), candidates)]
|
||||
} else {
|
||||
Default::default()
|
||||
},
|
||||
}
|
||||
Operation::Query(q) => {
|
||||
if proximity == 0 {
|
||||
let candidates = query_docids(ctx, q, wdcache)?;
|
||||
vec![(q.clone(), q.clone(), candidates)]
|
||||
} else {
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
@ -228,8 +243,7 @@ fn resolve_candidates<'t>(
|
||||
proximity: u8,
|
||||
cache: &mut Cache,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>>
|
||||
{
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
||||
fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
|
||||
(0..=mana.min(left_max)).map(move |m| (m, mana - m))
|
||||
}
|
||||
@ -257,7 +271,8 @@ fn resolve_candidates<'t>(
|
||||
|
||||
for (ll, lr, lcandidates) in lefts {
|
||||
for (rl, rr, rcandidates) in rights {
|
||||
let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
|
||||
let mut candidates =
|
||||
query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
|
||||
if lcandidates.len() < rcandidates.len() {
|
||||
candidates.intersect_with(lcandidates);
|
||||
candidates.intersect_with(rcandidates);
|
||||
@ -282,22 +297,26 @@ fn resolve_candidates<'t>(
|
||||
proximity: u8,
|
||||
cache: &mut Cache,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>>
|
||||
{
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
||||
// Extract the first two elements but gives the tail
|
||||
// that is just after the first element.
|
||||
let next = branches.split_first().map(|(h1, t)| {
|
||||
(h1, t.split_first().map(|(h2, _)| (h2, t)))
|
||||
});
|
||||
let next =
|
||||
branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t))));
|
||||
|
||||
match next {
|
||||
Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache, wdcache),
|
||||
Some((head1, Some((head2, [_])))) => {
|
||||
mdfs_pair(ctx, head1, head2, proximity, cache, wdcache)
|
||||
}
|
||||
Some((head1, Some((head2, tail)))) => {
|
||||
let mut output = Vec::new();
|
||||
for p in 0..=proximity {
|
||||
for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache, wdcache)? {
|
||||
for (lhead, _, head_candidates) in
|
||||
mdfs_pair(ctx, head1, head2, p, cache, wdcache)?
|
||||
{
|
||||
if !head_candidates.is_empty() {
|
||||
for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? {
|
||||
for (_, rtail, mut candidates) in
|
||||
mdfs(ctx, tail, proximity - p, cache, wdcache)?
|
||||
{
|
||||
candidates.intersect_with(&head_candidates);
|
||||
if !candidates.is_empty() {
|
||||
output.push((lhead.clone(), rtail, candidates));
|
||||
@ -307,7 +326,7 @@ fn resolve_candidates<'t>(
|
||||
}
|
||||
}
|
||||
Ok(output)
|
||||
},
|
||||
}
|
||||
Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
|
||||
None => Ok(Default::default()),
|
||||
}
|
||||
@ -325,47 +344,48 @@ fn resolve_plane_sweep_candidates(
|
||||
query_tree: &Operation,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<BTreeMap<u8, RoaringBitmap>>
|
||||
{
|
||||
) -> Result<BTreeMap<u8, RoaringBitmap>> {
|
||||
/// FIXME may be buggy with query like "new new york"
|
||||
fn plane_sweep(
|
||||
groups_positions: Vec<Vec<(Position, u8, Position)>>,
|
||||
consecutive: bool,
|
||||
) -> Result<Vec<(Position, u8, Position)>>
|
||||
{
|
||||
) -> Result<Vec<(Position, u8, Position)>> {
|
||||
fn compute_groups_proximity(
|
||||
groups: &[(usize, (Position, u8, Position))],
|
||||
consecutive: bool,
|
||||
) -> Option<(Position, u8, Position)>
|
||||
{
|
||||
) -> Option<(Position, u8, Position)> {
|
||||
// take the inner proximity of the first group as initial
|
||||
let (_, (_, mut proximity, _)) = groups.first()?;
|
||||
let (_, (left_most_pos, _, _)) = groups.first()?;
|
||||
let (_, (_, _, right_most_pos)) = groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
|
||||
let (_, (_, _, right_most_pos)) =
|
||||
groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
|
||||
|
||||
for pair in groups.windows(2) {
|
||||
if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair {
|
||||
// if two positions are equal, meaning that they share at least a word, we return None
|
||||
if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 {
|
||||
return None
|
||||
return None;
|
||||
}
|
||||
|
||||
let pair_proximity = {
|
||||
// if intervals are disjoint [..].(..)
|
||||
if lpos2 > rpos1 { lpos2 - rpos1 }
|
||||
if lpos2 > rpos1 {
|
||||
lpos2 - rpos1
|
||||
}
|
||||
// if the second interval is a subset of the first [.(..).]
|
||||
else if rpos2 < rpos1 { (lpos2 - lpos1).min(rpos1 - rpos2) }
|
||||
else if rpos2 < rpos1 {
|
||||
(lpos2 - lpos1).min(rpos1 - rpos2)
|
||||
}
|
||||
// if intervals overlaps [.(..].)
|
||||
else { (lpos2 - lpos1).min(rpos2 - rpos1) }
|
||||
else {
|
||||
(lpos2 - lpos1).min(rpos2 - rpos1)
|
||||
}
|
||||
};
|
||||
|
||||
// if groups are in the good order (query order) we remove 1 to the proximity
|
||||
// the proximity is clamped to 7
|
||||
let pair_proximity = if i1 < i2 {
|
||||
(pair_proximity - 1).min(7)
|
||||
} else {
|
||||
pair_proximity.min(7)
|
||||
};
|
||||
let pair_proximity =
|
||||
if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) };
|
||||
|
||||
proximity += pair_proximity as u8 + prox2;
|
||||
}
|
||||
@ -381,7 +401,8 @@ fn resolve_plane_sweep_candidates(
|
||||
|
||||
let groups_len = groups_positions.len();
|
||||
|
||||
let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
|
||||
let mut groups_positions: Vec<_> =
|
||||
groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
|
||||
|
||||
// Pop top elements of each list.
|
||||
let mut current = Vec::with_capacity(groups_len);
|
||||
@ -452,9 +473,8 @@ fn resolve_plane_sweep_candidates(
|
||||
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
||||
words_positions: &HashMap<String, RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Vec<(Position, u8, Position)>>
|
||||
{
|
||||
use Operation::{And, Phrase, Or};
|
||||
) -> Result<Vec<(Position, u8, Position)>> {
|
||||
use Operation::{And, Or, Phrase};
|
||||
|
||||
if let Some(result) = rocache.get(query_tree) {
|
||||
return Ok(result.clone());
|
||||
@ -462,13 +482,20 @@ fn resolve_plane_sweep_candidates(
|
||||
|
||||
let result = match query_tree {
|
||||
And(ops) => {
|
||||
let mut groups_positions = Vec::with_capacity(ops.len());
|
||||
let mut groups_positions = Vec::with_capacity(ops.len());
|
||||
for operation in ops {
|
||||
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
|
||||
let positions = resolve_operation(
|
||||
ctx,
|
||||
operation,
|
||||
docid,
|
||||
rocache,
|
||||
words_positions,
|
||||
wdcache,
|
||||
)?;
|
||||
groups_positions.push(positions);
|
||||
}
|
||||
plane_sweep(groups_positions, false)?
|
||||
},
|
||||
}
|
||||
Phrase(words) => {
|
||||
let mut groups_positions = Vec::with_capacity(words.len());
|
||||
for word in words {
|
||||
@ -479,16 +506,23 @@ fn resolve_plane_sweep_candidates(
|
||||
groups_positions.push(positions);
|
||||
}
|
||||
plane_sweep(groups_positions, true)?
|
||||
},
|
||||
}
|
||||
Or(_, ops) => {
|
||||
let mut result = Vec::new();
|
||||
for op in ops {
|
||||
result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?)
|
||||
result.extend(resolve_operation(
|
||||
ctx,
|
||||
op,
|
||||
docid,
|
||||
rocache,
|
||||
words_positions,
|
||||
wdcache,
|
||||
)?)
|
||||
}
|
||||
|
||||
result.sort_unstable();
|
||||
result
|
||||
},
|
||||
}
|
||||
Operation::Query(Query { prefix, kind }) => {
|
||||
let mut result = Vec::new();
|
||||
match kind {
|
||||
@ -498,9 +532,9 @@ fn resolve_plane_sweep_candidates(
|
||||
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
||||
result.extend(iter);
|
||||
} else if let Some(positions) = words_positions.get(word) {
|
||||
result.extend(positions.iter().map(|p| (p, 0, p)));
|
||||
result.extend(positions.iter().map(|p| (p, 0, p)));
|
||||
}
|
||||
},
|
||||
}
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
let iter = word_derivations(word, *prefix, *typo, &words_positions)
|
||||
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
||||
@ -522,8 +556,7 @@ fn resolve_plane_sweep_candidates(
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap>
|
||||
{
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap> {
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||
use levenshtein_automata::Distance;
|
||||
@ -539,7 +572,7 @@ fn resolve_plane_sweep_candidates(
|
||||
for docid in allowed_candidates {
|
||||
let words_positions = ctx.docid_words_positions(docid)?;
|
||||
resolve_operation_cache.clear();
|
||||
let positions = resolve_operation(
|
||||
let positions = resolve_operation(
|
||||
ctx,
|
||||
query_tree,
|
||||
docid,
|
||||
|
@ -1,20 +1,17 @@
|
||||
use std::{borrow::Cow, collections::HashMap, mem::take};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{
|
||||
query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
|
||||
CriterionResult,
|
||||
};
|
||||
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
|
||||
use crate::search::{word_derivations, WordDerivationsCache};
|
||||
use crate::Result;
|
||||
use super::{
|
||||
Candidates,
|
||||
Context,
|
||||
Criterion,
|
||||
CriterionParameters,
|
||||
CriterionResult,
|
||||
query_docids,
|
||||
resolve_query_tree,
|
||||
};
|
||||
|
||||
/// Maximum number of typo for a word of any length.
|
||||
const MAX_TYPOS_PER_WORD: u8 = 2;
|
||||
@ -54,7 +51,8 @@ impl<'t> Criterion for Typo<'t> {
|
||||
}
|
||||
|
||||
loop {
|
||||
debug!("Typo at iteration {} (max typos {:?}) ({:?})",
|
||||
debug!(
|
||||
"Typo at iteration {} (max typos {:?}) ({:?})",
|
||||
self.typos,
|
||||
self.state.as_ref().map(|(mt, _, _)| mt),
|
||||
self.state.as_ref().map(|(_, _, cd)| cd),
|
||||
@ -63,29 +61,42 @@ impl<'t> Criterion for Typo<'t> {
|
||||
match self.state.as_mut() {
|
||||
Some((max_typos, _, _)) if self.typos > *max_typos => {
|
||||
self.state = None; // reset state
|
||||
},
|
||||
}
|
||||
Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => {
|
||||
self.state = None; // reset state
|
||||
},
|
||||
}
|
||||
Some((_, query_tree, candidates_authorization)) => {
|
||||
let fst = self.ctx.words_fst();
|
||||
let new_query_tree = match self.typos {
|
||||
typos if typos < MAX_TYPOS_PER_WORD => {
|
||||
alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?
|
||||
},
|
||||
typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree(
|
||||
&fst,
|
||||
query_tree.clone(),
|
||||
self.typos,
|
||||
params.wdcache,
|
||||
)?,
|
||||
MAX_TYPOS_PER_WORD => {
|
||||
// When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible,
|
||||
// we keep the altered query tree
|
||||
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?;
|
||||
*query_tree = alterate_query_tree(
|
||||
&fst,
|
||||
query_tree.clone(),
|
||||
self.typos,
|
||||
params.wdcache,
|
||||
)?;
|
||||
// we compute the allowed candidates
|
||||
let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
|
||||
let query_tree_allowed_candidates =
|
||||
resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
|
||||
// we assign the allowed candidates to the candidates authorization.
|
||||
*candidates_authorization = match take(candidates_authorization) {
|
||||
Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates),
|
||||
Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates),
|
||||
Allowed(allowed_candidates) => {
|
||||
Allowed(query_tree_allowed_candidates & allowed_candidates)
|
||||
}
|
||||
Forbidden(forbidden_candidates) => {
|
||||
Allowed(query_tree_allowed_candidates - forbidden_candidates)
|
||||
}
|
||||
};
|
||||
query_tree.clone()
|
||||
},
|
||||
}
|
||||
_otherwise => query_tree.clone(),
|
||||
};
|
||||
|
||||
@ -101,11 +112,11 @@ impl<'t> Criterion for Typo<'t> {
|
||||
Allowed(allowed_candidates) => {
|
||||
candidates &= &*allowed_candidates;
|
||||
*allowed_candidates -= &candidates;
|
||||
},
|
||||
}
|
||||
Forbidden(forbidden_candidates) => {
|
||||
candidates -= &*forbidden_candidates;
|
||||
*forbidden_candidates |= &candidates;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let bucket_candidates = match self.bucket_candidates.as_mut() {
|
||||
@ -121,35 +132,45 @@ impl<'t> Criterion for Typo<'t> {
|
||||
filtered_candidates: None,
|
||||
bucket_candidates: Some(bucket_candidates),
|
||||
}));
|
||||
},
|
||||
None => {
|
||||
match self.parent.next(params)? {
|
||||
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
|
||||
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) {
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
self.bucket_candidates =
|
||||
match (self.bucket_candidates.take(), bucket_candidates) {
|
||||
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
|
||||
(self_bc, parent_bc) => self_bc.or(parent_bc),
|
||||
};
|
||||
|
||||
let candidates = match candidates.or(filtered_candidates) {
|
||||
Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates),
|
||||
None => Candidates::Forbidden(params.excluded_candidates.clone()),
|
||||
};
|
||||
let candidates = match candidates.or(filtered_candidates) {
|
||||
Some(candidates) => {
|
||||
Candidates::Allowed(candidates - params.excluded_candidates)
|
||||
}
|
||||
None => Candidates::Forbidden(params.excluded_candidates.clone()),
|
||||
};
|
||||
|
||||
let maximum_typos = maximum_typo(&query_tree) as u8;
|
||||
self.state = Some((maximum_typos, query_tree, candidates));
|
||||
self.typos = 0;
|
||||
|
||||
},
|
||||
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
None => return Ok(None),
|
||||
let maximum_typos = maximum_typo(&query_tree) as u8;
|
||||
self.state = Some((maximum_typos, query_tree, candidates));
|
||||
self.typos = 0;
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
@ -164,21 +185,19 @@ fn alterate_query_tree(
|
||||
mut query_tree: Operation,
|
||||
number_typos: u8,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Operation>
|
||||
{
|
||||
) -> Result<Operation> {
|
||||
fn recurse(
|
||||
words_fst: &fst::Set<Cow<[u8]>>,
|
||||
operation: &mut Operation,
|
||||
number_typos: u8,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<()>
|
||||
{
|
||||
use Operation::{And, Phrase, Or};
|
||||
) -> Result<()> {
|
||||
use Operation::{And, Or, Phrase};
|
||||
|
||||
match operation {
|
||||
And(ops) | Or(_, ops) => {
|
||||
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
|
||||
},
|
||||
}
|
||||
// Because Phrases don't allow typos, no alteration can be done.
|
||||
Phrase(_words) => return Ok(()),
|
||||
Operation::Query(q) => {
|
||||
@ -193,19 +212,25 @@ fn alterate_query_tree(
|
||||
} else {
|
||||
let typo = *typo.min(&number_typos);
|
||||
let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?;
|
||||
let queries = words.iter().map(|(word, typo)| {
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() },
|
||||
let queries = words
|
||||
.iter()
|
||||
.map(|(word, typo)| {
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::Exact {
|
||||
original_typo: *typo,
|
||||
word: word.to_string(),
|
||||
},
|
||||
})
|
||||
})
|
||||
}).collect();
|
||||
.collect();
|
||||
|
||||
*operation = Operation::or(false, queries);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -219,22 +244,18 @@ fn resolve_candidates<'t>(
|
||||
number_typos: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
) -> Result<RoaringBitmap> {
|
||||
fn resolve_operation<'t>(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: &Operation,
|
||||
number_typos: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
use Operation::{And, Phrase, Or, Query};
|
||||
) -> Result<RoaringBitmap> {
|
||||
use Operation::{And, Or, Phrase, Query};
|
||||
|
||||
match query_tree {
|
||||
And(ops) => {
|
||||
mdfs(ctx, ops, number_typos, cache, wdcache)
|
||||
},
|
||||
And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache),
|
||||
Phrase(words) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_loop = true;
|
||||
@ -250,12 +271,12 @@ fn resolve_candidates<'t>(
|
||||
} else {
|
||||
candidates &= pair_docids;
|
||||
}
|
||||
},
|
||||
None => return Ok(RoaringBitmap::new())
|
||||
}
|
||||
None => return Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
}
|
||||
Or(_, ops) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
for op in ops {
|
||||
@ -263,12 +284,14 @@ fn resolve_candidates<'t>(
|
||||
candidates.union_with(&docids);
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
Query(q) => if q.kind.typo() == number_typos {
|
||||
Ok(query_docids(ctx, q, wdcache)?)
|
||||
} else {
|
||||
Ok(RoaringBitmap::new())
|
||||
},
|
||||
}
|
||||
Query(q) => {
|
||||
if q.kind.typo() == number_typos {
|
||||
Ok(query_docids(ctx, q, wdcache)?)
|
||||
} else {
|
||||
Ok(RoaringBitmap::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -278,8 +301,7 @@ fn resolve_candidates<'t>(
|
||||
mana: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
) -> Result<RoaringBitmap> {
|
||||
match branches.split_first() {
|
||||
Some((head, [])) => {
|
||||
let cache_key = (head.clone(), mana);
|
||||
@ -290,7 +312,7 @@ fn resolve_candidates<'t>(
|
||||
cache.insert(cache_key, candidates.clone());
|
||||
Ok(candidates)
|
||||
}
|
||||
},
|
||||
}
|
||||
Some((head, tail)) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
|
||||
@ -313,7 +335,7 @@ fn resolve_candidates<'t>(
|
||||
}
|
||||
|
||||
Ok(candidates)
|
||||
},
|
||||
}
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
@ -323,9 +345,9 @@ fn resolve_candidates<'t>(
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use super::super::initial::Initial;
|
||||
use super::super::test::TestContext;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn initial_placeholder_no_facets() {
|
||||
@ -348,13 +370,23 @@ mod test {
|
||||
#[test]
|
||||
fn initial_query_tree_no_facets() {
|
||||
let context = TestContext::default();
|
||||
let query_tree = Operation::Or(false, vec![
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }),
|
||||
])
|
||||
]);
|
||||
let query_tree = Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::tolerant(1, "world".to_string()),
|
||||
}),
|
||||
])],
|
||||
);
|
||||
|
||||
let facet_candidates = None;
|
||||
|
||||
@ -369,13 +401,23 @@ mod test {
|
||||
& context.word_docids("this").unwrap().unwrap()
|
||||
& context.word_docids("world").unwrap().unwrap();
|
||||
let expected_1 = CriterionResult {
|
||||
query_tree: Some(Operation::Or(false, vec![
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
|
||||
]),
|
||||
])),
|
||||
query_tree: Some(Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("world".to_string()),
|
||||
}),
|
||||
])],
|
||||
)),
|
||||
candidates: Some(candidates_1.clone()),
|
||||
bucket_candidates: Some(candidates_1),
|
||||
filtered_candidates: None,
|
||||
@ -383,22 +425,37 @@ mod test {
|
||||
|
||||
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
|
||||
|
||||
let candidates_2 = (
|
||||
context.word_docids("split").unwrap().unwrap()
|
||||
& context.word_docids("this").unwrap().unwrap()
|
||||
& context.word_docids("word").unwrap().unwrap()
|
||||
) - context.word_docids("world").unwrap().unwrap();
|
||||
let candidates_2 = (context.word_docids("split").unwrap().unwrap()
|
||||
& context.word_docids("this").unwrap().unwrap()
|
||||
& context.word_docids("word").unwrap().unwrap())
|
||||
- context.word_docids("world").unwrap().unwrap();
|
||||
let expected_2 = CriterionResult {
|
||||
query_tree: Some(Operation::Or(false, vec![
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
|
||||
Operation::Or(false, vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
|
||||
]),
|
||||
]),
|
||||
])),
|
||||
query_tree: Some(Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Or(
|
||||
false,
|
||||
vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact_with_typo(1, "word".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("world".to_string()),
|
||||
}),
|
||||
],
|
||||
),
|
||||
])],
|
||||
)),
|
||||
candidates: Some(candidates_2.clone()),
|
||||
bucket_candidates: Some(candidates_2),
|
||||
filtered_candidates: None,
|
||||
@ -437,17 +494,26 @@ mod test {
|
||||
#[test]
|
||||
fn initial_query_tree_with_facets() {
|
||||
let context = TestContext::default();
|
||||
let query_tree = Operation::Or(false, vec![
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }),
|
||||
])
|
||||
]);
|
||||
let query_tree = Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::tolerant(1, "world".to_string()),
|
||||
}),
|
||||
])],
|
||||
);
|
||||
|
||||
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
|
||||
|
||||
|
||||
let mut criterion_parameters = CriterionParameters {
|
||||
wdcache: &mut WordDerivationsCache::new(),
|
||||
excluded_candidates: &RoaringBitmap::new(),
|
||||
@ -459,13 +525,23 @@ mod test {
|
||||
& context.word_docids("this").unwrap().unwrap()
|
||||
& context.word_docids("world").unwrap().unwrap();
|
||||
let expected_1 = CriterionResult {
|
||||
query_tree: Some(Operation::Or(false, vec![
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
|
||||
]),
|
||||
])),
|
||||
query_tree: Some(Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("world".to_string()),
|
||||
}),
|
||||
])],
|
||||
)),
|
||||
candidates: Some(&candidates_1 & &facet_candidates),
|
||||
bucket_candidates: Some(&candidates_1 & &facet_candidates),
|
||||
filtered_candidates: None,
|
||||
@ -473,22 +549,37 @@ mod test {
|
||||
|
||||
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
|
||||
|
||||
let candidates_2 = (
|
||||
context.word_docids("split").unwrap().unwrap()
|
||||
& context.word_docids("this").unwrap().unwrap()
|
||||
& context.word_docids("word").unwrap().unwrap()
|
||||
) - context.word_docids("world").unwrap().unwrap();
|
||||
let candidates_2 = (context.word_docids("split").unwrap().unwrap()
|
||||
& context.word_docids("this").unwrap().unwrap()
|
||||
& context.word_docids("word").unwrap().unwrap())
|
||||
- context.word_docids("world").unwrap().unwrap();
|
||||
let expected_2 = CriterionResult {
|
||||
query_tree: Some(Operation::Or(false, vec![
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
|
||||
Operation::Or(false, vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
|
||||
]),
|
||||
]),
|
||||
])),
|
||||
query_tree: Some(Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Or(
|
||||
false,
|
||||
vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact_with_typo(1, "word".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("world".to_string()),
|
||||
}),
|
||||
],
|
||||
),
|
||||
])],
|
||||
)),
|
||||
candidates: Some(&candidates_2 & &facet_candidates),
|
||||
bucket_candidates: Some(&candidates_2 & &facet_candidates),
|
||||
filtered_candidates: None,
|
||||
|
@ -3,9 +3,9 @@ use std::mem::take;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::Result;
|
||||
use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree};
|
||||
|
||||
pub struct Words<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
@ -44,11 +44,12 @@ impl<'t> Criterion for Words<'t> {
|
||||
Some(query_tree) => {
|
||||
let candidates = match self.candidates.as_mut() {
|
||||
Some(allowed_candidates) => {
|
||||
let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
|
||||
let mut candidates =
|
||||
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
|
||||
candidates &= &*allowed_candidates;
|
||||
*allowed_candidates -= &candidates;
|
||||
Some(candidates)
|
||||
},
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
@ -63,29 +64,38 @@ impl<'t> Criterion for Words<'t> {
|
||||
filtered_candidates: self.filtered_candidates.clone(),
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
None => {
|
||||
match self.parent.next(params)? {
|
||||
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
|
||||
self.query_trees = explode_query_tree(query_tree);
|
||||
self.candidates = candidates;
|
||||
self.filtered_candidates = filtered_candidates;
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
self.query_trees = explode_query_tree(query_tree);
|
||||
self.candidates = candidates;
|
||||
self.filtered_candidates = filtered_candidates;
|
||||
|
||||
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) {
|
||||
self.bucket_candidates =
|
||||
match (self.bucket_candidates.take(), bucket_candidates) {
|
||||
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
|
||||
(self_bc, parent_bc) => self_bc.or(parent_bc),
|
||||
};
|
||||
},
|
||||
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
@ -3,11 +3,11 @@ use std::mem::size_of;
|
||||
use heed::types::ByteSlice;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{Distinct, DocIter};
|
||||
use crate::error::InternalError;
|
||||
use crate::heed_codec::facet::*;
|
||||
use crate::index::db_name;
|
||||
use crate::{DocumentId, FieldId, Index, Result};
|
||||
use super::{Distinct, DocIter};
|
||||
|
||||
const FID_SIZE: usize = size_of::<FieldId>();
|
||||
const DOCID_SIZE: usize = size_of::<DocumentId>();
|
||||
@ -28,11 +28,7 @@ pub struct FacetDistinct<'a> {
|
||||
|
||||
impl<'a> FacetDistinct<'a> {
|
||||
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
|
||||
Self {
|
||||
distinct,
|
||||
index,
|
||||
txn,
|
||||
}
|
||||
Self { distinct, index, txn }
|
||||
}
|
||||
}
|
||||
|
||||
@ -47,16 +43,12 @@ pub struct FacetDistinctIter<'a> {
|
||||
|
||||
impl<'a> FacetDistinctIter<'a> {
|
||||
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
|
||||
self.index
|
||||
.facet_id_string_docids
|
||||
.get(self.txn, &(self.distinct, key))
|
||||
self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key))
|
||||
}
|
||||
|
||||
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
|
||||
// get facet docids on level 0
|
||||
self.index
|
||||
.facet_id_f64_docids
|
||||
.get(self.txn, &(self.distinct, 0, key, key))
|
||||
self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key))
|
||||
}
|
||||
|
||||
fn distinct_string(&mut self, id: DocumentId) -> Result<()> {
|
||||
@ -64,9 +56,8 @@ impl<'a> FacetDistinctIter<'a> {
|
||||
|
||||
for item in iter {
|
||||
let ((_, _, value), _) = item?;
|
||||
let facet_docids = self
|
||||
.facet_string_docids(value)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
let facet_docids =
|
||||
self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::FACET_ID_STRING_DOCIDS,
|
||||
key: None,
|
||||
})?;
|
||||
@ -83,9 +74,8 @@ impl<'a> FacetDistinctIter<'a> {
|
||||
|
||||
for item in iter {
|
||||
let ((_, _, value), _) = item?;
|
||||
let facet_docids = self
|
||||
.facet_number_docids(value)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
let facet_docids =
|
||||
self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::FACET_ID_F64_DOCIDS,
|
||||
key: None,
|
||||
})?;
|
||||
|
@ -1,11 +1,11 @@
|
||||
mod facet_distinct;
|
||||
mod noop_distinct;
|
||||
|
||||
pub use facet_distinct::FacetDistinct;
|
||||
pub use noop_distinct::NoopDistinct;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{DocumentId, Result};
|
||||
pub use facet_distinct::FacetDistinct;
|
||||
pub use noop_distinct::NoopDistinct;
|
||||
|
||||
/// A trait implemented by document iterators that are returned by calls to `Distinct::distinct`.
|
||||
/// It provides a way to get back the ownership to the excluded set.
|
||||
@ -29,13 +29,15 @@ mod test {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::{seq::SliceRandom, Rng};
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::Rng;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::index::{Index, tests::TempIndex};
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::index::Index;
|
||||
use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
|
||||
use crate::{BEU32, FieldId, DocumentId};
|
||||
use crate::{DocumentId, FieldId, BEU32};
|
||||
|
||||
static JSON: Lazy<Value> = Lazy::new(generate_json);
|
||||
|
||||
@ -89,9 +91,7 @@ mod test {
|
||||
addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
||||
addition.update_format(UpdateFormat::Json);
|
||||
|
||||
addition
|
||||
.execute(JSON.to_string().as_bytes(), |_, _| ())
|
||||
.unwrap();
|
||||
addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap();
|
||||
|
||||
let fields_map = index.fields_ids_map(&txn).unwrap();
|
||||
let fid = fields_map.id(&distinct).unwrap();
|
||||
@ -103,13 +103,12 @@ mod test {
|
||||
(index, fid, map)
|
||||
}
|
||||
|
||||
|
||||
/// Checks that all the candidates are distinct, and returns the candidates number.
|
||||
pub(crate) fn validate_distinct_candidates(
|
||||
candidates: impl Iterator<Item = crate::Result<DocumentId>>,
|
||||
distinct: FieldId,
|
||||
index: &Index,
|
||||
) -> usize {
|
||||
) -> usize {
|
||||
fn test(seen: &mut HashSet<String>, value: &Value) {
|
||||
match value {
|
||||
Value::Null | Value::Object(_) | Value::Bool(_) => (),
|
||||
@ -117,7 +116,7 @@ mod test {
|
||||
let s = value.to_string();
|
||||
assert!(seen.insert(s));
|
||||
}
|
||||
Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))}
|
||||
Value::Array(values) => values.into_iter().for_each(|value| test(seen, value)),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
use roaring::{RoaringBitmap, bitmap::IntoIter};
|
||||
use roaring::bitmap::IntoIter;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{Distinct, DocIter};
|
||||
use crate::{DocumentId, Result};
|
||||
use super::{DocIter, Distinct};
|
||||
|
||||
/// A distinct implementer that does not perform any distinct,
|
||||
/// and simply returns an iterator to the candidates.
|
||||
@ -30,10 +31,7 @@ impl Distinct for NoopDistinct {
|
||||
type Iter = NoopDistinctIter;
|
||||
|
||||
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
|
||||
NoopDistinctIter {
|
||||
candidates: candidates.into_iter(),
|
||||
excluded,
|
||||
}
|
||||
NoopDistinctIter { candidates: candidates.into_iter(), excluded }
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,16 +1,16 @@
|
||||
use std::collections::{HashSet, BTreeMap};
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::ops::Bound::Unbounded;
|
||||
use std::{cmp, fmt};
|
||||
|
||||
use heed::{Database, BytesDecode};
|
||||
use heed::types::{ByteSlice, Unit};
|
||||
use heed::{BytesDecode, Database};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::FieldIdMapMissingEntry;
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::FacetValueStringCodec;
|
||||
use crate::search::facet::{FacetIter, FacetRange};
|
||||
use crate::{Index, FieldId, DocumentId, Result};
|
||||
use crate::{DocumentId, FieldId, Index, Result};
|
||||
|
||||
/// The default number of values by facets that will
|
||||
/// be fetched from the key-value store.
|
||||
@ -43,7 +43,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn facets<I: IntoIterator<Item=A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self {
|
||||
pub fn facets<I: IntoIterator<Item = A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self {
|
||||
self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect());
|
||||
self
|
||||
}
|
||||
@ -66,8 +66,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
facet_type: FacetType,
|
||||
candidates: &RoaringBitmap,
|
||||
distribution: &mut BTreeMap<String, u64>,
|
||||
) -> heed::Result<()>
|
||||
{
|
||||
) -> heed::Result<()> {
|
||||
fn fetch_facet_values<'t, KC, K: 't>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<KC, Unit>,
|
||||
@ -102,7 +101,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
FacetType::Number => {
|
||||
let db = self.index.field_id_docid_facet_f64s;
|
||||
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
|
||||
},
|
||||
}
|
||||
FacetType::String => {
|
||||
let db = self.index.field_id_docid_facet_strings;
|
||||
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
|
||||
@ -117,11 +116,9 @@ impl<'a> FacetDistribution<'a> {
|
||||
field_id: FieldId,
|
||||
candidates: &RoaringBitmap,
|
||||
distribution: &mut BTreeMap<String, u64>,
|
||||
) -> heed::Result<()>
|
||||
{
|
||||
let iter = FacetIter::new_non_reducing(
|
||||
self.rtxn, self.index, field_id, candidates.clone(),
|
||||
)?;
|
||||
) -> heed::Result<()> {
|
||||
let iter =
|
||||
FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
|
||||
|
||||
for result in iter {
|
||||
let (value, mut docids) = result?;
|
||||
@ -142,8 +139,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
fn facet_values_from_raw_facet_database(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<BTreeMap<String, u64>>
|
||||
{
|
||||
) -> heed::Result<BTreeMap<String, u64>> {
|
||||
let mut distribution = BTreeMap::new();
|
||||
|
||||
let db = self.index.facet_id_f64_docids;
|
||||
@ -157,7 +153,8 @@ impl<'a> FacetDistribution<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
let iter = self.index
|
||||
let iter = self
|
||||
.index
|
||||
.facet_id_string_docids
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(self.rtxn, &[field_id])?
|
||||
@ -182,11 +179,30 @@ impl<'a> FacetDistribution<'a> {
|
||||
// to those candidates. We also enter here for facet strings for performance reasons.
|
||||
let mut distribution = BTreeMap::new();
|
||||
if candidates.len() <= CANDIDATES_THRESHOLD {
|
||||
self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?;
|
||||
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
|
||||
self.facet_distribution_from_documents(
|
||||
field_id,
|
||||
Number,
|
||||
candidates,
|
||||
&mut distribution,
|
||||
)?;
|
||||
self.facet_distribution_from_documents(
|
||||
field_id,
|
||||
String,
|
||||
candidates,
|
||||
&mut distribution,
|
||||
)?;
|
||||
} else {
|
||||
self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?;
|
||||
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
|
||||
self.facet_numbers_distribution_from_facet_levels(
|
||||
field_id,
|
||||
candidates,
|
||||
&mut distribution,
|
||||
)?;
|
||||
self.facet_distribution_from_documents(
|
||||
field_id,
|
||||
String,
|
||||
candidates,
|
||||
&mut distribution,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(distribution)
|
||||
@ -201,10 +217,11 @@ impl<'a> FacetDistribution<'a> {
|
||||
|
||||
let mut distribution = BTreeMap::new();
|
||||
for name in filterable_fields {
|
||||
let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name.clone(),
|
||||
process: "FacetDistribution::execute",
|
||||
})?;
|
||||
let fid =
|
||||
fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name.clone(),
|
||||
process: "FacetDistribution::execute",
|
||||
})?;
|
||||
let values = self.facet_values(fid)?;
|
||||
distribution.insert(name, values);
|
||||
}
|
||||
@ -215,13 +232,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
|
||||
impl fmt::Debug for FacetDistribution<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let FacetDistribution {
|
||||
facets,
|
||||
candidates,
|
||||
max_values_by_facet,
|
||||
rtxn: _,
|
||||
index: _,
|
||||
} = self;
|
||||
let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self;
|
||||
|
||||
f.debug_struct("FacetDistribution")
|
||||
.field("facets", facets)
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Bound::{self, Included, Excluded};
|
||||
use std::ops::Bound::{self, Excluded, Included};
|
||||
use std::result::Result as StdResult;
|
||||
use std::str::FromStr;
|
||||
|
||||
@ -12,16 +12,13 @@ use pest::iterators::{Pair, Pairs};
|
||||
use pest::Parser;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::UserError;
|
||||
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
|
||||
use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec, Result};
|
||||
|
||||
use super::FacetRange;
|
||||
use super::parser::Rule;
|
||||
use super::parser::{PREC_CLIMBER, FilterParser};
|
||||
|
||||
use self::FilterCondition::*;
|
||||
use self::Operator::*;
|
||||
use super::parser::{FilterParser, Rule, PREC_CLIMBER};
|
||||
use super::FacetRange;
|
||||
use crate::error::UserError;
|
||||
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec};
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum Operator {
|
||||
@ -39,13 +36,13 @@ impl Operator {
|
||||
/// an OR operation for the between case (i.e. `TO`).
|
||||
fn negate(self) -> (Self, Option<Self>) {
|
||||
match self {
|
||||
GreaterThan(n) => (LowerThanOrEqual(n), None),
|
||||
GreaterThan(n) => (LowerThanOrEqual(n), None),
|
||||
GreaterThanOrEqual(n) => (LowerThan(n), None),
|
||||
Equal(n, s) => (NotEqual(n, s), None),
|
||||
NotEqual(n, s) => (Equal(n, s), None),
|
||||
LowerThan(n) => (GreaterThanOrEqual(n), None),
|
||||
LowerThanOrEqual(n) => (GreaterThan(n), None),
|
||||
Between(n, m) => (LowerThan(n), Some(GreaterThan(m))),
|
||||
Equal(n, s) => (NotEqual(n, s), None),
|
||||
NotEqual(n, s) => (Equal(n, s), None),
|
||||
LowerThan(n) => (GreaterThanOrEqual(n), None),
|
||||
LowerThanOrEqual(n) => (GreaterThan(n), None),
|
||||
Between(n, m) => (LowerThan(n), Some(GreaterThan(m))),
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -63,10 +60,11 @@ impl FilterCondition {
|
||||
index: &Index,
|
||||
array: I,
|
||||
) -> Result<Option<FilterCondition>>
|
||||
where I: IntoIterator<Item=Either<J, B>>,
|
||||
J: IntoIterator<Item=A>,
|
||||
A: AsRef<str>,
|
||||
B: AsRef<str>,
|
||||
where
|
||||
I: IntoIterator<Item = Either<J, B>>,
|
||||
J: IntoIterator<Item = A>,
|
||||
A: AsRef<str>,
|
||||
B: AsRef<str>,
|
||||
{
|
||||
let mut ands = None;
|
||||
|
||||
@ -88,7 +86,7 @@ impl FilterCondition {
|
||||
None => Some(rule),
|
||||
};
|
||||
}
|
||||
},
|
||||
}
|
||||
Either::Right(rule) => {
|
||||
let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?;
|
||||
ands = match ands.take() {
|
||||
@ -106,11 +104,11 @@ impl FilterCondition {
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
expression: &str,
|
||||
) -> Result<FilterCondition>
|
||||
{
|
||||
) -> Result<FilterCondition> {
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let filterable_fields = index.filterable_fields_ids(rtxn)?;
|
||||
let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?;
|
||||
let lexed =
|
||||
FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?;
|
||||
FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed)
|
||||
}
|
||||
|
||||
@ -118,8 +116,7 @@ impl FilterCondition {
|
||||
fim: &FieldsIdsMap,
|
||||
ff: &HashSet<FieldId>,
|
||||
expression: Pairs<Rule>,
|
||||
) -> Result<Self>
|
||||
{
|
||||
) -> Result<Self> {
|
||||
PREC_CLIMBER.climb(
|
||||
expression,
|
||||
|pair: Pair<Rule>| match pair.as_rule() {
|
||||
@ -135,12 +132,10 @@ impl FilterCondition {
|
||||
Rule::term => Self::from_pairs(fim, ff, pair.into_inner()),
|
||||
_ => unreachable!(),
|
||||
},
|
||||
|lhs: Result<Self>, op: Pair<Rule>, rhs: Result<Self>| {
|
||||
match op.as_rule() {
|
||||
Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))),
|
||||
Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|lhs: Result<Self>, op: Pair<Rule>, rhs: Result<Self>| match op.as_rule() {
|
||||
Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))),
|
||||
Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))),
|
||||
_ => unreachable!(),
|
||||
},
|
||||
)
|
||||
}
|
||||
@ -160,8 +155,7 @@ impl FilterCondition {
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
filterable_fields: &HashSet<FieldId>,
|
||||
item: Pair<Rule>,
|
||||
) -> Result<FilterCondition>
|
||||
{
|
||||
) -> Result<FilterCondition> {
|
||||
let mut items = item.into_inner();
|
||||
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
|
||||
.map_err(UserError::InvalidFilterAttribute)?;
|
||||
@ -179,8 +173,7 @@ impl FilterCondition {
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
filterable_fields: &HashSet<FieldId>,
|
||||
item: Pair<Rule>,
|
||||
) -> Result<FilterCondition>
|
||||
{
|
||||
) -> Result<FilterCondition> {
|
||||
let mut items = item.into_inner();
|
||||
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
|
||||
.map_err(UserError::InvalidFilterAttribute)?;
|
||||
@ -196,8 +189,7 @@ impl FilterCondition {
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
filterable_fields: &HashSet<FieldId>,
|
||||
item: Pair<Rule>,
|
||||
) -> Result<FilterCondition>
|
||||
{
|
||||
) -> Result<FilterCondition> {
|
||||
let mut items = item.into_inner();
|
||||
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
|
||||
.map_err(UserError::InvalidFilterAttribute)?;
|
||||
@ -213,8 +205,7 @@ impl FilterCondition {
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
filterable_fields: &HashSet<FieldId>,
|
||||
item: Pair<Rule>,
|
||||
) -> Result<FilterCondition>
|
||||
{
|
||||
) -> Result<FilterCondition> {
|
||||
let mut items = item.into_inner();
|
||||
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
|
||||
.map_err(UserError::InvalidFilterAttribute)?;
|
||||
@ -230,8 +221,7 @@ impl FilterCondition {
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
filterable_fields: &HashSet<FieldId>,
|
||||
item: Pair<Rule>,
|
||||
) -> Result<FilterCondition>
|
||||
{
|
||||
) -> Result<FilterCondition> {
|
||||
let mut items = item.into_inner();
|
||||
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
|
||||
.map_err(UserError::InvalidFilterAttribute)?;
|
||||
@ -247,8 +237,7 @@ impl FilterCondition {
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
filterable_fields: &HashSet<FieldId>,
|
||||
item: Pair<Rule>,
|
||||
) -> Result<FilterCondition>
|
||||
{
|
||||
) -> Result<FilterCondition> {
|
||||
let mut items = item.into_inner();
|
||||
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
|
||||
.map_err(UserError::InvalidFilterAttribute)?;
|
||||
@ -272,13 +261,14 @@ impl FilterCondition {
|
||||
left: Bound<f64>,
|
||||
right: Bound<f64>,
|
||||
output: &mut RoaringBitmap,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
match (left, right) {
|
||||
// If the request is an exact value we must go directly to the deepest level.
|
||||
(Included(l), Included(r)) if l == r && level > 0 => {
|
||||
return Self::explore_facet_number_levels(rtxn, db, field_id, 0, left, right, output);
|
||||
},
|
||||
return Self::explore_facet_number_levels(
|
||||
rtxn, db, field_id, 0, left, right, output,
|
||||
);
|
||||
}
|
||||
// lower TO upper when lower > upper must return no result
|
||||
(Included(l), Included(r)) if l > r => return Ok(()),
|
||||
(Included(l), Excluded(r)) if l >= r => return Ok(()),
|
||||
@ -301,7 +291,9 @@ impl FilterCondition {
|
||||
debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len());
|
||||
output.union_with(&docids);
|
||||
// We save the leftmost and rightmost bounds we actually found at this level.
|
||||
if i == 0 { left_found = Some(l); }
|
||||
if i == 0 {
|
||||
left_found = Some(l);
|
||||
}
|
||||
right_found = Some(r);
|
||||
}
|
||||
|
||||
@ -318,20 +310,50 @@ impl FilterCondition {
|
||||
// If the bound is satisfied we avoid calling this function again.
|
||||
if !matches!(left, Included(l) if l == left_found) {
|
||||
let sub_right = Excluded(left_found);
|
||||
debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level);
|
||||
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, sub_right, output)?;
|
||||
debug!(
|
||||
"calling left with {:?} to {:?} (level {})",
|
||||
left, sub_right, deeper_level
|
||||
);
|
||||
Self::explore_facet_number_levels(
|
||||
rtxn,
|
||||
db,
|
||||
field_id,
|
||||
deeper_level,
|
||||
left,
|
||||
sub_right,
|
||||
output,
|
||||
)?;
|
||||
}
|
||||
if !matches!(right, Included(r) if r == right_found) {
|
||||
let sub_left = Excluded(right_found);
|
||||
debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level);
|
||||
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, sub_left, right, output)?;
|
||||
debug!(
|
||||
"calling right with {:?} to {:?} (level {})",
|
||||
sub_left, right, deeper_level
|
||||
);
|
||||
Self::explore_facet_number_levels(
|
||||
rtxn,
|
||||
db,
|
||||
field_id,
|
||||
deeper_level,
|
||||
sub_left,
|
||||
right,
|
||||
output,
|
||||
)?;
|
||||
}
|
||||
},
|
||||
}
|
||||
None => {
|
||||
// If we found nothing at this level it means that we must find
|
||||
// the same bounds but at a deeper, more precise level.
|
||||
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, right, output)?;
|
||||
},
|
||||
Self::explore_facet_number_levels(
|
||||
rtxn,
|
||||
db,
|
||||
field_id,
|
||||
deeper_level,
|
||||
left,
|
||||
right,
|
||||
output,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -344,27 +366,34 @@ impl FilterCondition {
|
||||
strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
operator: &Operator,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
) -> Result<RoaringBitmap> {
|
||||
// Make sure we always bound the ranges with the field id and the level,
|
||||
// as the facets values are all in the same database and prefixed by the
|
||||
// field id and the level.
|
||||
let (left, right) = match operator {
|
||||
GreaterThan(val) => (Excluded(*val), Included(f64::MAX)),
|
||||
GreaterThan(val) => (Excluded(*val), Included(f64::MAX)),
|
||||
GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)),
|
||||
Equal(number, string) => {
|
||||
Equal(number, string) => {
|
||||
let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
|
||||
let number_docids = match number {
|
||||
Some(n) => {
|
||||
let n = Included(*n);
|
||||
let mut output = RoaringBitmap::new();
|
||||
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?;
|
||||
Self::explore_facet_number_levels(
|
||||
rtxn,
|
||||
numbers_db,
|
||||
field_id,
|
||||
0,
|
||||
n,
|
||||
n,
|
||||
&mut output,
|
||||
)?;
|
||||
output
|
||||
},
|
||||
}
|
||||
None => RoaringBitmap::new(),
|
||||
};
|
||||
return Ok(string_docids | number_docids);
|
||||
},
|
||||
}
|
||||
NotEqual(number, string) => {
|
||||
let all_numbers_ids = if number.is_some() {
|
||||
index.number_faceted_documents_ids(rtxn, field_id)?
|
||||
@ -373,12 +402,14 @@ impl FilterCondition {
|
||||
};
|
||||
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
|
||||
let operator = Equal(*number, string.clone());
|
||||
let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?;
|
||||
let docids = Self::evaluate_operator(
|
||||
rtxn, index, numbers_db, strings_db, field_id, &operator,
|
||||
)?;
|
||||
return Ok((all_numbers_ids | all_strings_ids) - docids);
|
||||
},
|
||||
LowerThan(val) => (Included(f64::MIN), Excluded(*val)),
|
||||
}
|
||||
LowerThan(val) => (Included(f64::MIN), Excluded(*val)),
|
||||
LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)),
|
||||
Between(left, right) => (Included(*left), Included(*right)),
|
||||
Between(left, right) => (Included(*left), Included(*right)),
|
||||
};
|
||||
|
||||
// Ask for the biggest value that can exist for this specific field, if it exists
|
||||
@ -391,36 +422,39 @@ impl FilterCondition {
|
||||
match biggest_level {
|
||||
Some(level) => {
|
||||
let mut output = RoaringBitmap::new();
|
||||
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?;
|
||||
Self::explore_facet_number_levels(
|
||||
rtxn,
|
||||
numbers_db,
|
||||
field_id,
|
||||
level,
|
||||
left,
|
||||
right,
|
||||
&mut output,
|
||||
)?;
|
||||
Ok(output)
|
||||
},
|
||||
}
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn evaluate(
|
||||
&self,
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
|
||||
let numbers_db = index.facet_id_f64_docids;
|
||||
let strings_db = index.facet_id_string_docids;
|
||||
|
||||
match self {
|
||||
Operator(fid, op) => {
|
||||
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op)
|
||||
},
|
||||
}
|
||||
Or(lhs, rhs) => {
|
||||
let lhs = lhs.evaluate(rtxn, index)?;
|
||||
let rhs = rhs.evaluate(rtxn, index)?;
|
||||
Ok(lhs | rhs)
|
||||
},
|
||||
}
|
||||
And(lhs, rhs) => {
|
||||
let lhs = lhs.evaluate(rtxn, index)?;
|
||||
let rhs = rhs.evaluate(rtxn, index)?;
|
||||
Ok(lhs & rhs)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -434,23 +468,24 @@ fn field_id(
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
filterable_fields: &HashSet<FieldId>,
|
||||
items: &mut Pairs<Rule>,
|
||||
) -> StdResult<FieldId, PestError<Rule>>
|
||||
{
|
||||
) -> StdResult<FieldId, PestError<Rule>> {
|
||||
// lexing ensures that we at least have a key
|
||||
let key = items.next().unwrap();
|
||||
|
||||
let field_id = match fields_ids_map.id(key.as_str()) {
|
||||
Some(field_id) => field_id,
|
||||
None => return Err(PestError::new_from_span(
|
||||
ErrorVariant::CustomError {
|
||||
message: format!(
|
||||
"attribute `{}` not found, available attributes are: {}",
|
||||
key.as_str(),
|
||||
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "),
|
||||
),
|
||||
},
|
||||
key.as_span(),
|
||||
)),
|
||||
None => {
|
||||
return Err(PestError::new_from_span(
|
||||
ErrorVariant::CustomError {
|
||||
message: format!(
|
||||
"attribute `{}` not found, available attributes are: {}",
|
||||
key.as_str(),
|
||||
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "),
|
||||
),
|
||||
},
|
||||
key.as_span(),
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
if !filterable_fields.contains(&field_id) {
|
||||
@ -459,9 +494,11 @@ fn field_id(
|
||||
message: format!(
|
||||
"attribute `{}` is not filterable, available filterable attributes are: {}",
|
||||
key.as_str(),
|
||||
filterable_fields.iter().flat_map(|id| {
|
||||
fields_ids_map.name(*id)
|
||||
}).collect::<Vec<_>>().join(", "),
|
||||
filterable_fields
|
||||
.iter()
|
||||
.flat_map(|id| { fields_ids_map.name(*id) })
|
||||
.collect::<Vec<_>>()
|
||||
.join(", "),
|
||||
),
|
||||
},
|
||||
key.as_span(),
|
||||
@ -476,8 +513,9 @@ fn field_id(
|
||||
///
|
||||
/// Returns the parsing error associated with the span if the conversion fails.
|
||||
fn pest_parse<T>(pair: Pair<Rule>) -> (StdResult<T, pest::error::Error<Rule>>, String)
|
||||
where T: FromStr,
|
||||
T::Err: ToString,
|
||||
where
|
||||
T: FromStr,
|
||||
T::Err: ToString,
|
||||
{
|
||||
let result = match pair.as_str().parse::<T>() {
|
||||
Ok(value) => Ok(value),
|
||||
@ -492,11 +530,12 @@ where T: FromStr,
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::update::Settings;
|
||||
use big_s::S;
|
||||
use heed::EnvOpenOptions;
|
||||
use maplit::hashset;
|
||||
use big_s::S;
|
||||
|
||||
use super::*;
|
||||
use crate::update::Settings;
|
||||
|
||||
#[test]
|
||||
fn string() {
|
||||
@ -508,7 +547,7 @@ mod tests {
|
||||
// Set the filterable fields to be the channel.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, 0);
|
||||
builder.set_filterable_fields(hashset!{ S("channel") });
|
||||
builder.set_filterable_fields(hashset! { S("channel") });
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
@ -537,7 +576,7 @@ mod tests {
|
||||
// Set the filterable fields to be the timestamp.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, 0);
|
||||
builder.set_filterable_fields(hashset!{ "timestamp".into() });
|
||||
builder.set_filterable_fields(hashset! { "timestamp".into() });
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
@ -548,10 +587,8 @@ mod tests {
|
||||
assert_eq!(condition, expected);
|
||||
|
||||
let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
|
||||
let expected = Or(
|
||||
Box::new(Operator(0, LowerThan(22.0))),
|
||||
Box::new(Operator(0, GreaterThan(44.0))),
|
||||
);
|
||||
let expected =
|
||||
Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0))));
|
||||
assert_eq!(condition, expected);
|
||||
}
|
||||
|
||||
@ -566,29 +603,33 @@ mod tests {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, 0);
|
||||
builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
|
||||
builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") });
|
||||
builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") });
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Test that the facet condition is correctly generated.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let condition = FilterCondition::from_str(
|
||||
&rtxn, &index,
|
||||
&rtxn,
|
||||
&index,
|
||||
"channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)",
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
let expected = Or(
|
||||
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
|
||||
Box::new(And(
|
||||
Box::new(Operator(1, Between(22.0, 44.0))),
|
||||
Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))),
|
||||
))
|
||||
)),
|
||||
);
|
||||
assert_eq!(condition, expected);
|
||||
|
||||
let condition = FilterCondition::from_str(
|
||||
&rtxn, &index,
|
||||
&rtxn,
|
||||
&index,
|
||||
"channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)",
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
let expected = Or(
|
||||
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
|
||||
Box::new(Or(
|
||||
@ -613,20 +654,28 @@ mod tests {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, 0);
|
||||
builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
|
||||
builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") });
|
||||
builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") });
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Test that the facet condition is correctly generated.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let condition = FilterCondition::from_array(
|
||||
&rtxn, &index,
|
||||
vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])],
|
||||
).unwrap().unwrap();
|
||||
&rtxn,
|
||||
&index,
|
||||
vec![
|
||||
Either::Right("channel = gotaga"),
|
||||
Either::Left(vec!["timestamp = 44", "channel != ponce"]),
|
||||
],
|
||||
)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
let expected = FilterCondition::from_str(
|
||||
&rtxn, &index,
|
||||
&rtxn,
|
||||
&index,
|
||||
"channel = gotaga AND (timestamp = 44 OR channel != ponce)",
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(condition, expected);
|
||||
}
|
||||
}
|
||||
|
@ -1,20 +1,19 @@
|
||||
use std::ops::Bound::{self, Included, Excluded, Unbounded};
|
||||
use std::ops::Bound::{self, Excluded, Included, Unbounded};
|
||||
|
||||
use either::Either::{self, Left, Right};
|
||||
use heed::types::{DecodeIgnore, ByteSlice};
|
||||
use heed::{Database, RoRange, RoRevRange, LazyDecode};
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{Database, LazyDecode, RoRange, RoRevRange};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::heed_codec::facet::FacetLevelValueF64Codec;
|
||||
use crate::{Index, FieldId};
|
||||
|
||||
pub use self::facet_distribution::FacetDistribution;
|
||||
pub use self::filter_condition::{FilterCondition, Operator};
|
||||
pub(crate) use self::parser::Rule as ParserRule;
|
||||
use crate::heed_codec::facet::FacetLevelValueF64Codec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::{FieldId, Index};
|
||||
|
||||
mod filter_condition;
|
||||
mod facet_distribution;
|
||||
mod filter_condition;
|
||||
mod parser;
|
||||
|
||||
pub struct FacetRange<'t> {
|
||||
@ -30,8 +29,7 @@ impl<'t> FacetRange<'t> {
|
||||
level: u8,
|
||||
left: Bound<f64>,
|
||||
right: Bound<f64>,
|
||||
) -> heed::Result<FacetRange<'t>>
|
||||
{
|
||||
) -> heed::Result<FacetRange<'t>> {
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, level, left, f64::MIN)),
|
||||
Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
|
||||
@ -62,7 +60,7 @@ impl<'t> Iterator for FacetRange<'t> {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
@ -82,8 +80,7 @@ impl<'t> FacetRevRange<'t> {
|
||||
level: u8,
|
||||
left: Bound<f64>,
|
||||
right: Bound<f64>,
|
||||
) -> heed::Result<FacetRevRange<'t>>
|
||||
{
|
||||
) -> heed::Result<FacetRevRange<'t>> {
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, level, left, f64::MIN)),
|
||||
Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
|
||||
@ -114,7 +111,7 @@ impl<'t> Iterator for FacetRevRange<'t> {
|
||||
}
|
||||
}
|
||||
continue;
|
||||
},
|
||||
}
|
||||
Some(Err(e)) => return Some(Err(e)),
|
||||
None => return None,
|
||||
}
|
||||
@ -139,11 +136,11 @@ impl<'t> FacetIter<'t> {
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetIter<'t>>
|
||||
{
|
||||
) -> heed::Result<FacetIter<'t>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let highest_iter =
|
||||
FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Left(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
||||
}
|
||||
@ -156,11 +153,11 @@ impl<'t> FacetIter<'t> {
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetIter<'t>>
|
||||
{
|
||||
) -> heed::Result<FacetIter<'t>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let highest_iter =
|
||||
FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Right(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
||||
}
|
||||
@ -174,11 +171,11 @@ impl<'t> FacetIter<'t> {
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetIter<'t>>
|
||||
{
|
||||
) -> heed::Result<FacetIter<'t>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let highest_iter =
|
||||
FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Left(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false })
|
||||
}
|
||||
@ -187,12 +184,13 @@ impl<'t> FacetIter<'t> {
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<FacetLevelValueF64Codec, X>,
|
||||
fid: FieldId,
|
||||
) -> heed::Result<Option<u8>>
|
||||
{
|
||||
let level = db.remap_types::<ByteSlice, DecodeIgnore>()
|
||||
) -> heed::Result<Option<u8>> {
|
||||
let level = db
|
||||
.remap_types::<ByteSlice, DecodeIgnore>()
|
||||
.prefix_iter(rtxn, &[fid][..])?
|
||||
.remap_key_type::<FacetLevelValueF64Codec>()
|
||||
.last().transpose()?
|
||||
.last()
|
||||
.transpose()?
|
||||
.map(|((_, level, _, _), _)| level);
|
||||
Ok(level)
|
||||
}
|
||||
@ -215,7 +213,6 @@ impl<'t> Iterator for FacetIter<'t> {
|
||||
|
||||
match result {
|
||||
Ok(((_fid, level, left, right), mut docids)) => {
|
||||
|
||||
docids.intersect_with(&documents_ids);
|
||||
if !docids.is_empty() {
|
||||
if self.must_reduce {
|
||||
@ -242,11 +239,11 @@ impl<'t> Iterator for FacetIter<'t> {
|
||||
Ok(iter) => {
|
||||
self.level_iters.push((docids, iter));
|
||||
continue 'outer;
|
||||
},
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use pest::prec_climber::{Operator, Assoc, PrecClimber};
|
||||
use pest::prec_climber::{Assoc, Operator, PrecClimber};
|
||||
|
||||
pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| {
|
||||
use Assoc::*;
|
||||
|
@ -1,13 +1,11 @@
|
||||
use std::collections::HashSet;
|
||||
use std::cmp::{min, Reverse};
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::ops::{Index, IndexMut};
|
||||
|
||||
use levenshtein_automata::{DFA, Distance};
|
||||
|
||||
use crate::search::query_tree::{Operation, Query};
|
||||
use levenshtein_automata::{Distance, DFA};
|
||||
|
||||
use super::build_dfa;
|
||||
use crate::search::query_tree::{Operation, Query};
|
||||
|
||||
type IsPrefix = bool;
|
||||
|
||||
@ -28,7 +26,9 @@ impl MatchingWords {
|
||||
.collect();
|
||||
// Sort words by length in DESC order, prioritizing the longest word,
|
||||
// in order to highlight the longest part of the matched word.
|
||||
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len()));
|
||||
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| {
|
||||
Reverse(query_word.len())
|
||||
});
|
||||
Self { dfas }
|
||||
}
|
||||
|
||||
@ -37,12 +37,13 @@ impl MatchingWords {
|
||||
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) {
|
||||
Distance::Exact(t) if t <= *typo => {
|
||||
if *is_prefix {
|
||||
let (_dist, len) = prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes());
|
||||
let (_dist, len) =
|
||||
prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes());
|
||||
Some(len)
|
||||
} else {
|
||||
Some(word.len())
|
||||
}
|
||||
},
|
||||
}
|
||||
_otherwise => None,
|
||||
})
|
||||
}
|
||||
@ -54,11 +55,11 @@ fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
|
||||
match tree {
|
||||
Operation::Or(_, ops) | Operation::And(ops) => {
|
||||
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
|
||||
},
|
||||
}
|
||||
Operation::Query(Query { prefix, kind }) => {
|
||||
let typo = if kind.is_exact() { 0 } else { kind.typo() };
|
||||
out.insert((kind.word(), typo, *prefix));
|
||||
},
|
||||
}
|
||||
Operation::Phrase(words) => {
|
||||
for word in words {
|
||||
out.insert((word, 0, false));
|
||||
@ -80,10 +81,7 @@ struct N2Array<T> {
|
||||
|
||||
impl<T: Clone> N2Array<T> {
|
||||
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
|
||||
N2Array {
|
||||
y_size: y,
|
||||
buf: vec![value; x * y],
|
||||
}
|
||||
N2Array { y_size: y, buf: vec![value; x * y] }
|
||||
}
|
||||
}
|
||||
|
||||
@ -178,9 +176,8 @@ fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use crate::MatchingWords;
|
||||
use crate::search::query_tree::{Operation, Query, QueryKind};
|
||||
use crate::MatchingWords;
|
||||
|
||||
#[test]
|
||||
fn matched_length() {
|
||||
@ -194,13 +191,23 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn matching_words() {
|
||||
let query_tree = Operation::Or(false, vec![
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: true, kind: QueryKind::exact("split".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
|
||||
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) }),
|
||||
]),
|
||||
]);
|
||||
let query_tree = Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: true,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: true,
|
||||
kind: QueryKind::tolerant(1, "world".to_string()),
|
||||
}),
|
||||
])],
|
||||
);
|
||||
|
||||
let matching_words = MatchingWords::from_query_tree(&query_tree);
|
||||
|
||||
|
@ -6,6 +6,7 @@ use std::result::Result as StdResult;
|
||||
use std::str::Utf8Error;
|
||||
use std::time::Instant;
|
||||
|
||||
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
|
||||
use log::debug;
|
||||
@ -13,16 +14,13 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use once_cell::sync::Lazy;
|
||||
use roaring::bitmap::RoaringBitmap;
|
||||
|
||||
pub(crate) use self::facet::ParserRule;
|
||||
pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator};
|
||||
pub use self::matching_words::MatchingWords;
|
||||
use self::query_tree::QueryTreeBuilder;
|
||||
use crate::error::FieldIdMapMissingEntry;
|
||||
use crate::search::criteria::r#final::{Final, FinalResult};
|
||||
use crate::{Index, DocumentId, Result};
|
||||
|
||||
pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator};
|
||||
pub use self::matching_words::MatchingWords;
|
||||
pub(crate) use self::facet::ParserRule;
|
||||
use self::query_tree::QueryTreeBuilder;
|
||||
|
||||
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
|
||||
use crate::{DocumentId, Index, Result};
|
||||
|
||||
// Building these factories is not free.
|
||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||
@ -32,8 +30,8 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
||||
mod criteria;
|
||||
mod distinct;
|
||||
mod facet;
|
||||
mod query_tree;
|
||||
mod matching_words;
|
||||
mod query_tree;
|
||||
|
||||
pub struct Search<'a> {
|
||||
query: Option<String>,
|
||||
@ -117,7 +115,7 @@ impl<'a> Search<'a> {
|
||||
let result = analyzer.analyze(query);
|
||||
let tokens = result.tokens();
|
||||
builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq)))
|
||||
},
|
||||
}
|
||||
None => (None, None),
|
||||
};
|
||||
|
||||
@ -144,10 +142,11 @@ impl<'a> Search<'a> {
|
||||
None => self.perform_sort(NoopDistinct, matching_words, criteria),
|
||||
Some(name) => {
|
||||
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name.to_string(),
|
||||
process: "distinct attribute",
|
||||
})?;
|
||||
let id =
|
||||
field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
|
||||
field_name: name.to_string(),
|
||||
process: "distinct attribute",
|
||||
})?;
|
||||
let distinct = FacetDistinct::new(id, self.index, self.rtxn);
|
||||
self.perform_sort(distinct, matching_words, criteria)
|
||||
}
|
||||
@ -159,14 +158,15 @@ impl<'a> Search<'a> {
|
||||
mut distinct: D,
|
||||
matching_words: MatchingWords,
|
||||
mut criteria: Final,
|
||||
) -> Result<SearchResult>
|
||||
{
|
||||
) -> Result<SearchResult> {
|
||||
let mut offset = self.offset;
|
||||
let mut initial_candidates = RoaringBitmap::new();
|
||||
let mut excluded_candidates = RoaringBitmap::new();
|
||||
let mut documents_ids = Vec::with_capacity(self.limit);
|
||||
|
||||
while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_candidates)? {
|
||||
while let Some(FinalResult { candidates, bucket_candidates, .. }) =
|
||||
criteria.next(&excluded_candidates)?
|
||||
{
|
||||
debug!("Number of candidates found {}", candidates.len());
|
||||
|
||||
let excluded = take(&mut excluded_candidates);
|
||||
@ -183,7 +183,9 @@ impl<'a> Search<'a> {
|
||||
for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) {
|
||||
documents_ids.push(candidate?);
|
||||
}
|
||||
if documents_ids.len() == self.limit { break }
|
||||
if documents_ids.len() == self.limit {
|
||||
break;
|
||||
}
|
||||
excluded_candidates = candidates.into_excluded();
|
||||
}
|
||||
|
||||
@ -247,7 +249,7 @@ pub fn word_derivations<'c>(
|
||||
}
|
||||
|
||||
Ok(entry.insert(derived_words))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,7 @@
|
||||
use std::iter::{Chain, FromIterator};
|
||||
use std::ops::RangeInclusive;
|
||||
use roaring::bitmap::{RoaringBitmap, IntoIter};
|
||||
|
||||
use roaring::bitmap::{IntoIter, RoaringBitmap};
|
||||
|
||||
pub struct AvailableDocumentsIds {
|
||||
iter: Chain<IntoIter, RangeInclusive<u32>>,
|
||||
@ -18,16 +19,12 @@ impl AvailableDocumentsIds {
|
||||
None => 1..=0, // empty range iterator
|
||||
};
|
||||
|
||||
AvailableDocumentsIds {
|
||||
iter: available.into_iter().chain(iter),
|
||||
}
|
||||
},
|
||||
AvailableDocumentsIds { iter: available.into_iter().chain(iter) }
|
||||
}
|
||||
None => {
|
||||
let empty = RoaringBitmap::new().into_iter();
|
||||
AvailableDocumentsIds {
|
||||
iter: empty.chain(0..=u32::max_value()),
|
||||
}
|
||||
},
|
||||
AvailableDocumentsIds { iter: empty.chain(0..=u32::max_value()) }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use chrono::Utc;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result};
|
||||
use crate::{ExternalDocumentsIds, FieldsDistribution, Index, Result};
|
||||
|
||||
pub struct ClearDocuments<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -13,9 +13,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
update_id: u64
|
||||
update_id: u64,
|
||||
) -> ClearDocuments<'t, 'u, 'i> {
|
||||
|
||||
ClearDocuments { wtxn, index, _update_id: update_id }
|
||||
}
|
||||
|
||||
@ -80,8 +79,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
mod tests {
|
||||
use heed::EnvOpenOptions;
|
||||
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
use super::*;
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
|
||||
#[test]
|
||||
fn clear_documents() {
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::collections::HashMap;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use chrono::Utc;
|
||||
use fst::IntoStreamer;
|
||||
@ -7,11 +7,11 @@ use heed::types::{ByteSlice, Unit};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::error::{InternalError, FieldIdMapMissingEntry, UserError};
|
||||
use super::ClearDocuments;
|
||||
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::index::{db_name, main_key};
|
||||
use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result};
|
||||
use super::ClearDocuments;
|
||||
use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
|
||||
|
||||
pub struct DeleteDocuments<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -26,11 +26,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
update_id: u64,
|
||||
) -> Result<DeleteDocuments<'t, 'u, 'i>>
|
||||
{
|
||||
let external_documents_ids = index
|
||||
.external_documents_ids(wtxn)?
|
||||
.into_static();
|
||||
) -> Result<DeleteDocuments<'t, 'u, 'i>> {
|
||||
let external_documents_ids = index.external_documents_ids(wtxn)?.into_static();
|
||||
|
||||
Ok(DeleteDocuments {
|
||||
wtxn,
|
||||
@ -84,12 +81,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
key: Some(main_key::PRIMARY_KEY_KEY),
|
||||
}
|
||||
})?;
|
||||
let id_field = fields_ids_map.id(primary_key).ok_or_else(|| {
|
||||
FieldIdMapMissingEntry::FieldName {
|
||||
let id_field =
|
||||
fields_ids_map.id(primary_key).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
|
||||
field_name: primary_key.to_string(),
|
||||
process: "DeleteDocuments::execute",
|
||||
}
|
||||
})?;
|
||||
})?;
|
||||
|
||||
let Index {
|
||||
env: _env,
|
||||
@ -130,7 +126,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
let external_id = match serde_json::from_slice(content).unwrap() {
|
||||
Value::String(string) => SmallString32::from(string.as_str()),
|
||||
Value::Number(number) => SmallString32::from(number.to_string()),
|
||||
document_id => return Err(UserError::InvalidDocumentId { document_id }.into()),
|
||||
document_id => {
|
||||
return Err(UserError::InvalidDocumentId { document_id }.into())
|
||||
}
|
||||
};
|
||||
external_ids.push(external_id);
|
||||
}
|
||||
@ -160,7 +158,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
|
||||
match entry.get().checked_sub(count_diff) {
|
||||
Some(0) | None => entry.remove(),
|
||||
Some(count) => entry.insert(count)
|
||||
Some(count) => entry.insert(count),
|
||||
};
|
||||
}
|
||||
}
|
||||
@ -206,9 +204,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
}
|
||||
|
||||
// We construct an FST set that contains the words to delete from the words FST.
|
||||
let words_to_delete = words.iter().filter_map(|(word, must_remove)| {
|
||||
if *must_remove { Some(word.as_ref()) } else { None }
|
||||
});
|
||||
let words_to_delete =
|
||||
words.iter().filter_map(
|
||||
|(word, must_remove)| {
|
||||
if *must_remove {
|
||||
Some(word.as_ref())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
let words_to_delete = fst::Set::from_iter(words_to_delete)?;
|
||||
|
||||
let new_words_fst = {
|
||||
@ -285,7 +290,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
// We delete the documents ids that are under the pairs of words,
|
||||
// it is faster and uses no memory to iterate over all the word pairs than
|
||||
// to compute the cartesian product of every word of the deleted documents.
|
||||
let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
|
||||
let mut iter =
|
||||
word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
@ -300,7 +306,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
drop(iter);
|
||||
|
||||
// We delete the documents ids that are under the word level position docids.
|
||||
let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||
let mut iter =
|
||||
word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
@ -315,7 +322,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
drop(iter);
|
||||
|
||||
// We delete the documents ids that are under the word prefix level position docids.
|
||||
let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||
let mut iter =
|
||||
word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
@ -397,12 +405,11 @@ fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>(
|
||||
convert: F,
|
||||
) -> heed::Result<()>
|
||||
where
|
||||
C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>,
|
||||
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
|
||||
F: Fn(K) -> DocumentId,
|
||||
{
|
||||
let mut iter = db.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter_mut(wtxn, &[field_id])?
|
||||
.remap_key_type::<C>();
|
||||
let mut iter =
|
||||
db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>();
|
||||
|
||||
while let Some(result) = iter.next() {
|
||||
let (key, ()) = result?;
|
||||
@ -441,8 +448,8 @@ where
|
||||
mod tests {
|
||||
use heed::EnvOpenOptions;
|
||||
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
use super::*;
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
|
||||
#[test]
|
||||
fn delete_documents_with_numbers_as_primary_key() {
|
||||
|
@ -3,17 +3,18 @@ use std::fs::File;
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use chrono::Utc;
|
||||
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
||||
use grenad::{CompressionType, FileFuse, Reader, Writer};
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{BytesEncode, Error};
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::InternalError;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::heed_codec::facet::FacetLevelValueF64Codec;
|
||||
use crate::update::index_documents::WriteMethod;
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::update::index_documents::{
|
||||
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
|
||||
};
|
||||
use crate::{Index, Result};
|
||||
|
||||
pub struct Facets<'t, 'u, 'i> {
|
||||
@ -32,8 +33,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
update_id: u64,
|
||||
) -> Facets<'t, 'u, 'i>
|
||||
{
|
||||
) -> Facets<'t, 'u, 'i> {
|
||||
Facets {
|
||||
wtxn,
|
||||
index,
|
||||
@ -72,11 +72,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
)?;
|
||||
|
||||
// Clear the facet number levels.
|
||||
clear_field_number_levels(
|
||||
self.wtxn,
|
||||
self.index.facet_id_f64_docids,
|
||||
field_id,
|
||||
)?;
|
||||
clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?;
|
||||
|
||||
// Compute and store the faceted numbers documents ids.
|
||||
let number_documents_ids = compute_faceted_documents_ids(
|
||||
@ -96,8 +92,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
field_id,
|
||||
)?;
|
||||
|
||||
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?;
|
||||
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?;
|
||||
self.index.put_string_faceted_documents_ids(
|
||||
self.wtxn,
|
||||
field_id,
|
||||
&string_documents_ids,
|
||||
)?;
|
||||
self.index.put_number_faceted_documents_ids(
|
||||
self.wtxn,
|
||||
field_id,
|
||||
&number_documents_ids,
|
||||
)?;
|
||||
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
@ -112,12 +116,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
}
|
||||
}
|
||||
|
||||
fn clear_field_number_levels<'t, >(
|
||||
fn clear_field_number_levels<'t>(
|
||||
wtxn: &'t mut heed::RwTxn,
|
||||
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
field_id: u8,
|
||||
) -> heed::Result<()>
|
||||
{
|
||||
) -> heed::Result<()> {
|
||||
let left = (field_id, 1, f64::MIN, f64::MIN);
|
||||
let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
|
||||
let range = left..=right;
|
||||
@ -133,8 +136,7 @@ fn compute_facet_number_levels<'t>(
|
||||
level_group_size: NonZeroUsize,
|
||||
min_level_size: NonZeroUsize,
|
||||
field_id: u8,
|
||||
) -> Result<Reader<FileFuse>>
|
||||
{
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
let first_level_size = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(rtxn, &[field_id])?
|
||||
@ -143,9 +145,8 @@ fn compute_facet_number_levels<'t>(
|
||||
|
||||
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
|
||||
// therefore we write the facet level entries into a grenad file before transferring them.
|
||||
let mut writer = tempfile::tempfile().and_then(|file| {
|
||||
create_writer(compression_type, compression_level, file)
|
||||
})?;
|
||||
let mut writer = tempfile::tempfile()
|
||||
.and_then(|file| create_writer(compression_type, compression_level, file))?;
|
||||
|
||||
let level_0_range = {
|
||||
let left = (field_id, 0, f64::MIN, f64::MIN);
|
||||
@ -196,8 +197,7 @@ fn compute_faceted_documents_ids(
|
||||
rtxn: &heed::RoTxn,
|
||||
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||
field_id: u8,
|
||||
) -> Result<RoaringBitmap>
|
||||
{
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
|
||||
for result in db.prefix_iter(rtxn, &[field_id])? {
|
||||
@ -215,8 +215,7 @@ fn write_number_entry(
|
||||
left: f64,
|
||||
right: f64,
|
||||
ids: &RoaringBitmap,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
let key = (field_id, level, left, right);
|
||||
let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?;
|
||||
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, Seek, SeekFrom, BufReader, BufRead};
|
||||
use std::io::{self, BufRead, BufReader, Seek, SeekFrom};
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::result::Result as StdResult;
|
||||
use std::str;
|
||||
@ -10,28 +10,26 @@ use std::time::Instant;
|
||||
|
||||
use bstr::ByteSlice as _;
|
||||
use chrono::Utc;
|
||||
use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType};
|
||||
use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer};
|
||||
use heed::types::ByteSlice;
|
||||
use log::{debug, info, error};
|
||||
use log::{debug, error, info};
|
||||
use memmap::Mmap;
|
||||
use rayon::prelude::*;
|
||||
use rayon::ThreadPool;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::error::{Error, InternalError};
|
||||
use crate::{Index, Result};
|
||||
use crate::update::{
|
||||
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
|
||||
WordPrefixPairProximityDocids,
|
||||
};
|
||||
use self::store::{Store, Readers};
|
||||
pub use self::merge_function::{
|
||||
fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first
|
||||
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
|
||||
};
|
||||
use self::store::{Readers, Store};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
|
||||
use crate::MergeFn;
|
||||
use super::UpdateBuilder;
|
||||
use crate::error::{Error, InternalError};
|
||||
use crate::update::{
|
||||
Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
|
||||
WordsLevelPositions, WordsPrefixesFst,
|
||||
};
|
||||
use crate::{Index, MergeFn, Result};
|
||||
|
||||
mod merge_function;
|
||||
mod store;
|
||||
@ -48,7 +46,11 @@ pub enum WriteMethod {
|
||||
GetMergePut,
|
||||
}
|
||||
|
||||
pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
|
||||
pub fn create_writer(
|
||||
typ: CompressionType,
|
||||
level: Option<u32>,
|
||||
file: File,
|
||||
) -> io::Result<Writer<File>> {
|
||||
let mut builder = Writer::builder();
|
||||
builder.compression_type(typ);
|
||||
if let Some(level) = level {
|
||||
@ -64,8 +66,7 @@ pub fn create_sorter<E>(
|
||||
chunk_fusing_shrink_size: Option<u64>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
) -> Sorter<MergeFn<E>>
|
||||
{
|
||||
) -> Sorter<MergeFn<E>> {
|
||||
let mut builder = Sorter::builder(merge);
|
||||
if let Some(shrink_size) = chunk_fusing_shrink_size {
|
||||
builder.file_fusing_shrink_size(shrink_size);
|
||||
@ -83,7 +84,10 @@ pub fn create_sorter<E>(
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Result<Reader<FileFuse>> {
|
||||
pub fn writer_into_reader(
|
||||
writer: Writer<File>,
|
||||
shrink_size: Option<u64>,
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
let mut file = writer.into_inner()?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
let file = if let Some(shrink_size) = shrink_size {
|
||||
@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Res
|
||||
pub fn merge_readers<E>(
|
||||
sources: Vec<Reader<FileFuse>>,
|
||||
merge: MergeFn<E>,
|
||||
) -> Merger<FileFuse, MergeFn<E>>
|
||||
{
|
||||
) -> Merger<FileFuse, MergeFn<E>> {
|
||||
let mut builder = Merger::builder(merge);
|
||||
builder.extend(sources);
|
||||
builder.build()
|
||||
@ -118,13 +121,7 @@ where
|
||||
let before = Instant::now();
|
||||
|
||||
let merger = merge_readers(sources, merge);
|
||||
merger_iter_into_lmdb_database(
|
||||
wtxn,
|
||||
database,
|
||||
merger.into_merge_iter()?,
|
||||
merge,
|
||||
method,
|
||||
)?;
|
||||
merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?;
|
||||
|
||||
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
@ -149,7 +146,7 @@ where
|
||||
while let Some((k, v)) = reader.next()? {
|
||||
out_iter.append(k, v)?;
|
||||
}
|
||||
},
|
||||
}
|
||||
WriteMethod::GetMergePut => {
|
||||
while let Some((k, v)) = reader.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
@ -158,11 +155,11 @@ where
|
||||
let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..];
|
||||
let val = merge(k, &vals)?;
|
||||
iter.put_current(k, &val)?;
|
||||
},
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database<E>(
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
Error: From<grenad::Error<E>>
|
||||
Error: From<grenad::Error<E>>,
|
||||
{
|
||||
debug!("Writing MTBL sorter...");
|
||||
let before = Instant::now();
|
||||
|
||||
merger_iter_into_lmdb_database(
|
||||
wtxn,
|
||||
database,
|
||||
sorter.into_iter()?,
|
||||
merge,
|
||||
method,
|
||||
)?;
|
||||
merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?;
|
||||
|
||||
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
@ -214,7 +205,7 @@ where
|
||||
while let Some((k, v)) = sorter.next()? {
|
||||
out_iter.append(k, v)?;
|
||||
}
|
||||
},
|
||||
}
|
||||
WriteMethod::GetMergePut => {
|
||||
while let Some((k, v)) = sorter.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
@ -226,14 +217,14 @@ where
|
||||
InternalError::IndexingMergingKeys { process: "get-put-merge" }
|
||||
})?;
|
||||
iter.put_current(k, &val)?;
|
||||
},
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
// Early return when there is no document to add
|
||||
if reader.buffer().is_empty() {
|
||||
return Ok(DocumentAdditionResult {
|
||||
nb_documents: 0,
|
||||
})
|
||||
return Ok(DocumentAdditionResult { nb_documents: 0 });
|
||||
}
|
||||
|
||||
self.index.set_updated_at(self.wtxn, &Utc::now())?;
|
||||
@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
let output = match self.update_format {
|
||||
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
|
||||
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
|
||||
UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?,
|
||||
UpdateFormat::JsonStream => {
|
||||
transform.output_from_json_stream(reader, &progress_callback)?
|
||||
}
|
||||
};
|
||||
|
||||
let nb_documents = output.documents_count;
|
||||
@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let before_indexing = Instant::now();
|
||||
|
||||
@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
// settings if none have already been set.
|
||||
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
|
||||
&backup_pool
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let readers = pool.install(|| {
|
||||
@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
|
||||
let contains_documents = !documents_ids.is_empty();
|
||||
let write_method = if contains_documents {
|
||||
WriteMethod::GetMergePut
|
||||
} else {
|
||||
WriteMethod::Append
|
||||
};
|
||||
let write_method =
|
||||
if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append };
|
||||
|
||||
debug!("Writing using the write method: {:?}", write_method);
|
||||
|
||||
@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
*self.index.docid_word_positions.as_polymorph(),
|
||||
docid_word_positions_readers,
|
||||
keep_first,
|
||||
write_method
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
*self.index.documents.as_polymorph(),
|
||||
documents_readers,
|
||||
keep_first,
|
||||
write_method
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
fst_merge,
|
||||
WriteMethod::GetMergePut,
|
||||
)?;
|
||||
},
|
||||
}
|
||||
DatabaseType::WordDocids => {
|
||||
debug!("Writing the words docids into LMDB on disk...");
|
||||
let db = *self.index.word_docids.as_polymorph();
|
||||
@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
},
|
||||
}
|
||||
DatabaseType::FacetLevel0NumbersDocids => {
|
||||
debug!("Writing the facet numbers docids into LMDB on disk...");
|
||||
let db = *self.index.facet_id_f64_docids.as_polymorph();
|
||||
@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
},
|
||||
}
|
||||
DatabaseType::FieldIdWordCountDocids => {
|
||||
debug!("Writing the field id word count docids into LMDB on disk...");
|
||||
let db = *self.index.field_id_word_count_docids.as_polymorph();
|
||||
@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
},
|
||||
}
|
||||
DatabaseType::WordLevel0PositionDocids => {
|
||||
debug!("Writing the word level 0 positions docids into LMDB on disk...");
|
||||
let db = *self.index.word_level_position_docids.as_polymorph();
|
||||
@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use heed::EnvOpenOptions;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple_document_replacement() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
@ -1053,9 +1042,8 @@ mod tests {
|
||||
assert_eq!(count, 3);
|
||||
|
||||
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
|
||||
let (kevin_id, _) = docs.iter().find(|(_, d)| {
|
||||
d.get(0).unwrap() == br#""updated kevin""#
|
||||
}).unwrap();
|
||||
let (kevin_id, _) =
|
||||
docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
|
||||
let (id, doc) = docs[*kevin_id as usize];
|
||||
assert_eq!(id, *kevin_id);
|
||||
|
||||
|
@ -8,25 +8,29 @@ use std::{cmp, iter};
|
||||
|
||||
use bstr::ByteSlice as _;
|
||||
use fst::Set;
|
||||
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
|
||||
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
|
||||
use heed::BytesEncode;
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
use log::{debug, info};
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
|
||||
use meilisearch_tokenizer::token::SeparatorKind;
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
use tempfile::tempfile;
|
||||
|
||||
use super::merge_function::{
|
||||
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
|
||||
};
|
||||
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
|
||||
use crate::error::{Error, InternalError, SerializationError};
|
||||
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
|
||||
FieldDocIdFacetStringCodec,
|
||||
};
|
||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::update::UpdateIndexingStep;
|
||||
use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result};
|
||||
|
||||
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
||||
use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge};
|
||||
use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32};
|
||||
|
||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||
// NOTE(review): despite its name, this constant equals 1024 * 1024 (1_048_576),
// i.e. one mebibyte if the unit is bytes, not one kilobyte. Confirm the intended value.
const ONE_KILOBYTE: usize = 1024 * 1024;
|
||||
@ -56,7 +60,8 @@ pub struct Store<'s, A> {
|
||||
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
|
||||
word_docids_limit: usize,
|
||||
field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids:
|
||||
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids_limit: usize,
|
||||
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
|
||||
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
|
||||
@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
chunk_compression_level: Option<u32>,
|
||||
chunk_fusing_shrink_size: Option<u64>,
|
||||
stop_words: Option<&'s Set<A>>,
|
||||
) -> Result<Self>
|
||||
{
|
||||
) -> Result<Self> {
|
||||
// We divide the max memory by the number of sorters the Store has.
|
||||
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
|
||||
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
|
||||
@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
Some(1024 * 1024 * 1024), // 1GB
|
||||
);
|
||||
|
||||
let documents_writer = tempfile().and_then(|f| {
|
||||
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||
})?;
|
||||
let docid_word_positions_writer = tempfile().and_then(|f| {
|
||||
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||
})?;
|
||||
let documents_writer = tempfile()
|
||||
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
|
||||
let docid_word_positions_writer = tempfile()
|
||||
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
|
||||
|
||||
let mut config = AnalyzerConfig::default();
|
||||
if let Some(stop_words) = stop_words {
|
||||
@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> {
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.word_docids.get_refresh(word.as_bytes()) {
|
||||
Some(old) => { old.insert(id); },
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
let word_vec = SmallVec32::from(word.as_bytes());
|
||||
// A newly inserted element is appended at the end of the linked hash map.
|
||||
@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
field_id: FieldId,
|
||||
value: OrderedFloat<f64>,
|
||||
id: DocumentId,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
let sorter = &mut self.field_id_docid_facet_numbers_sorter;
|
||||
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
|
||||
|
||||
let key = (field_id, value);
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.facet_field_number_docids.get_refresh(&key) {
|
||||
Some(old) => { old.insert(id); },
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is appended at the end of the linked hash map.
|
||||
self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
|
||||
@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
field_id: FieldId,
|
||||
value: String,
|
||||
id: DocumentId,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
let sorter = &mut self.field_id_docid_facet_strings_sorter;
|
||||
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
|
||||
|
||||
let key = (field_id, value);
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.facet_field_string_docids.get_refresh(&key) {
|
||||
Some(old) => { old.insert(id); },
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is appended at the end of the linked hash map.
|
||||
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
|
||||
@ -309,10 +315,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
// Save the documents ids under the words pairs proximities that it contains.
|
||||
fn insert_words_pairs_proximities_docids<'a>(
|
||||
&mut self,
|
||||
words_pairs_proximities: impl IntoIterator<Item=((&'a str, &'a str), u8)>,
|
||||
words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>,
|
||||
id: DocumentId,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
for ((w1, w2), prox) in words_pairs_proximities {
|
||||
let w1 = SmallVec32::from(w1.as_bytes());
|
||||
let w2 = SmallVec32::from(w2.as_bytes());
|
||||
@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
// if get_refresh finds the element it is assured
|
||||
// to be at the end of the linked hash map.
|
||||
match self.words_pairs_proximities_docids.get_refresh(&key) {
|
||||
Some(old) => { old.insert(id); },
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is appended at the end of the linked hash map.
|
||||
let ids = RoaringBitmap::from_iter(Some(id));
|
||||
@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
// Removing front elements is equivalent to removing the LRUs.
|
||||
let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front());
|
||||
iter.take(overflow).for_each(|x| lrus.push(x));
|
||||
Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?;
|
||||
Self::write_words_pairs_proximities(
|
||||
&mut self.words_pairs_proximities_docids_sorter,
|
||||
lrus,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
|
||||
facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
|
||||
record: &[u8],
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
// We compute the list of words pairs proximities (self-join) and write it directly to disk.
|
||||
let words_pair_proximities = compute_words_pair_proximities(&words_positions);
|
||||
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
|
||||
@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
}
|
||||
|
||||
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
|
||||
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
|
||||
Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?;
|
||||
Self::write_docid_word_positions(
|
||||
&mut self.docid_word_positions_writer,
|
||||
document_id,
|
||||
words_positions,
|
||||
)?;
|
||||
Self::write_word_position_docids(
|
||||
&mut self.word_level_position_docids_sorter,
|
||||
document_id,
|
||||
words_positions,
|
||||
)?;
|
||||
|
||||
words_positions.clear();
|
||||
|
||||
@ -387,7 +404,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
|
||||
fn write_words_pairs_proximities<E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
iter: impl IntoIterator<Item=((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
|
||||
iter: impl IntoIterator<Item = ((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
writer: &mut Writer<File>,
|
||||
id: DocumentId,
|
||||
words_positions: &HashMap<String, SmallVec32<Position>>,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
// We prefix the words by the document id.
|
||||
let mut key = id.to_be_bytes().to_vec();
|
||||
let mut buffer = Vec::new();
|
||||
@ -484,12 +500,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_facet_field_string_docids<I, E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
iter: I,
|
||||
) -> Result<()>
|
||||
fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item=((FieldId, String), RoaringBitmap)>,
|
||||
I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key_buffer = Vec::new();
|
||||
@ -510,12 +523,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_facet_field_number_docids<I, E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
iter: I,
|
||||
) -> Result<()>
|
||||
fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
|
||||
I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut data_buffer = Vec::new();
|
||||
@ -579,7 +589,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
|
||||
fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item=(SmallVec32<u8>, RoaringBitmap)>,
|
||||
I: IntoIterator<Item = (SmallVec32<u8>, RoaringBitmap)>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key = Vec::new();
|
||||
@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
log_every_n: Option<usize>,
|
||||
mut progress_callback: F,
|
||||
) -> Result<Readers>
|
||||
where F: FnMut(UpdateIndexingStep),
|
||||
where
|
||||
F: FnMut(UpdateIndexingStep),
|
||||
{
|
||||
debug!("{:?}: Indexing in a Store...", thread_index);
|
||||
|
||||
@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
if count % num_threads == thread_index {
|
||||
// This is a log routine that we do every `log_every_n` documents.
|
||||
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
|
||||
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
|
||||
info!(
|
||||
"We have seen {} documents so far ({:.02?}).",
|
||||
format_count(count),
|
||||
before.elapsed()
|
||||
);
|
||||
progress_callback(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: count,
|
||||
total_documents: documents_count,
|
||||
@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
}
|
||||
|
||||
for (attr, content) in document.iter() {
|
||||
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) {
|
||||
let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
|
||||
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr)
|
||||
{
|
||||
let value =
|
||||
serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
let (facet_numbers, facet_strings) = extract_facet_values(&value);
|
||||
facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers);
|
||||
facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings);
|
||||
facet_numbers_values
|
||||
.entry(attr)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend(facet_numbers);
|
||||
facet_strings_values
|
||||
.entry(attr)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend(facet_strings);
|
||||
|
||||
if self.searchable_fields.contains(&attr) {
|
||||
let content = match json_to_string(&value) {
|
||||
@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
|
||||
last_pos = Some(pos);
|
||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
|
||||
words_positions
|
||||
.entry(token.text().to_string())
|
||||
.or_insert_with(SmallVec32::new)
|
||||
.push(position);
|
||||
}
|
||||
|
||||
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
|
||||
let key = (attr, last_pos as u8 + 1);
|
||||
self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id);
|
||||
self.field_id_word_count_docids
|
||||
.entry(key)
|
||||
.or_insert_with(RoaringBitmap::new)
|
||||
.insert(document_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
self.facet_field_string_docids,
|
||||
)?;
|
||||
|
||||
let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut word_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
let mut iter = self.word_docids_sorter.into_iter()?;
|
||||
@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.main_sorter.write_into(&mut main_wtr)?;
|
||||
|
||||
let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
|
||||
let mut words_pairs_proximities_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.words_pairs_proximities_docids_sorter
|
||||
.write_into(&mut words_pairs_proximities_docids_wtr)?;
|
||||
|
||||
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut word_level_position_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
|
||||
|
||||
let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut field_id_word_count_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
|
||||
|
||||
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut facet_field_numbers_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
|
||||
|
||||
let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut facet_field_strings_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
|
||||
|
||||
let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?;
|
||||
let mut field_id_docid_facet_numbers_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_numbers_sorter
|
||||
.write_into(&mut field_id_docid_facet_numbers_wtr)?;
|
||||
|
||||
let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?;
|
||||
let mut field_id_docid_facet_strings_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_strings_sorter
|
||||
.write_into(&mut field_id_docid_facet_strings_wtr)?;
|
||||
|
||||
let main = writer_into_reader(main_wtr, shrink_size)?;
|
||||
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
|
||||
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
||||
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
|
||||
let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
|
||||
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
|
||||
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
|
||||
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
|
||||
let words_pairs_proximities_docids =
|
||||
writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
||||
let word_level_position_docids =
|
||||
writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
|
||||
let field_id_word_count_docids =
|
||||
writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
|
||||
let facet_field_numbers_docids =
|
||||
writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
|
||||
let facet_field_strings_docids =
|
||||
writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_numbers =
|
||||
writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_strings =
|
||||
writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
|
||||
let docid_word_positions =
|
||||
writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
|
||||
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
|
||||
|
||||
Ok(Readers {
|
||||
@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
/// close to each other.
|
||||
fn compute_words_pair_proximities(
|
||||
word_positions: &HashMap<String, SmallVec32<Position>>,
|
||||
) -> HashMap<(&str, &str), u8>
|
||||
{
|
||||
) -> HashMap<(&str, &str), u8> {
|
||||
use itertools::Itertools;
|
||||
|
||||
let mut words_pair_proximities = HashMap::new();
|
||||
@ -828,31 +875,34 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool {
|
||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
|
||||
/// else we keep the standard proximity of 1 between words.
|
||||
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
fn process_tokens<'a>(
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator().is_some())
|
||||
.scan((0, None), |(offset, prev_kind), token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => (),
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
|
||||
@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push(b.to_string()),
|
||||
Value::Number(number) => if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
},
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(string) => {
|
||||
let string = string.trim().to_lowercase();
|
||||
output_strings.push(string);
|
||||
},
|
||||
Value::Array(values) => if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(value, false, output_numbers, output_strings);
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(value, false, output_numbers, output_strings);
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
Value::Object(_) => (),
|
||||
}
|
||||
}
|
||||
|
@ -10,14 +10,15 @@ use log::info;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use crate::error::{Error, UserError, InternalError};
|
||||
use crate::index::db_name;
|
||||
use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
|
||||
use crate::{Index, Result};
|
||||
use super::merge_function::merge_two_obkvs;
|
||||
use super::{create_writer, create_sorter, IndexDocumentsMethod};
|
||||
use super::{create_sorter, create_writer, IndexDocumentsMethod};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::db_name;
|
||||
use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{
|
||||
ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32,
|
||||
};
|
||||
|
||||
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
|
||||
|
||||
@ -64,7 +65,11 @@ impl Transform<'_, '_> {
|
||||
self.output_from_generic_json(reader, false, progress_callback)
|
||||
}
|
||||
|
||||
pub fn output_from_json_stream<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
|
||||
pub fn output_from_json_stream<R, F>(
|
||||
self,
|
||||
reader: R,
|
||||
progress_callback: F,
|
||||
) -> Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
@ -86,14 +91,16 @@ impl Transform<'_, '_> {
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||
|
||||
// Deserialize the whole batch of documents in memory.
|
||||
let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream {
|
||||
let mut documents: Peekable<
|
||||
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
|
||||
> = if is_stream {
|
||||
let iter = serde_json::Deserializer::from_reader(reader).into_iter();
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
|
||||
iter.peekable()
|
||||
} else {
|
||||
let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
|
||||
let iter = vec.into_iter().map(Ok);
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
|
||||
iter.peekable()
|
||||
};
|
||||
|
||||
@ -104,15 +111,16 @@ impl Transform<'_, '_> {
|
||||
Err(_) => {
|
||||
let error = documents.next().unwrap().unwrap_err();
|
||||
return Err(UserError::SerdeJson(error).into());
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
|
||||
let alternative_name =
|
||||
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
|
||||
let (primary_key_id, primary_key) = compute_primary_key_pair(
|
||||
self.index.primary_key(self.rtxn)?,
|
||||
&mut fields_ids_map,
|
||||
alternative_name,
|
||||
self.autogenerate_docids
|
||||
self.autogenerate_docids,
|
||||
)?;
|
||||
|
||||
if documents.peek().is_none() {
|
||||
@ -173,9 +181,11 @@ impl Transform<'_, '_> {
|
||||
Some(value) => match value {
|
||||
Value::String(string) => Cow::Borrowed(string.as_str()),
|
||||
Value::Number(number) => Cow::Owned(number.to_string()),
|
||||
content => return Err(UserError::InvalidDocumentId {
|
||||
document_id: content.clone(),
|
||||
}.into()),
|
||||
content => {
|
||||
return Err(
|
||||
UserError::InvalidDocumentId { document_id: content.clone() }.into()
|
||||
)
|
||||
}
|
||||
},
|
||||
None => {
|
||||
if !self.autogenerate_docids {
|
||||
@ -183,7 +193,7 @@ impl Transform<'_, '_> {
|
||||
}
|
||||
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
|
||||
Cow::Borrowed(uuid)
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// We iterate in the fields ids ordered.
|
||||
@ -194,7 +204,8 @@ impl Transform<'_, '_> {
|
||||
// and this should be the document id we return the one we generated.
|
||||
if let Some(value) = document.get(name) {
|
||||
// We serialize the attribute values.
|
||||
serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?;
|
||||
serde_json::to_writer(&mut json_buffer, value)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(field_id, &json_buffer)?;
|
||||
}
|
||||
|
||||
@ -202,7 +213,8 @@ impl Transform<'_, '_> {
|
||||
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::from(external_id),
|
||||
}.into());
|
||||
}
|
||||
.into());
|
||||
}
|
||||
}
|
||||
|
||||
@ -248,9 +260,9 @@ impl Transform<'_, '_> {
|
||||
// Extract the position of the primary key in the current headers, None if not found.
|
||||
let primary_key_pos = match self.index.primary_key(self.rtxn)? {
|
||||
Some(primary_key) => {
|
||||
// The primary key is known so we must find the position in the CSV headers.
|
||||
headers.iter().position(|h| h == primary_key)
|
||||
},
|
||||
// The primary key is known so we must find the position in the CSV headers.
|
||||
headers.iter().position(|h| h == primary_key)
|
||||
}
|
||||
None => headers.iter().position(is_primary_key),
|
||||
};
|
||||
|
||||
@ -261,7 +273,7 @@ impl Transform<'_, '_> {
|
||||
self.index.primary_key(self.rtxn)?,
|
||||
&mut fields_ids_map,
|
||||
alternative_name,
|
||||
self.autogenerate_docids
|
||||
self.autogenerate_docids,
|
||||
)?;
|
||||
|
||||
// The primary key field is not present in the header, so we need to create it.
|
||||
@ -308,27 +320,30 @@ impl Transform<'_, '_> {
|
||||
// We validate the document id [a-zA-Z0-9\-_].
|
||||
match validate_document_id(&external_id) {
|
||||
Some(valid) => valid,
|
||||
None => return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::from(external_id),
|
||||
}.into()),
|
||||
None => {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::from(external_id),
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
|
||||
};
|
||||
|
||||
// When the primary_key_field_id is found in the fields ids list
|
||||
// we return the generated document id instead of the record field.
|
||||
let iter = fields_ids.iter()
|
||||
.map(|(fi, i)| {
|
||||
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
|
||||
(fi, field)
|
||||
});
|
||||
let iter = fields_ids.iter().map(|(fi, i)| {
|
||||
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
|
||||
(fi, field)
|
||||
});
|
||||
|
||||
// We retrieve the field id based on the fields ids map fields ids order.
|
||||
for (field_id, field) in iter {
|
||||
// We serialize the attribute values as JSON strings.
|
||||
json_buffer.clear();
|
||||
serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?;
|
||||
serde_json::to_writer(&mut json_buffer, &field)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(*field_id, &json_buffer)?;
|
||||
}
|
||||
|
||||
@ -410,26 +425,27 @@ impl Transform<'_, '_> {
|
||||
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
|
||||
IndexDocumentsMethod::UpdateDocuments => {
|
||||
let key = BEU32::new(docid);
|
||||
let base_obkv = self.index.documents.get(&self.rtxn, &key)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::DOCUMENTS,
|
||||
key: None,
|
||||
})?;
|
||||
},
|
||||
)?;
|
||||
let update_obkv = obkv::KvReader::new(update_obkv);
|
||||
merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
|
||||
(docid, obkv_buffer.as_slice())
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
None => {
|
||||
// If this user id is new we add it to the external documents ids map
|
||||
// for new ids and into the list of new documents.
|
||||
let new_docid = available_documents_ids.next()
|
||||
.ok_or(UserError::DocumentLimitReached)?;
|
||||
let new_docid =
|
||||
available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?;
|
||||
new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
|
||||
new_documents_ids.insert(new_docid);
|
||||
(new_docid, update_obkv)
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// We insert the document under the documents ids map into the final file.
|
||||
@ -450,7 +466,8 @@ impl Transform<'_, '_> {
|
||||
|
||||
// We create a final writer to write the new documents in order from the sorter.
|
||||
let file = tempfile::tempfile()?;
|
||||
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
let mut writer =
|
||||
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
|
||||
// Once we have written all the documents into the final sorter, we write the documents
|
||||
// into this writer, extract the file and reset the seek to be able to read it again.
|
||||
@ -485,8 +502,7 @@ impl Transform<'_, '_> {
|
||||
primary_key: String,
|
||||
old_fields_ids_map: FieldsIdsMap,
|
||||
new_fields_ids_map: FieldsIdsMap,
|
||||
) -> Result<TransformOutput>
|
||||
{
|
||||
) -> Result<TransformOutput> {
|
||||
let fields_distribution = self.index.fields_distribution(self.rtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
|
||||
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
||||
@ -494,7 +510,8 @@ impl Transform<'_, '_> {
|
||||
|
||||
// We create a final writer to write the new documents in order from the sorter.
|
||||
let file = tempfile::tempfile()?;
|
||||
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
let mut writer =
|
||||
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
|
||||
let mut obkv_buffer = Vec::new();
|
||||
for result in self.index.documents.iter(self.rtxn)? {
|
||||
@ -561,20 +578,19 @@ fn compute_primary_key_pair(
|
||||
return Err(UserError::MissingPrimaryKey.into());
|
||||
}
|
||||
DEFAULT_PRIMARY_KEY_NAME.to_string()
|
||||
},
|
||||
}
|
||||
};
|
||||
let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
|
||||
Ok((id, name))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_document_id(document_id: &str) -> Option<&str> {
|
||||
let document_id = document_id.trim();
|
||||
Some(document_id).filter(|id| {
|
||||
!id.is_empty() && id.chars().all(|c| {
|
||||
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
|
||||
})
|
||||
!id.is_empty()
|
||||
&& id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
||||
})
|
||||
}
|
||||
|
||||
@ -583,8 +599,7 @@ mod test {
|
||||
use super::*;
|
||||
|
||||
mod compute_primary_key {
|
||||
use super::compute_primary_key_pair;
|
||||
use super::FieldsIdsMap;
|
||||
use super::{compute_primary_key_pair, FieldsIdsMap};
|
||||
|
||||
#[test]
|
||||
fn should_return_primary_key_if_is_some() {
|
||||
@ -594,7 +609,8 @@ mod test {
|
||||
Some("toto"),
|
||||
&mut fields_map,
|
||||
Some("tata".to_string()),
|
||||
false);
|
||||
false,
|
||||
);
|
||||
assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
@ -602,11 +618,8 @@ mod test {
|
||||
#[test]
|
||||
fn should_return_alternative_if_primary_is_none() {
|
||||
let mut fields_map = FieldsIdsMap::new();
|
||||
let result = compute_primary_key_pair(
|
||||
None,
|
||||
&mut fields_map,
|
||||
Some("tata".to_string()),
|
||||
false);
|
||||
let result =
|
||||
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
|
||||
assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
@ -614,23 +627,15 @@ mod test {
|
||||
#[test]
|
||||
fn should_return_default_if_both_are_none() {
|
||||
let mut fields_map = FieldsIdsMap::new();
|
||||
let result = compute_primary_key_pair(
|
||||
None,
|
||||
&mut fields_map,
|
||||
None,
|
||||
true);
|
||||
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
|
||||
assert_eq!(result.unwrap(), (0u8, "id".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn should_return_err_if_both_are_none_and_recompute_is_false(){
|
||||
fn should_return_err_if_both_are_none_and_recompute_is_false() {
|
||||
let mut fields_map = FieldsIdsMap::new();
|
||||
let result = compute_primary_key_pair(
|
||||
None,
|
||||
&mut fields_map,
|
||||
None,
|
||||
false);
|
||||
let result = compute_primary_key_pair(None, &mut fields_map, None, false);
|
||||
assert!(result.is_err());
|
||||
assert_eq!(fields_map.len(), 0);
|
||||
}
|
||||
|
@ -2,7 +2,9 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
|
||||
pub use self::clear_documents::ClearDocuments;
|
||||
pub use self::delete_documents::DeleteDocuments;
|
||||
pub use self::facets::Facets;
|
||||
pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat};
|
||||
pub use self::index_documents::{
|
||||
DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat,
|
||||
};
|
||||
pub use self::settings::{Setting, Settings};
|
||||
pub use self::update_builder::UpdateBuilder;
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
|
@ -34,17 +34,24 @@ impl<T> Setting<T> {
|
||||
}
|
||||
|
||||
impl<T: Serialize> Serialize for Setting<T> {
|
||||
fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error> where S: Serializer {
|
||||
fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
match self {
|
||||
Self::Set(value) => Some(value),
|
||||
// Usually not_set isn't serialized by setting skip_serializing_if field attribute
|
||||
Self::NotSet | Self::Reset => None,
|
||||
}.serialize(serializer)
|
||||
}
|
||||
.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
|
||||
fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error> where D: Deserializer<'de> {
|
||||
fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
Deserialize::deserialize(deserializer).map(|x| match x {
|
||||
Some(x) => Self::Set(x),
|
||||
None => Self::Reset, // Reset is forced by sending null value
|
||||
@ -141,11 +148,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
}
|
||||
|
||||
pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
|
||||
self.stop_words = if stop_words.is_empty() {
|
||||
Setting::Reset
|
||||
} else {
|
||||
Setting::Set(stop_words)
|
||||
}
|
||||
self.stop_words =
|
||||
if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) }
|
||||
}
|
||||
|
||||
pub fn reset_distinct_field(&mut self) {
|
||||
@ -161,11 +165,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
}
|
||||
|
||||
pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
|
||||
self.synonyms = if synonyms.is_empty() {
|
||||
Setting::Reset
|
||||
} else {
|
||||
Setting::Set(synonyms)
|
||||
}
|
||||
self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
|
||||
}
|
||||
|
||||
pub fn reset_primary_key(&mut self) {
|
||||
@ -178,7 +178,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
|
||||
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep, u64) + Sync
|
||||
F: Fn(UpdateIndexingStep, u64) + Sync,
|
||||
{
|
||||
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
let update_id = self.update_id;
|
||||
@ -203,7 +203,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
};
|
||||
|
||||
// There already has been a document addition, the primary key should be set by now.
|
||||
let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?;
|
||||
let primary_key =
|
||||
self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?;
|
||||
|
||||
// We remap the documents fields based on the new `FieldsIdsMap`.
|
||||
let output = transform.remap_index_documents(
|
||||
@ -236,21 +237,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
Setting::Set(ref fields) => {
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
// fields are deduplicated, only the first occurrence is taken into account
|
||||
let names: Vec<_> = fields
|
||||
.iter()
|
||||
.unique()
|
||||
.map(String::as_str)
|
||||
.collect();
|
||||
let names: Vec<_> = fields.iter().unique().map(String::as_str).collect();
|
||||
|
||||
for name in names.iter() {
|
||||
fields_ids_map
|
||||
.insert(name)
|
||||
.ok_or(UserError::AttributeLimitReached)?;
|
||||
fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
|
||||
}
|
||||
self.index.put_displayed_fields(self.wtxn, &names)?;
|
||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||
}
|
||||
Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; }
|
||||
Setting::Reset => {
|
||||
self.index.delete_displayed_fields(self.wtxn)?;
|
||||
}
|
||||
Setting::NotSet => return Ok(false),
|
||||
}
|
||||
Ok(true)
|
||||
@ -260,14 +257,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
match self.distinct_field {
|
||||
Setting::Set(ref attr) => {
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
fields_ids_map
|
||||
.insert(attr)
|
||||
.ok_or(UserError::AttributeLimitReached)?;
|
||||
fields_ids_map.insert(attr).ok_or(UserError::AttributeLimitReached)?;
|
||||
|
||||
self.index.put_distinct_field(self.wtxn, &attr)?;
|
||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||
}
|
||||
Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; },
|
||||
Setting::Reset => {
|
||||
self.index.delete_distinct_field(self.wtxn)?;
|
||||
}
|
||||
Setting::NotSet => return Ok(false),
|
||||
}
|
||||
Ok(true)
|
||||
@ -285,30 +282,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
|
||||
let mut new_fields_ids_map = FieldsIdsMap::new();
|
||||
// fields are deduplicated, only the first occurrence is taken into account
|
||||
let names = fields
|
||||
.iter()
|
||||
.unique()
|
||||
.map(String::as_str)
|
||||
.collect::<Vec<_>>();
|
||||
let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>();
|
||||
|
||||
// Add all the searchable attributes to the field map, and then add the
|
||||
// remaining fields from the old field map to the new one
|
||||
for name in names.iter() {
|
||||
new_fields_ids_map
|
||||
.insert(&name)
|
||||
.ok_or(UserError::AttributeLimitReached)?;
|
||||
new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
|
||||
}
|
||||
|
||||
for (_, name) in old_fields_ids_map.iter() {
|
||||
new_fields_ids_map
|
||||
.insert(&name)
|
||||
.ok_or(UserError::AttributeLimitReached)?;
|
||||
new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
|
||||
}
|
||||
|
||||
self.index.put_searchable_fields(self.wtxn, &names)?;
|
||||
self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
|
||||
}
|
||||
Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; }
|
||||
Setting::Reset => {
|
||||
self.index.delete_searchable_fields(self.wtxn)?;
|
||||
}
|
||||
Setting::NotSet => return Ok(false),
|
||||
}
|
||||
Ok(true)
|
||||
@ -323,7 +314,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
let fst = fst::Set::from_iter(stop_words)?;
|
||||
|
||||
// Does the new FST differ from the previous one?
|
||||
if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
|
||||
if current
|
||||
.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes())
|
||||
{
|
||||
// we want to re-create our FST.
|
||||
self.index.put_stop_words(self.wtxn, &fst)?;
|
||||
Ok(true)
|
||||
@ -343,9 +336,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
analyzer
|
||||
.analyze(text)
|
||||
.tokens()
|
||||
.filter_map(|token|
|
||||
if token.is_word() { Some(token.text().to_string()) } else { None }
|
||||
)
|
||||
.filter_map(|token| {
|
||||
if token.is_word() {
|
||||
Some(token.text().to_string())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
@ -360,25 +357,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
for (word, synonyms) in synonyms {
|
||||
// Normalize both the word and associated synonyms.
|
||||
let normalized_word = normalize(&analyzer, word);
|
||||
let normalized_synonyms = synonyms
|
||||
.iter()
|
||||
.map(|synonym| normalize(&analyzer, synonym));
|
||||
let normalized_synonyms =
|
||||
synonyms.iter().map(|synonym| normalize(&analyzer, synonym));
|
||||
|
||||
// Store the normalized synonyms under the normalized word,
|
||||
// merging the possible duplicate words.
|
||||
let entry = new_synonyms
|
||||
.entry(normalized_word)
|
||||
.or_insert_with(Vec::new);
|
||||
let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
|
||||
entry.extend(normalized_synonyms);
|
||||
}
|
||||
|
||||
// Make sure that we don't have duplicate synonyms.
|
||||
new_synonyms
|
||||
.iter_mut()
|
||||
.for_each(|(_, synonyms)| {
|
||||
synonyms.sort_unstable();
|
||||
synonyms.dedup();
|
||||
});
|
||||
new_synonyms.iter_mut().for_each(|(_, synonyms)| {
|
||||
synonyms.sort_unstable();
|
||||
synonyms.dedup();
|
||||
});
|
||||
|
||||
let old_synonyms = self.index.synonyms(self.wtxn)?;
|
||||
|
||||
@ -406,7 +398,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
self.index.put_filterable_fields(self.wtxn, &new_facets)?;
|
||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||
}
|
||||
Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; }
|
||||
Setting::Reset => {
|
||||
self.index.delete_filterable_fields(self.wtxn)?;
|
||||
}
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
Ok(())
|
||||
@ -427,7 +421,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
self.index.put_criteria(self.wtxn, &new_criteria)?;
|
||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||
}
|
||||
Setting::Reset => { self.index.delete_criteria(self.wtxn)?; }
|
||||
Setting::Reset => {
|
||||
self.index.delete_criteria(self.wtxn)?;
|
||||
}
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
Ok(())
|
||||
@ -445,7 +441,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
} else {
|
||||
Err(UserError::PrimaryKeyCannotBeChanged.into())
|
||||
}
|
||||
},
|
||||
}
|
||||
Setting::Reset => {
|
||||
if self.index.number_of_documents(&self.wtxn)? == 0 {
|
||||
self.index.delete_primary_key(self.wtxn)?;
|
||||
@ -453,14 +449,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
} else {
|
||||
Err(UserError::PrimaryKeyCannotBeReset.into())
|
||||
}
|
||||
},
|
||||
}
|
||||
Setting::NotSet => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep, u64) + Sync
|
||||
where
|
||||
F: Fn(UpdateIndexingStep, u64) + Sync,
|
||||
{
|
||||
self.index.set_updated_at(self.wtxn, &Utc::now())?;
|
||||
|
||||
@ -493,17 +489,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use heed::EnvOpenOptions;
|
||||
use heed::types::ByteSlice;
|
||||
use maplit::{btreeset, hashmap, hashset};
|
||||
use big_s::S;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::EnvOpenOptions;
|
||||
use maplit::{btreeset, hashmap, hashset};
|
||||
|
||||
use super::*;
|
||||
use crate::error::Error;
|
||||
use crate::update::{IndexDocuments, UpdateFormat};
|
||||
use crate::{Criterion, FilterCondition, SearchResult};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn set_and_reset_searchable_fields() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
@ -674,7 +669,7 @@ mod tests {
|
||||
// Set the filterable fields to be the age.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, 0);
|
||||
builder.set_filterable_fields(hashset!{ S("age") });
|
||||
builder.set_filterable_fields(hashset! { S("age") });
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
// Then index some documents.
|
||||
@ -692,12 +687,15 @@ mod tests {
|
||||
// Check that the displayed fields are correctly set.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids = index.filterable_fields(&rtxn).unwrap();
|
||||
assert_eq!(fields_ids, hashset!{ S("age") });
|
||||
assert_eq!(fields_ids, hashset! { S("age") });
|
||||
// Only count the field_id 0 and level 0 facet values.
|
||||
// TODO we must support typed CSVs for numbers to be understood.
|
||||
let count = index.facet_id_f64_docids
|
||||
let count = index
|
||||
.facet_id_f64_docids
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
|
||||
.prefix_iter(&rtxn, &[0, 0])
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 3);
|
||||
drop(rtxn);
|
||||
|
||||
@ -718,9 +716,12 @@ mod tests {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
// Only count the field_id 0 and level 0 facet values.
|
||||
// TODO we must support typed CSVs for numbers to be understood.
|
||||
let count = index.facet_id_f64_docids
|
||||
let count = index
|
||||
.facet_id_f64_docids
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
|
||||
.prefix_iter(&rtxn, &[0, 0])
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 4);
|
||||
}
|
||||
|
||||
@ -969,7 +970,7 @@ mod tests {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, 0);
|
||||
builder.set_displayed_fields(vec!["hello".to_string()]);
|
||||
builder.set_filterable_fields(hashset!{ S("age"), S("toto") });
|
||||
builder.set_filterable_fields(hashset! { S("age"), S("toto") });
|
||||
builder.set_criteria(vec!["asc(toto)".to_string()]);
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -1,8 +1,8 @@
|
||||
use grenad::CompressionType;
|
||||
use rayon::ThreadPool;
|
||||
|
||||
use super::{ClearDocuments, DeleteDocuments, Facets, IndexDocuments, Settings};
|
||||
use crate::{Index, Result};
|
||||
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
|
||||
|
||||
pub struct UpdateBuilder<'a> {
|
||||
pub(crate) log_every_n: Option<usize>,
|
||||
@ -67,8 +67,7 @@ impl<'a> UpdateBuilder<'a> {
|
||||
self,
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> ClearDocuments<'t, 'u, 'i>
|
||||
{
|
||||
) -> ClearDocuments<'t, 'u, 'i> {
|
||||
ClearDocuments::new(wtxn, index, self.update_id)
|
||||
}
|
||||
|
||||
@ -76,8 +75,7 @@ impl<'a> UpdateBuilder<'a> {
|
||||
self,
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> Result<DeleteDocuments<'t, 'u, 'i>>
|
||||
{
|
||||
) -> Result<DeleteDocuments<'t, 'u, 'i>> {
|
||||
DeleteDocuments::new(wtxn, index, self.update_id)
|
||||
}
|
||||
|
||||
@ -85,8 +83,7 @@ impl<'a> UpdateBuilder<'a> {
|
||||
self,
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> IndexDocuments<'t, 'u, 'i, 'a>
|
||||
{
|
||||
) -> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
let mut builder = IndexDocuments::new(wtxn, index, self.update_id);
|
||||
|
||||
builder.log_every_n = self.log_every_n;
|
||||
@ -105,8 +102,7 @@ impl<'a> UpdateBuilder<'a> {
|
||||
self,
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> Settings<'a, 't, 'u, 'i>
|
||||
{
|
||||
) -> Settings<'a, 't, 'u, 'i> {
|
||||
let mut builder = Settings::new(wtxn, index, self.update_id);
|
||||
|
||||
builder.log_every_n = self.log_every_n;
|
||||
@ -125,8 +121,7 @@ impl<'a> UpdateBuilder<'a> {
|
||||
self,
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> Facets<'t, 'u, 'i>
|
||||
{
|
||||
) -> Facets<'t, 'u, 'i> {
|
||||
let mut builder = Facets::new(wtxn, index, self.update_id);
|
||||
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
|
@ -1,15 +1,13 @@
|
||||
use std::str;
|
||||
|
||||
use crate::Index;
|
||||
use fst::Streamer;
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
|
||||
use crate::Result;
|
||||
use crate::update::index_documents::WriteMethod;
|
||||
use crate::update::index_documents::{
|
||||
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database,
|
||||
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod,
|
||||
};
|
||||
use crate::{Index, Result};
|
||||
|
||||
pub struct WordPrefixDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -22,7 +20,10 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> WordPrefixDocids<'t, 'u, 'i> {
|
||||
WordPrefixDocids {
|
||||
wtxn,
|
||||
index,
|
||||
|
@ -1,18 +1,17 @@
|
||||
use std::str;
|
||||
|
||||
use fst::automaton::{Automaton, Str};
|
||||
use fst::{Streamer, IntoStreamer};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use grenad::CompressionType;
|
||||
use heed::BytesEncode;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::BytesEncode;
|
||||
use log::debug;
|
||||
|
||||
use crate::{Index, Result};
|
||||
use crate::heed_codec::StrStrU8Codec;
|
||||
use crate::update::index_documents::{
|
||||
WriteMethod, create_sorter, sorter_into_lmdb_database,
|
||||
cbo_roaring_bitmap_merge,
|
||||
cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod,
|
||||
};
|
||||
use crate::{Index, Result};
|
||||
|
||||
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -28,8 +27,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> WordPrefixPairProximityDocids<'t, 'u, 'i>
|
||||
{
|
||||
) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
WordPrefixPairProximityDocids {
|
||||
wtxn,
|
||||
index,
|
||||
|
@ -1,25 +1,23 @@
|
||||
use std::{cmp, str};
|
||||
use std::convert::TryFrom;
|
||||
use std::fs::File;
|
||||
use std::num::NonZeroU32;
|
||||
use std::{cmp, str};
|
||||
|
||||
use fst::automaton::{self, Automaton};
|
||||
use fst::{Streamer, IntoStreamer};
|
||||
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use grenad::{CompressionType, FileFuse, Reader, Writer};
|
||||
use heed::types::{ByteSlice, DecodeIgnore, Str};
|
||||
use heed::{BytesEncode, Error};
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::InternalError;
|
||||
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
|
||||
use crate::Result;
|
||||
use crate::update::index_documents::WriteMethod;
|
||||
use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec};
|
||||
use crate::update::index_documents::{
|
||||
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
|
||||
cbo_roaring_bitmap_merge, sorter_into_lmdb_database
|
||||
cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database,
|
||||
write_into_lmdb_database, writer_into_reader, WriteMethod,
|
||||
};
|
||||
use crate::{Index, TreeLevel};
|
||||
use crate::{Index, Result, TreeLevel};
|
||||
|
||||
pub struct WordsLevelPositions<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -34,7 +32,10 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> WordsLevelPositions<'t, 'u, 'i> {
|
||||
WordsLevelPositions {
|
||||
wtxn,
|
||||
index,
|
||||
@ -144,7 +145,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_level_position_docids.as_polymorph(),
|
||||
entries,
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }),
|
||||
|_, _| {
|
||||
Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })
|
||||
},
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
@ -176,13 +179,11 @@ fn compute_positions_levels(
|
||||
shrink_size: Option<u64>,
|
||||
level_group_size: NonZeroU32,
|
||||
min_level_size: NonZeroU32,
|
||||
) -> Result<Reader<FileFuse>>
|
||||
{
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
|
||||
// therefore we write the facet levels entries into a grenad file before transfering them.
|
||||
let mut writer = tempfile::tempfile().and_then(|file| {
|
||||
create_writer(compression_type, compression_level, file)
|
||||
})?;
|
||||
let mut writer = tempfile::tempfile()
|
||||
.and_then(|file| create_writer(compression_type, compression_level, file))?;
|
||||
|
||||
for result in words_db.iter(rtxn)? {
|
||||
let (word, ()) = result?;
|
||||
@ -193,7 +194,8 @@ fn compute_positions_levels(
|
||||
left..=right
|
||||
};
|
||||
|
||||
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
|
||||
let first_level_size = words_positions_db
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.range(rtxn, &level_0_range)?
|
||||
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
|
||||
|
||||
@ -253,8 +255,7 @@ fn write_level_entry(
|
||||
left: u32,
|
||||
right: u32,
|
||||
ids: &RoaringBitmap,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
let key = (word, level, left, right);
|
||||
let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?;
|
||||
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;
|
||||
|
@ -2,7 +2,8 @@ use std::iter::FromIterator;
|
||||
use std::str;
|
||||
|
||||
use fst::Streamer;
|
||||
use crate::{Index, SmallString32, Result};
|
||||
|
||||
use crate::{Index, Result, SmallString32};
|
||||
|
||||
pub struct WordsPrefixesFst<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -17,8 +18,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
update_id: u64,
|
||||
) -> WordsPrefixesFst<'t, 'u, 'i>
|
||||
{
|
||||
) -> WordsPrefixesFst<'t, 'u, 'i> {
|
||||
WordsPrefixesFst {
|
||||
wtxn,
|
||||
index,
|
||||
@ -55,7 +55,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
|
||||
|
||||
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
|
||||
for n in 1..=self.max_prefix_length {
|
||||
|
||||
let mut current_prefix = SmallString32::new();
|
||||
let mut current_prefix_count = 0;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
@ -1,9 +1,8 @@
|
||||
use milli::{Criterion, Index, DocumentId};
|
||||
use milli::update::{IndexDocuments, UpdateFormat, Settings};
|
||||
|
||||
use big_s::S;
|
||||
use heed::EnvOpenOptions;
|
||||
use maplit::{hashmap, hashset};
|
||||
use milli::update::{IndexDocuments, Settings, UpdateFormat};
|
||||
use milli::{Criterion, DocumentId, Index};
|
||||
use serde::Deserialize;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
@ -11,7 +10,8 @@ mod query_criteria;
|
||||
|
||||
pub const TEST_QUERY: &'static str = "hello world america";
|
||||
|
||||
pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"];
|
||||
pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] =
|
||||
&["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"];
|
||||
|
||||
pub const CONTENT: &str = include_str!("../assets/test_set.ndjson");
|
||||
|
||||
@ -27,16 +27,16 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
|
||||
let criteria = criteria.iter().map(|c| c.to_string()).collect();
|
||||
builder.set_criteria(criteria);
|
||||
builder.set_filterable_fields(hashset!{
|
||||
builder.set_filterable_fields(hashset! {
|
||||
S("tag"),
|
||||
S("asc_desc_rank"),
|
||||
});
|
||||
builder.set_synonyms(hashmap!{
|
||||
builder.set_synonyms(hashmap! {
|
||||
S("hello") => vec![S("good morning")],
|
||||
S("world") => vec![S("earth")],
|
||||
S("america") => vec![S("the united states")],
|
||||
});
|
||||
builder.set_searchable_fields(vec![S("title"),S("description")]);
|
||||
builder.set_searchable_fields(vec![S("title"), S("description")]);
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
// index documents
|
||||
@ -53,12 +53,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> {
|
||||
let mut rtxn = index.read_txn().unwrap();
|
||||
let docid_map = index.external_documents_ids(&mut rtxn).unwrap();
|
||||
let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect();
|
||||
let docid_map: std::collections::HashMap<_, _> =
|
||||
EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect();
|
||||
internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect()
|
||||
}
|
||||
|
||||
pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_words: bool) -> Vec<TestDocument> {
|
||||
let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
|
||||
pub fn expected_order(
|
||||
criteria: &[Criterion],
|
||||
authorize_typo: bool,
|
||||
optional_words: bool,
|
||||
) -> Vec<TestDocument> {
|
||||
let dataset =
|
||||
serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
|
||||
let mut groups: Vec<Vec<TestDocument>> = vec![dataset];
|
||||
|
||||
for criterion in criteria {
|
||||
@ -67,32 +73,36 @@ pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_wor
|
||||
match criterion {
|
||||
Criterion::Attribute => {
|
||||
group.sort_by_key(|d| d.attribute_rank);
|
||||
new_groups.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from));
|
||||
},
|
||||
new_groups
|
||||
.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from));
|
||||
}
|
||||
Criterion::Exactness => {
|
||||
group.sort_by_key(|d| d.exact_rank);
|
||||
new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from));
|
||||
},
|
||||
}
|
||||
Criterion::Proximity => {
|
||||
group.sort_by_key(|d| d.proximity_rank);
|
||||
new_groups.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from));
|
||||
},
|
||||
new_groups
|
||||
.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from));
|
||||
}
|
||||
Criterion::Typo => {
|
||||
group.sort_by_key(|d| d.typo_rank);
|
||||
new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from));
|
||||
},
|
||||
}
|
||||
Criterion::Words => {
|
||||
group.sort_by_key(|d| d.word_rank);
|
||||
new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from));
|
||||
},
|
||||
}
|
||||
Criterion::Asc(field_name) if field_name == "asc_desc_rank" => {
|
||||
group.sort_by_key(|d| d.asc_desc_rank);
|
||||
new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
|
||||
},
|
||||
Criterion::Desc(field_name) if field_name == "asc_desc_rank" => {
|
||||
new_groups
|
||||
.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
|
||||
}
|
||||
Criterion::Desc(field_name) if field_name == "asc_desc_rank" => {
|
||||
group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank));
|
||||
new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
|
||||
},
|
||||
new_groups
|
||||
.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
|
||||
}
|
||||
Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()),
|
||||
}
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
use big_s::S;
|
||||
use milli::update::Settings;
|
||||
use milli::{Search, SearchResult, Criterion};
|
||||
use milli::{Criterion, Search, SearchResult};
|
||||
use Criterion::*;
|
||||
|
||||
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
|
||||
use Criterion::*;
|
||||
|
||||
const ALLOW_TYPOS: bool = true;
|
||||
const DISALLOW_TYPOS: bool = false;
|
||||
@ -35,29 +35,54 @@ macro_rules! test_criterion {
|
||||
}
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity);
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank")));
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank")));
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank")));
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank")));
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field")));
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field")));
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field")));
|
||||
#[rustfmt::skip]
|
||||
test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field")));
|
||||
|
||||
#[test]
|
||||
fn criteria_mixup() {
|
||||
use Criterion::*;
|
||||
let index = search::setup_search_index_with_criteria(&vec![Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo]);
|
||||
let index = search::setup_search_index_with_criteria(&vec![
|
||||
Words,
|
||||
Attribute,
|
||||
Desc(S("asc_desc_rank")),
|
||||
Exactness,
|
||||
Proximity,
|
||||
Typo,
|
||||
]);
|
||||
|
||||
#[rustfmt::skip]
|
||||
let criteria_mix = {
|
||||
// Criterion doesn't implement Copy, we create a new Criterion using a closure
|
||||
let desc = || Desc(S("asc_desc_rank"));
|
||||
@ -205,10 +230,11 @@ fn criteria_mixup() {
|
||||
|
||||
let SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||
|
||||
let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS)
|
||||
.into_iter()
|
||||
.map(|d| d.id)
|
||||
.collect();
|
||||
let expected_external_ids: Vec<_> =
|
||||
search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS)
|
||||
.into_iter()
|
||||
.map(|d| d.id)
|
||||
.collect();
|
||||
let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
|
||||
|
||||
assert_eq!(documents_ids, expected_external_ids);
|
||||
|
36
script/pre-commit
Executable file
36
script/pre-commit
Executable file
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
cargo check --workspace --all-targets &>/dev/null
|
||||
result=$?
|
||||
|
||||
if [[ ${result} -ne 0 ]] ; then
|
||||
cat <<\EOF
|
||||
The project does not compile. You might want to fix your error before committing.
|
||||
|
||||
If you still want to commit you can do it by appending
|
||||
--no-verify
|
||||
at the end of your previous command.
|
||||
|
||||
If you are running a variant of bash you can directly paste this command in your terminal:
|
||||
!! --no-verify
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cargo fmt --all -- --check &>/dev/null
|
||||
result=$?
|
||||
|
||||
if [[ ${result} -ne 0 ]] ; then
|
||||
cat <<\EOF
|
||||
The project is badly formatted. Please run:
|
||||
cargo fmt --all
|
||||
|
||||
If you want to create your commit without proper formatting you can add
|
||||
--no-verify
|
||||
at the end of your commit.
|
||||
|
||||
If you are running a variant of bash you can directly paste this command in your terminal:
|
||||
!! --no-verify
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
@ -6,10 +6,9 @@ use std::time::Instant;
|
||||
use byte_unit::Byte;
|
||||
use heed::EnvOpenOptions;
|
||||
use log::debug;
|
||||
use milli::{obkv_to_json, Index};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use milli::{Index, obkv_to_json};
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
#[global_allocator]
|
||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
@ -86,7 +85,8 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
if opt.print_facet_distribution {
|
||||
let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?;
|
||||
let facets =
|
||||
index.facets_distribution(&rtxn).candidates(result.candidates).execute()?;
|
||||
serde_json::to_writer(&mut stdout, &facets)?;
|
||||
let _ = writeln!(&mut stdout);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user