236: Format the whole project r=Kerollmops a=irevoire

I need to add `cargo fmt` in the CI before closing #231

Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
bors[bot] 2021-06-16 18:05:40 +00:00 committed by GitHub
commit 1bcf43baac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
68 changed files with 3327 additions and 2336 deletions

5
.rustfmt.toml Normal file
View File

@ -0,0 +1,5 @@
unstable_features = true
use_small_heuristics = "max"
imports_granularity = "Module"
group_imports = "StdExternalCrate"

View File

@ -41,3 +41,18 @@ the `content-type:application/json` and `content-type:application/x-ndjson` head
### Querying the engine via the website
You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700).
## Contributing
You can setup a `git-hook` to stop you from making a commit too fast. It'll stop you if:
- Any of the workspaces does not build
- Your code is not well-formatted
These two things are also checked in the CI, so ignoring the hook won't help you merge your code.
But if you need to, you can still add `--no-verify` when creating your commit to ignore the hook.
To enable the hook, run the following command from the root of the project:
```
cp script/pre-commit .git/hooks/pre-commit
```

View File

@ -6,33 +6,24 @@ use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = [
"id", "title", "album", "artist", "genre", "country", "released", "duration",
]
let displayed_fields =
["id", "title", "album", "artist", "genre", "country", "released", "duration"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "album", "artist"]
.iter()
.map(|s| s.to_string())
.collect();
let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
builder.set_searchable_fields(searchable_fields);
let faceted_fields = [
"released-timestamp",
"duration-float",
"genre",
"country",
"artist",
]
let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_filterable_fields(faceted_fields);
}
#[rustfmt::skip]
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_SONGS,
queries: &[
@ -53,34 +44,25 @@ const BASE_CONF: Conf = Conf {
};
fn bench_songs(c: &mut criterion::Criterion) {
let default_criterion: Vec<String> = milli::default_criteria()
.iter()
.map(|criteria| criteria.to_string())
.collect();
let default_criterion: Vec<String> =
milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect();
let default_criterion = default_criterion.iter().map(|s| s.as_str());
let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let asc_default: Vec<&str> =
std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect();
let desc_default: Vec<&str> =
std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect();
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let basic_with_quote: &[&str] =
&basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
#[rustfmt::skip]
let confs = &[
/* first we bench each criterion alone */
utils::Conf {

View File

@ -3,10 +3,8 @@ use std::path::Path;
use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::{
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
FilterCondition, Index,
};
use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat};
use milli::{FilterCondition, Index};
pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory

View File

@ -6,16 +6,14 @@ use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = ["title", "body", "url"]
.iter()
.map(|s| s.to_string())
.collect();
let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
builder.set_searchable_fields(searchable_fields);
}
#[rustfmt::skip]
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_WIKI_ARTICLES,
queries: &[
@ -37,18 +35,13 @@ fn bench_songs(c: &mut criterion::Criterion) {
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let basic_with_quote: &[&str] =
&basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
#[rustfmt::skip]
let confs = &[
/* first we bench each criterion alone */
utils::Conf {

View File

@ -1,9 +1,7 @@
use std::fs::File;
use std::io::{Cursor, Read, Seek, Write};
use std::path::{Path, PathBuf};
use std::{env, fs};
use std::{
fs::File,
io::{Cursor, Read, Seek, Write},
};
use bytes::Bytes;
use convert_case::{Case, Casing};
@ -45,7 +43,10 @@ fn main() -> anyhow::Result<()> {
)?;
if out_file.exists() {
eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
eprintln!(
"The dataset {} already exists on the file system and will not be downloaded again",
dataset
);
continue;
}
let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
@ -60,12 +61,8 @@ fn main() -> anyhow::Result<()> {
}
fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
let bytes = reqwest::blocking::Client::builder()
.timeout(None)
.build()?
.get(url)
.send()?
.bytes()?;
let bytes =
reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?;
Ok(Cursor::new(bytes))
}

View File

@ -1,9 +1,8 @@
use std::path::PathBuf;
use byte_unit::Byte;
use heed::{Env, EnvOpenOptions, CompactionOption};
use heed::{CompactionOption, Env, EnvOpenOptions};
use structopt::StructOpt;
use Command::*;
#[cfg(target_os = "linux")]
@ -65,7 +64,7 @@ fn main() -> anyhow::Result<()> {
use CompactionOption::*;
let compaction = if enable_compaction { Enabled } else { Disabled };
copy_main_database_to_stdout(env, compaction)
},
}
}
}

View File

@ -1,6 +1,5 @@
mod update_store;
use std::{io, mem};
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fmt::Display;
use std::fs::{create_dir_all, File};
@ -10,16 +9,19 @@ use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant;
use std::{io, mem};
use askama_warp::Template;
use byte_unit::Byte;
use either::Either;
use flate2::read::GzDecoder;
use futures::{FutureExt, StreamExt};
use futures::stream;
use futures::{stream, FutureExt, StreamExt};
use grenad::CompressionType;
use heed::EnvOpenOptions;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use milli::update::UpdateIndexingStep::*;
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
use milli::{obkv_to_json, FilterCondition, Index, MatchingWords, SearchResult};
use once_cell::sync::OnceCell;
use rayon::ThreadPool;
use serde::{Deserialize, Serialize};
@ -28,12 +30,9 @@ use structopt::StructOpt;
use tokio::fs::File as TFile;
use tokio::io::AsyncWriteExt;
use tokio::sync::broadcast;
use warp::{Filter, http::Response};
use warp::filters::ws::Message;
use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult};
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
use milli::update::UpdateIndexingStep::*;
use warp::http::Response;
use warp::Filter;
use self::update_store::UpdateStore;
@ -149,25 +148,28 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
for (word, token) in analyzed.reconstruct() {
if token.is_word() {
let to_highlight = matching_words.matching_bytes(token.text()).is_some();
if to_highlight { string.push_str("<mark>") }
if to_highlight {
string.push_str("<mark>")
}
string.push_str(word);
if to_highlight { string.push_str("</mark>") }
if to_highlight {
string.push_str("</mark>")
}
} else {
string.push_str(word);
}
}
Value::String(string)
}
Value::Array(values) => {
Value::Array(values.into_iter()
.map(|v| self.highlight_value(v, matching_words))
.collect())
}
Value::Object(object) => {
Value::Object(object.into_iter()
Value::Array(values) => Value::Array(
values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(),
),
Value::Object(object) => Value::Object(
object
.into_iter()
.map(|(k, v)| (k, self.highlight_value(v, matching_words)))
.collect())
}
.collect(),
),
}
}
@ -236,12 +238,7 @@ enum UpdateMeta {
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
enum UpdateMetaProgress {
DocumentsAddition {
step: usize,
total_steps: usize,
current: usize,
total: Option<usize>,
},
DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option<usize> },
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@ -342,11 +339,13 @@ async fn main() -> anyhow::Result<()> {
update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize);
update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size);
update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type);
update_builder.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes());
update_builder
.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes());
let before_update = Instant::now();
// we extract the update type and execute the update itself.
let result: anyhow::Result<()> = match meta {
let result: anyhow::Result<()> =
match meta {
UpdateMeta::DocumentsAddition { method, format, encoding } => {
// We must use the write transaction of the update here.
let mut wtxn = index_cloned.write_txn()?;
@ -360,8 +359,10 @@ async fn main() -> anyhow::Result<()> {
};
match method.as_str() {
"replace" => builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments),
"update" => builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments),
"replace" => builder
.index_documents_method(IndexDocumentsMethod::ReplaceDocuments),
"update" => builder
.index_documents_method(IndexDocumentsMethod::UpdateDocuments),
otherwise => panic!("invalid indexing method {:?}", otherwise),
};
@ -373,10 +374,18 @@ async fn main() -> anyhow::Result<()> {
let result = builder.execute(reader, |indexing_step, update_id| {
let (current, total) = match indexing_step {
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
TransformFromUserIntoGenericFormat { documents_seen } => {
(documents_seen, None)
}
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
IndexDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
(databases_seen, Some(total_databases))
}
};
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
update_id,
@ -411,52 +420,66 @@ async fn main() -> anyhow::Result<()> {
// We transpose the settings JSON struct into a real setting update.
match settings.searchable_attributes {
Setting::Set(searchable_attributes) => builder.set_searchable_fields(searchable_attributes),
Setting::Set(searchable_attributes) => {
builder.set_searchable_fields(searchable_attributes)
}
Setting::Reset => builder.reset_searchable_fields(),
Setting::NotSet => ()
Setting::NotSet => (),
}
// We transpose the settings JSON struct into a real setting update.
match settings.displayed_attributes {
Setting::Set(displayed_attributes) => builder.set_displayed_fields(displayed_attributes),
Setting::Set(displayed_attributes) => {
builder.set_displayed_fields(displayed_attributes)
}
Setting::Reset => builder.reset_displayed_fields(),
Setting::NotSet => ()
Setting::NotSet => (),
}
// We transpose the settings JSON struct into a real setting update.
match settings.filterable_attributes {
Setting::Set(filterable_attributes) => builder.set_filterable_fields(filterable_attributes),
Setting::Set(filterable_attributes) => {
builder.set_filterable_fields(filterable_attributes)
}
Setting::Reset => builder.reset_filterable_fields(),
Setting::NotSet => ()
Setting::NotSet => (),
}
// We transpose the settings JSON struct into a real setting update.
match settings.criteria {
Setting::Set(criteria) => builder.set_criteria(criteria),
Setting::Reset => builder.reset_criteria(),
Setting::NotSet => ()
Setting::NotSet => (),
}
// We transpose the settings JSON struct into a real setting update.
match settings.stop_words {
Setting::Set(stop_words) => builder.set_stop_words(stop_words),
Setting::Reset => builder.reset_stop_words(),
Setting::NotSet => ()
Setting::NotSet => (),
}
// We transpose the settings JSON struct into a real setting update.
match settings.synonyms {
Setting::Set(synonyms) => builder.set_synonyms(synonyms),
Setting::Reset => builder.reset_synonyms(),
Setting::NotSet => ()
Setting::NotSet => (),
}
let result = builder.execute(|indexing_step, update_id| {
let (current, total) = match indexing_step {
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
TransformFromUserIntoGenericFormat { documents_seen } => {
(documents_seen, None)
}
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
IndexDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
(databases_seen, Some(total_databases))
}
};
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
update_id,
@ -492,7 +515,9 @@ async fn main() -> anyhow::Result<()> {
};
let meta = match result {
Ok(()) => format!("valid update content processed in {:.02?}", before_update.elapsed()),
Ok(()) => {
format!("valid update content processed in {:.02?}", before_update.elapsed())
}
Err(e) => format!("error while processing update content: {:?}", e),
};
@ -500,7 +525,8 @@ async fn main() -> anyhow::Result<()> {
let _ = update_status_sender_cloned.send(processed);
Ok(meta)
})?;
},
)?;
// The database name will not change.
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
@ -512,15 +538,11 @@ async fn main() -> anyhow::Result<()> {
let db_name_cloned = db_name.clone();
let lmdb_path_cloned = lmdb_path.clone();
let index_cloned = index.clone();
let dash_html_route = warp::filters::method::get()
.and(warp::filters::path::end())
.map(move || {
let dash_html_route =
warp::filters::method::get().and(warp::filters::path::end()).map(move || {
// We retrieve the database size.
let db_size = File::open(lmdb_path_cloned.clone())
.unwrap()
.metadata()
.unwrap()
.len() as usize;
let db_size =
File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() as usize;
// And the number of documents in the database.
let rtxn = index_cloned.read_txn().unwrap();
@ -537,7 +559,8 @@ async fn main() -> anyhow::Result<()> {
.and(warp::path!("updates"))
.map(move |header: String| {
let update_store = update_store_cloned.clone();
let mut updates = update_store.iter_metas(|processed, aborted, pending| {
let mut updates = update_store
.iter_metas(|processed, aborted, pending| {
let mut updates = Vec::<UpdateStatus<_, UpdateMetaProgress, _>>::new();
for result in processed {
let (uid, meta) = result?;
@ -552,96 +575,89 @@ async fn main() -> anyhow::Result<()> {
updates.push(UpdateStatus::Pending { update_id: uid.get(), meta });
}
Ok(updates)
}).unwrap();
})
.unwrap();
updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse());
if header.contains("text/html") {
// We retrieve the database size.
let db_size = File::open(lmdb_path_cloned.clone())
.unwrap()
.metadata()
.unwrap()
.len() as usize;
let db_size =
File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len()
as usize;
// And the number of documents in the database.
let rtxn = index_cloned.read_txn().unwrap();
let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize;
let template = UpdatesTemplate {
db_name: db_name.clone(),
db_size,
docs_count,
updates,
};
let template =
UpdatesTemplate { db_name: db_name.clone(), db_size, docs_count, updates };
Box::new(template) as Box<dyn warp::Reply>
} else {
Box::new(warp::reply::json(&updates))
}
});
let dash_bulma_route = warp::filters::method::get()
.and(warp::path!("bulma.min.css"))
.map(|| Response::builder()
let dash_bulma_route =
warp::filters::method::get().and(warp::path!("bulma.min.css")).map(|| {
Response::builder()
.header("content-type", "text/css; charset=utf-8")
.body(include_str!("../public/bulma.min.css"))
);
});
let dash_bulma_dark_route = warp::filters::method::get()
.and(warp::path!("bulma-prefers-dark.min.css"))
.map(|| Response::builder()
let dash_bulma_dark_route =
warp::filters::method::get().and(warp::path!("bulma-prefers-dark.min.css")).map(|| {
Response::builder()
.header("content-type", "text/css; charset=utf-8")
.body(include_str!("../public/bulma-prefers-dark.min.css"))
);
});
let dash_style_route = warp::filters::method::get()
.and(warp::path!("style.css"))
.map(|| Response::builder()
let dash_style_route = warp::filters::method::get().and(warp::path!("style.css")).map(|| {
Response::builder()
.header("content-type", "text/css; charset=utf-8")
.body(include_str!("../public/style.css"))
);
});
let dash_jquery_route = warp::filters::method::get()
.and(warp::path!("jquery-3.4.1.min.js"))
.map(|| Response::builder()
let dash_jquery_route =
warp::filters::method::get().and(warp::path!("jquery-3.4.1.min.js")).map(|| {
Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../public/jquery-3.4.1.min.js"))
);
});
let dash_filesize_route = warp::filters::method::get()
.and(warp::path!("filesize.min.js"))
.map(|| Response::builder()
let dash_filesize_route =
warp::filters::method::get().and(warp::path!("filesize.min.js")).map(|| {
Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../public/filesize.min.js"))
);
});
let dash_script_route = warp::filters::method::get()
.and(warp::path!("script.js"))
.map(|| Response::builder()
let dash_script_route = warp::filters::method::get().and(warp::path!("script.js")).map(|| {
Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../public/script.js"))
);
});
let updates_script_route = warp::filters::method::get()
.and(warp::path!("updates-script.js"))
.map(|| Response::builder()
let updates_script_route =
warp::filters::method::get().and(warp::path!("updates-script.js")).map(|| {
Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../public/updates-script.js"))
);
});
let dash_logo_white_route = warp::filters::method::get()
.and(warp::path!("logo-white.svg"))
.map(|| Response::builder()
let dash_logo_white_route =
warp::filters::method::get().and(warp::path!("logo-white.svg")).map(|| {
Response::builder()
.header("content-type", "image/svg+xml")
.body(include_str!("../public/logo-white.svg"))
);
});
let dash_logo_black_route = warp::filters::method::get()
.and(warp::path!("logo-black.svg"))
.map(|| Response::builder()
let dash_logo_black_route =
warp::filters::method::get().and(warp::path!("logo-black.svg")).map(|| {
Response::builder()
.header("content-type", "image/svg+xml")
.body(include_str!("../public/logo-black.svg"))
);
});
#[derive(Debug, Deserialize)]
#[serde(untagged)]
@ -719,7 +735,8 @@ async fn main() -> anyhow::Result<()> {
search.filter(condition);
}
let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap();
let SearchResult { matching_words, candidates, documents_ids } =
search.execute().unwrap();
let number_of_candidates = candidates.len();
let facets = if query.facet_distribution == Some(true) {
@ -745,17 +762,18 @@ async fn main() -> anyhow::Result<()> {
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
if !disable_highlighting {
highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight);
highlighter.highlight_record(
&mut object,
&matching_words,
&attributes_to_highlight,
);
}
documents.push(object);
}
let answer = Answer {
documents,
number_of_candidates,
facets: facets.unwrap_or_default(),
};
let answer =
Answer { documents, number_of_candidates, facets: facets.unwrap_or_default() };
Response::builder()
.header("Content-Type", "application/json")
@ -764,9 +782,8 @@ async fn main() -> anyhow::Result<()> {
});
let index_cloned = index.clone();
let document_route = warp::filters::method::get()
.and(warp::path!("document" / String))
.map(move |id: String| {
let document_route = warp::filters::method::get().and(warp::path!("document" / String)).map(
move |id: String| {
let index = index_cloned.clone();
let rtxn = index.read_txn().unwrap();
@ -780,30 +797,31 @@ async fn main() -> anyhow::Result<()> {
match external_documents_ids.get(&id) {
Some(document_id) => {
let document_id = document_id as u32;
let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap();
let (_, obkv) =
index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap();
let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
Response::builder()
.header("Content-Type", "application/json")
.body(serde_json::to_string(&document).unwrap())
}
None => {
Response::builder()
None => Response::builder()
.status(404)
.body(format!("Document with id {:?} not found.", id))
.body(format!("Document with id {:?} not found.", id)),
}
}
});
},
);
async fn buf_stream(
update_store: Arc<UpdateStore<UpdateMeta, String>>,
update_status_sender: broadcast::Sender<UpdateStatus<UpdateMeta, UpdateMetaProgress, String>>,
update_status_sender: broadcast::Sender<
UpdateStatus<UpdateMeta, UpdateMetaProgress, String>,
>,
update_method: Option<String>,
update_format: UpdateFormat,
encoding: Option<String>,
mut stream: impl futures::Stream<Item=Result<impl bytes::Buf, warp::Error>> + Unpin,
) -> Result<impl warp::Reply, warp::Rejection>
{
mut stream: impl futures::Stream<Item = Result<impl bytes::Buf, warp::Error>> + Unpin,
) -> Result<impl warp::Reply, warp::Rejection> {
let file = tokio::task::block_in_place(tempfile::tempfile).unwrap();
let mut file = TFile::from_std(file);
@ -869,9 +887,8 @@ async fn main() -> anyhow::Result<()> {
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let clearing_route = warp::filters::method::post()
.and(warp::path!("clear-documents"))
.map(move || {
let clearing_route =
warp::filters::method::post().and(warp::path!("clear-documents")).map(move || {
let meta = UpdateMeta::ClearDocuments;
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
@ -919,9 +936,8 @@ async fn main() -> anyhow::Result<()> {
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let abort_pending_updates_route = warp::filters::method::delete()
.and(warp::path!("updates"))
.map(move || {
let abort_pending_updates_route =
warp::filters::method::delete().and(warp::path!("updates")).map(move || {
let updates = update_store_cloned.abort_pendings().unwrap();
for (update_id, meta) in updates {
let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta });
@ -930,17 +946,15 @@ async fn main() -> anyhow::Result<()> {
warp::reply()
});
let update_ws_route = warp::ws()
.and(warp::path!("updates" / "ws"))
.map(move |ws: warp::ws::Ws| {
let update_ws_route =
warp::ws().and(warp::path!("updates" / "ws")).map(move |ws: warp::ws::Ws| {
// And then our closure will be called when it completes...
let update_status_receiver = update_status_sender.subscribe();
ws.on_upgrade(|websocket| {
// Just echo all updates messages...
update_status_receiver
.into_stream()
.flat_map(|result| {
match result {
.flat_map(|result| match result {
Ok(status) => {
let msg = serde_json::to_string(&status).unwrap();
stream::iter(Some(Ok(Message::text(msg))))
@ -949,7 +963,6 @@ async fn main() -> anyhow::Result<()> {
eprintln!("channel error: {:?}", e);
stream::iter(None)
}
}
})
.forward(websocket)
.map(|result| {
@ -988,10 +1001,9 @@ async fn main() -> anyhow::Result<()> {
#[cfg(test)]
mod tests {
use maplit::{btreeset,hashmap, hashset};
use serde_test::{assert_tokens, Token};
use maplit::{btreeset, hashmap, hashset};
use milli::update::Setting;
use serde_test::{assert_tokens, Token};
use crate::Settings;
@ -1000,13 +1012,15 @@ mod tests {
let settings = Settings {
displayed_attributes: Setting::Set(vec!["name".to_string()]),
searchable_attributes: Setting::Set(vec!["age".to_string()]),
filterable_attributes: Setting::Set(hashset!{ "age".to_string() }),
filterable_attributes: Setting::Set(hashset! { "age".to_string() }),
criteria: Setting::Set(vec!["asc(age)".to_string()]),
stop_words: Setting::Set(btreeset! { "and".to_string() }),
synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] })
synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }),
};
assert_tokens(&settings, &[
assert_tokens(
&settings,
&[
Token::Struct { name: "Settings", len: 6 },
Token::Str("displayedAttributes"),
Token::Some,
@ -1038,12 +1052,13 @@ mod tests {
Token::Some,
Token::Map { len: Some(1) },
Token::Str("alex"),
Token::Seq {len: Some(1) },
Token::Seq { len: Some(1) },
Token::Str("alexey"),
Token::SeqEnd,
Token::MapEnd,
Token::StructEnd,
]);
],
);
}
#[test]
@ -1057,7 +1072,9 @@ mod tests {
synonyms: Setting::Reset,
};
assert_tokens(&settings, &[
assert_tokens(
&settings,
&[
Token::Struct { name: "Settings", len: 6 },
Token::Str("displayedAttributes"),
Token::None,
@ -1072,7 +1089,8 @@ mod tests {
Token::Str("synonyms"),
Token::None,
Token::StructEnd,
]);
],
);
}
#[test]
@ -1086,9 +1104,6 @@ mod tests {
synonyms: Setting::NotSet,
};
assert_tokens(&settings, &[
Token::Struct { name: "Settings", len: 0 },
Token::StructEnd,
]);
assert_tokens(&settings, &[Token::Struct { name: "Settings", len: 0 }, Token::StructEnd]);
}
}

View File

@ -4,9 +4,9 @@ use std::path::Path;
use std::sync::Arc;
use crossbeam_channel::Sender;
use heed::types::{OwnedType, DecodeIgnore, SerdeJson, ByteSlice};
use heed::{EnvOpenOptions, Env, Database};
use serde::{Serialize, Deserialize};
use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson};
use heed::{Database, Env, EnvOpenOptions};
use serde::{Deserialize, Serialize};
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
@ -25,7 +25,9 @@ pub trait UpdateHandler<M, N> {
}
impl<M, N, F> UpdateHandler<M, N> for F
where F: FnMut(u64, M, &[u8]) -> heed::Result<N> + Send + 'static {
where
F: FnMut(u64, M, &[u8]) -> heed::Result<N> + Send + 'static,
{
fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result<N> {
self(update_id, meta, content)
}
@ -82,26 +84,17 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
/// Returns the new biggest id to use to store the new update.
fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result<u64> {
let last_pending = self.pending_meta
.remap_data_type::<DecodeIgnore>()
.last(txn)?
.map(|(k, _)| k.get());
let last_pending =
self.pending_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
let last_processed = self.processed_meta
.remap_data_type::<DecodeIgnore>()
.last(txn)?
.map(|(k, _)| k.get());
let last_processed =
self.processed_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
let last_aborted = self.aborted_meta
.remap_data_type::<DecodeIgnore>()
.last(txn)?
.map(|(k, _)| k.get());
let last_aborted =
self.aborted_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
let last_update_id = [last_pending, last_processed, last_aborted]
.iter()
.copied()
.flatten()
.max();
let last_update_id =
[last_pending, last_processed, last_aborted].iter().copied().flatten().max();
match last_update_id {
Some(last_id) => Ok(last_id + 1),
@ -112,7 +105,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
/// Registers the update content in the pending store and the meta
/// into the pending-meta store. Returns the new unique update id.
pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result<u64>
where M: Serialize,
where
M: Serialize,
{
let mut wtxn = self.env.write_txn()?;
@ -152,9 +146,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
// a reader while processing it, not a writer.
match first_meta {
Some((first_id, first_meta)) => {
let first_content = self.pending
.get(&rtxn, &first_id)?
.expect("associated update content");
let first_content =
self.pending.get(&rtxn, &first_id)?.expect("associated update content");
// Process the pending update using the provided user function.
let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?;
@ -170,15 +163,16 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
wtxn.commit()?;
Ok(Some((first_id.get(), new_meta)))
},
None => Ok(None)
}
None => Ok(None),
}
}
/// The id and metadata of the update that is currently being processed,
/// `None` if no update is being processed.
pub fn processing_update(&self) -> heed::Result<Option<(u64, M)>>
where M: for<'a> Deserialize<'a>,
where
M: for<'a> Deserialize<'a>,
{
let rtxn = self.env.read_txn()?;
match self.pending_meta.first(&rtxn)? {
@ -242,7 +236,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
/// that as already been processed or which doesn't actually exist, will
/// return `None`.
pub fn abort_update(&self, update_id: u64) -> heed::Result<Option<M>>
where M: Serialize + for<'a> Deserialize<'a>,
where
M: Serialize + for<'a> Deserialize<'a>,
{
let mut wtxn = self.env.write_txn()?;
let key = BEU64::new(update_id);
@ -269,7 +264,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
/// Aborts all the pending updates, and not the one being currently processed.
/// Returns the update metas and ids that were successfully aborted.
pub fn abort_pendings(&self) -> heed::Result<Vec<(u64, M)>>
where M: Serialize + for<'a> Deserialize<'a>,
where
M: Serialize + for<'a> Deserialize<'a>,
{
let mut wtxn = self.env.write_txn()?;
let mut aborted_updates = Vec::new();
@ -303,17 +299,19 @@ pub enum UpdateStatusMeta<M, N> {
#[cfg(test)]
mod tests {
use super::*;
use std::thread;
use std::time::{Duration, Instant};
use super::*;
#[test]
fn simple() {
let dir = tempfile::tempdir().unwrap();
let options = EnvOpenOptions::new();
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| {
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| {
Ok(meta + " processed")
}).unwrap();
})
.unwrap();
let meta = String::from("kiki");
let update_id = update_store.register_update(&meta, &[]).unwrap();
@ -329,10 +327,11 @@ mod tests {
fn long_running_update() {
let dir = tempfile::tempdir().unwrap();
let options = EnvOpenOptions::new();
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| {
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| {
thread::sleep(Duration::from_millis(400));
Ok(meta + " processed")
}).unwrap();
})
.unwrap();
let before_register = Instant::now();

View File

@ -1,16 +1,14 @@
use std::fmt::Write as _;
use std::path::PathBuf;
use std::{str, io, fmt};
use std::{fmt, io, str};
use anyhow::Context;
use byte_unit::Byte;
use heed::EnvOpenOptions;
use structopt::StructOpt;
use milli::facet::FacetType;
use milli::index::db_name::*;
use milli::{Index, TreeLevel};
use structopt::StructOpt;
use Command::*;
#[cfg(target_os = "linux")]
@ -257,53 +255,55 @@ fn main() -> anyhow::Result<()> {
WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words),
WordsPrefixesDocids { full_display, prefixes } => {
words_prefixes_docids(&index, &rtxn, !full_display, prefixes)
},
}
FacetNumbersDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name)
},
}
FacetStringsDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name)
},
}
WordsLevelPositionsDocids { full_display, words } => {
words_level_positions_docids(&index, &rtxn, !full_display, words)
},
}
WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
},
}
FieldIdWordCountDocids { full_display, field_name } => {
field_id_word_count_docids(&index, &rtxn, !full_display, field_name)
},
}
DocidsWordsPositions { full_display, internal_documents_ids } => {
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
},
}
FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name),
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
AverageNumberOfPositionsByWord => {
average_number_of_positions_by_word(&index, &rtxn)
},
AverageNumberOfPositionsByWord => average_number_of_positions_by_word(&index, &rtxn),
SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases),
DatabaseStats { database } => database_stats(&index, &rtxn, &database),
WordPairProximitiesDocids { full_display, word1, word2 } => {
word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
},
}
ExportWordsFst => export_words_fst(&index, &rtxn),
ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn),
ExportDocuments { internal_documents_ids } => {
export_documents(&index, &rtxn, internal_documents_ids)
},
}
}
}
fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
use std::collections::BinaryHeap;
use std::cmp::Reverse;
use std::collections::BinaryHeap;
let mut heap = BinaryHeap::with_capacity(limit + 1);
for result in index.word_docids.iter(rtxn)? {
if limit == 0 { break }
if limit == 0 {
break;
}
let (word, docids) = result?;
heap.push((Reverse(docids.len()), word));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
let stdout = io::stdout();
@ -323,7 +323,7 @@ fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>(
rtxn: &'txn heed::RoTxn,
db: heed::Database<KC, DC>,
field_id: u8,
) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
where
KC: heed::BytesDecode<'txn>,
DC: heed::BytesDecode<'txn>,
@ -347,7 +347,8 @@ fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) ->
fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use heed::types::{Str, ByteSlice};
use heed::types::{ByteSlice, Str};
let Index {
env: _env,
@ -387,71 +388,93 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let words_fst = index.words_fst(rtxn)?;
let length = words_fst.as_fst().as_bytes().len();
heap.push(Reverse((length, format!("words-fst"), main_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
// Fetch the word prefix FST
let words_prefixes_fst = index.words_prefixes_fst(rtxn)?;
let length = words_prefixes_fst.as_fst().as_bytes().len();
heap.push(Reverse((length, format!("words-prefixes-fst"), main_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (word, value) = result?;
heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (word, value) = result?;
heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((docid, word), value) = result?;
let key = format!("{} {}", docid, word);
heap.push(Reverse((value.len(), key, docid_word_positions_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word1, word2, prox), value) = result?;
let key = format!("{} {} {}", word1, word2, prox);
heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
for result in word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, prefix, prox), value) = result?;
let key = format!("{} {} {}", word, prefix, prox);
heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, level, left, right), value) = result?;
let key = format!("{} {} {:?}", word, level, left..=right);
heap.push(Reverse((value.len(), key, word_level_position_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, level, left, right), value) = result?;
let key = format!("{} {} {:?}", word, level, left..=right);
heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((field_id, word_count), docids) = result?;
let key = format!("{} {}", field_id, word_count);
heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
let faceted_fields = index.faceted_fields_ids(rtxn)?;
@ -468,7 +491,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
write!(&mut output, " (level {})", level)?;
let key = format!("{} {}", facet_name, output);
heap.push(Reverse((value.len(), key, facet_id_f64_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
// List the facet strings of this facet id.
@ -477,14 +502,18 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let ((_fid, fvalue), value) = result?;
let key = format!("{} {}", facet_name, fvalue);
heap.push(Reverse((value.len(), key, facet_id_string_docids_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
}
for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (id, value) = result?;
heap.push(Reverse((value.len(), id.to_string(), documents_name)));
if heap.len() > limit { heap.pop(); }
if heap.len() > limit {
heap.pop();
}
}
}
@ -499,7 +528,12 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
Ok(wtr.flush()?)
}
fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> {
fn words_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
words: Vec<String>,
) -> anyhow::Result<()> {
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["word", "documents_ids"])?;
@ -523,8 +557,7 @@ fn words_prefixes_docids(
rtxn: &heed::RoTxn,
debug: bool,
prefixes: Vec<String>,
) -> anyhow::Result<()>
{
) -> anyhow::Result<()> {
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["prefix", "documents_ids"])?;
@ -561,12 +594,12 @@ fn facet_values_docids(
debug: bool,
facet_type: FacetType,
field_name: String,
) -> anyhow::Result<()>
{
) -> anyhow::Result<()> {
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name)
let field_id = fields_ids_map
.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?;
if !faceted_fields.contains(&field_id) {
@ -590,7 +623,7 @@ fn facet_values_docids(
};
wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?;
}
},
}
FacetType::String => {
wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?;
for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? {
@ -614,8 +647,7 @@ fn words_level_positions_docids(
rtxn: &heed::RoTxn,
debug: bool,
words: Vec<String>,
) -> anyhow::Result<()>
{
) -> anyhow::Result<()> {
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?;
@ -653,8 +685,7 @@ fn word_prefixes_level_positions_docids(
rtxn: &heed::RoTxn,
debug: bool,
prefixes: Vec<String>,
) -> anyhow::Result<()>
{
) -> anyhow::Result<()> {
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?;
@ -691,21 +722,20 @@ fn field_id_word_count_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
field_name: String
) -> anyhow::Result<()>
{
field_name: String,
) -> anyhow::Result<()> {
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["field_name", "word_count", "docids"])?;
let field_id = index.fields_ids_map(rtxn)?
let field_id = index
.fields_ids_map(rtxn)?
.id(&field_name)
.with_context(|| format!("unknown field name: {}", &field_name))?;
let left = (field_id, 0);
let right = (field_id, u8::max_value());
let iter = index.field_id_word_count_docids
.range(rtxn, &(left..=right))?;
let iter = index.field_id_word_count_docids.range(rtxn, &(left..=right))?;
for result in iter {
let ((_, word_count), docids) = result?;
@ -725,8 +755,7 @@ fn docids_words_positions(
rtxn: &heed::RoTxn,
debug: bool,
internal_ids: Vec<u32>,
) -> anyhow::Result<()>
{
) -> anyhow::Result<()> {
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["document_id", "word", "positions"])?;
@ -734,9 +763,10 @@ fn docids_words_positions(
let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
Box::new(index.docid_word_positions.iter(rtxn)?)
} else {
let vec: heed::Result<Vec<_>> = internal_ids.into_iter().map(|id| {
index.docid_word_positions.prefix_iter(rtxn, &(id, ""))
}).collect();
let vec: heed::Result<Vec<_>> = internal_ids
.into_iter()
.map(|id| index.docid_word_positions.prefix_iter(rtxn, &(id, "")))
.collect();
Box::new(vec?.into_iter().flatten())
};
@ -757,7 +787,8 @@ fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) ->
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name)
let field_id = fields_ids_map
.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?;
if !faceted_fields.contains(&field_id) {
@ -808,9 +839,14 @@ fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<
Ok(())
}
fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -> anyhow::Result<()> {
fn export_documents(
index: &Index,
rtxn: &heed::RoTxn,
internal_ids: Vec<u32>,
) -> anyhow::Result<()> {
use std::io::{BufWriter, Write as _};
use milli::{BEU32, obkv_to_json};
use milli::{obkv_to_json, BEU32};
let stdout = io::stdout();
let mut out = BufWriter::new(stdout);
@ -819,13 +855,13 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -
let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect();
let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
Box::new(index.documents.iter(rtxn)?.map(|result| {
result.map(|(_id, obkv)| obkv)
}))
Box::new(index.documents.iter(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv)))
} else {
Box::new(internal_ids.into_iter().flat_map(|id| {
index.documents.get(rtxn, &BEU32::new(id)).transpose()
}))
Box::new(
internal_ids
.into_iter()
.flat_map(|id| index.documents.get(rtxn, &BEU32::new(id)).transpose()),
)
};
for result in iter {
@ -842,26 +878,27 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -
fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
use heed::types::DecodeIgnore;
use milli::{DocumentId, BEU32StrCodec};
use milli::{BEU32StrCodec, DocumentId};
let mut words_counts = Vec::new();
let mut count = 0;
let mut prev = None as Option<(DocumentId, u32)>;
let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?;
let iter =
index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?;
for result in iter {
let ((docid, _word), ()) = result?;
match prev.as_mut() {
Some((prev_docid, prev_count)) if docid == *prev_docid => {
*prev_count += 1;
},
}
Some((prev_docid, prev_count)) => {
words_counts.push(*prev_count);
*prev_docid = docid;
*prev_count = 0;
count += 1;
},
}
None => prev = Some((docid, 1)),
}
}
@ -970,16 +1007,15 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> {
use heed::types::ByteSlice;
use heed::{Error, BytesDecode};
use roaring::RoaringBitmap;
use heed::{BytesDecode, Error};
use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
use roaring::RoaringBitmap;
fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>(
db: heed::PolyDatabase,
rtxn: &'a heed::RoTxn,
name: &str,
) -> anyhow::Result<()>
{
) -> anyhow::Result<()> {
let mut key_size = 0u64;
let mut val_size = 0u64;
let mut values_length = Vec::new();
@ -1028,27 +1064,27 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
WORD_DOCIDS => {
let db = index.word_docids.as_polymorph();
compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
},
}
WORD_PREFIX_DOCIDS => {
let db = index.word_prefix_docids.as_polymorph();
compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
},
}
DOCID_WORD_POSITIONS => {
let db = index.docid_word_positions.as_polymorph();
compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name)
},
}
WORD_PAIR_PROXIMITY_DOCIDS => {
let db = index.word_pair_proximity_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
},
}
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => {
let db = index.word_prefix_pair_proximity_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
},
}
FIELD_ID_WORD_COUNT_DOCIDS => {
let db = index.field_id_word_count_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
},
}
unknown => anyhow::bail!("unknown database {:?}", unknown),
}
}
@ -1059,8 +1095,7 @@ fn word_pair_proximities_docids(
debug: bool,
word1: String,
word2: String,
) -> anyhow::Result<()>
{
) -> anyhow::Result<()> {
use heed::types::ByteSlice;
use milli::RoaringBitmapCodec;
@ -1081,7 +1116,9 @@ fn word_pair_proximities_docids(
// Skip keys that are longer than the requested one,
// a longer key means that the second word is a prefix of the request word.
if key.len() != prefix.len() + 1 { continue; }
if key.len() != prefix.len() + 1 {
continue;
}
let proximity = key.last().unwrap();
let docids = if debug {

View File

@ -1,15 +1,14 @@
use std::fmt;
use std::str::FromStr;
use regex::Regex;
use serde::{Serialize, Deserialize};
use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};
use crate::error::{Error, UserError};
static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()
});
static ASC_DESC_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap());
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub enum Criterion {
@ -52,17 +51,21 @@ impl FromStr for Criterion {
"attribute" => Ok(Criterion::Attribute),
"exactness" => Ok(Criterion::Exactness),
text => {
let caps = ASC_DESC_REGEX.captures(text).ok_or_else(|| {
UserError::InvalidCriterionName { name: text.to_string() }
})?;
let caps = ASC_DESC_REGEX
.captures(text)
.ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?;
let order = caps.get(1).unwrap().as_str();
let field_name = caps.get(2).unwrap().as_str();
match order {
"asc" => Ok(Criterion::Asc(field_name.to_string())),
"desc" => Ok(Criterion::Desc(field_name.to_string())),
text => return Err(UserError::InvalidCriterionName { name: text.to_string() }.into()),
text => {
return Err(
UserError::InvalidCriterionName { name: text.to_string() }.into()
)
}
}
}
},
}
}
}

View File

@ -2,7 +2,7 @@ use std::convert::Infallible;
use std::error::Error as StdError;
use std::{fmt, io, str};
use heed::{MdbError, Error as HeedError};
use heed::{Error as HeedError, MdbError};
use rayon::ThreadPoolBuildError;
use serde_json::{Map, Value};
@ -80,14 +80,17 @@ impl From<fst::Error> for Error {
}
}
impl<E> From<grenad::Error<E>> for Error where Error: From<E> {
impl<E> From<grenad::Error<E>> for Error
where
Error: From<E>,
{
fn from(error: grenad::Error<E>) -> Error {
match error {
grenad::Error::Io(error) => Error::IoError(error),
grenad::Error::Merge(error) => Error::from(error),
grenad::Error::InvalidCompressionType => {
Error::InternalError(InternalError::GrenadInvalidCompressionType)
},
}
}
}
}
@ -171,15 +174,15 @@ impl fmt::Display for InternalError {
match self {
Self::DatabaseMissingEntry { db_name, key } => {
write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name)
},
}
Self::FieldIdMapMissingEntry(error) => error.fmt(f),
Self::Fst(error) => error.fmt(f),
Self::GrenadInvalidCompressionType => {
f.write_str("invalid compression type have been specified to grenad")
},
}
Self::IndexingMergingKeys { process } => {
write!(f, "invalid merge while processing {}", process)
},
}
Self::Serialization(error) => error.fmt(f),
Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f),
Self::RayonThreadPool(error) => error.fmt(f),
@ -204,12 +207,12 @@ impl fmt::Display for UserError {
Self::InvalidDocumentId { document_id } => {
let json = serde_json::to_string(document_id).unwrap();
write!(f, "document identifier is invalid {}", json)
},
}
Self::InvalidFilterAttribute(error) => error.fmt(f),
Self::MissingDocumentId { document } => {
let json = serde_json::to_string(document).unwrap();
write!(f, "document doesn't have an identifier {}", json)
},
}
Self::MissingPrimaryKey => f.write_str("missing primary key"),
Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"),
// TODO where can we find it instead of writing the text ourselves?
@ -217,14 +220,14 @@ impl fmt::Display for UserError {
Self::InvalidStoreFile => f.write_str("store file is not a valid database file"),
Self::PrimaryKeyCannotBeChanged => {
f.write_str("primary key cannot be changed if the database contains documents")
},
}
Self::PrimaryKeyCannotBeReset => {
f.write_str("primary key cannot be reset if the database contains documents")
},
}
Self::SerdeJson(error) => error.fmt(f),
Self::UnknownInternalDocumentId { document_id } => {
write!(f, "an unknown internal document id have been used ({})", document_id)
},
}
}
}
}
@ -236,10 +239,10 @@ impl fmt::Display for FieldIdMapMissingEntry {
match self {
Self::FieldId { field_id, process } => {
write!(f, "unknown field id {} coming from the {} process", field_id, process)
},
}
Self::FieldName { field_name, process } => {
write!(f, "unknown field name {} coming from the {} process", field_name, process)
},
}
}
}
}
@ -251,11 +254,11 @@ impl fmt::Display for SerializationError {
match self {
Self::Decoding { db_name: Some(name) } => {
write!(f, "decoding from the {} database failed", name)
},
}
Self::Decoding { db_name: None } => f.write_str("decoding failed"),
Self::Encoding { db_name: Some(name) } => {
write!(f, "encoding into the {} database failed", name)
},
}
Self::Encoding { db_name: None } => f.write_str("encoding failed"),
Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"),
}

View File

@ -1,6 +1,7 @@
use std::borrow::Cow;
use std::convert::TryInto;
use fst::{Streamer, IntoStreamer};
use fst::{IntoStreamer, Streamer};
pub struct ExternalDocumentsIds<'a> {
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
@ -8,7 +9,10 @@ pub struct ExternalDocumentsIds<'a> {
}
impl<'a> ExternalDocumentsIds<'a> {
pub fn new(hard: fst::Map<Cow<'a, [u8]>>, soft: fst::Map<Cow<'a, [u8]>>) -> ExternalDocumentsIds<'a> {
pub fn new(
hard: fst::Map<Cow<'a, [u8]>>,
soft: fst::Map<Cow<'a, [u8]>>,
) -> ExternalDocumentsIds<'a> {
ExternalDocumentsIds { hard, soft }
}
@ -29,7 +33,7 @@ impl<'a> ExternalDocumentsIds<'a> {
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
// u64 MAX means deleted in the soft fst map
Some(id) if id != u64::MAX => Some(id.try_into().unwrap()),
_otherwise => None
_otherwise => None,
}
}

View File

@ -2,10 +2,9 @@ use std::error::Error;
use std::fmt;
use std::str::FromStr;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FacetType {
String,
Number,
@ -43,4 +42,4 @@ impl fmt::Display for InvalidFacetType {
}
}
impl Error for InvalidFacetType { }
impl Error for InvalidFacetType {}

View File

@ -50,7 +50,7 @@ impl Serialize for FacetValue {
FacetValue::Number(number) => {
let string = number.to_string();
serializer.serialize_str(&string)
},
}
}
}
}

View File

@ -28,6 +28,7 @@ fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] {
#[cfg(test)]
mod tests {
use std::cmp::Ordering::Less;
use super::*;
fn is_sorted<T: Ord>(x: &[T]) -> bool {

View File

@ -1,5 +1,7 @@
use std::collections::BTreeMap;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use crate::FieldId;
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -11,11 +13,7 @@ pub struct FieldsIdsMap {
impl FieldsIdsMap {
pub fn new() -> FieldsIdsMap {
FieldsIdsMap {
names_ids: BTreeMap::new(),
ids_names: BTreeMap::new(),
next_id: Some(0),
}
FieldsIdsMap { names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0) }
}
/// Returns the number of fields ids in the map.
@ -62,17 +60,17 @@ impl FieldsIdsMap {
}
/// Iterate over the ids and names in the ids order.
pub fn iter(&self) -> impl Iterator<Item=(FieldId, &str)> {
pub fn iter(&self) -> impl Iterator<Item = (FieldId, &str)> {
self.ids_names.iter().map(|(id, name)| (*id, name.as_str()))
}
/// Iterate over the ids in the order of the ids.
pub fn ids<'a>(&'a self) -> impl Iterator<Item=FieldId> + 'a {
pub fn ids<'a>(&'a self) -> impl Iterator<Item = FieldId> + 'a {
self.ids_names.keys().copied()
}
/// Iterate over the names in the order of the ids.
pub fn names(&self) -> impl Iterator<Item=&str> {
pub fn names(&self) -> impl Iterator<Item = &str> {
self.ids_names.values().map(AsRef::as_ref)
}
}

View File

@ -71,7 +71,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
#[cfg(test)]
mod tests {
use heed::{BytesEncode, BytesDecode};
use heed::{BytesDecode, BytesEncode};
use super::*;
#[test]

View File

@ -1,8 +1,8 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::{FieldId, DocumentId};
use crate::facet::value_encoding::f64_into_bytes;
use crate::{DocumentId, FieldId};
pub struct FieldDocIdFacetF64Codec;

View File

@ -2,12 +2,17 @@ use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
use crate::{FieldId, DocumentId};
use crate::{DocumentId, FieldId};
pub struct FieldDocIdFacetStringCodec;
impl FieldDocIdFacetStringCodec {
pub fn serialize_into(field_id: FieldId, document_id: DocumentId, value: &str, out: &mut Vec<u8>) {
pub fn serialize_into(
field_id: FieldId,
document_id: DocumentId,
value: &str,
out: &mut Vec<u8>,
) {
out.reserve(1 + 4 + value.len());
out.push(field_id);
out.extend_from_slice(&document_id.to_be_bytes());

View File

@ -1,4 +1,5 @@
use std::{borrow::Cow, convert::TryInto};
use std::borrow::Cow;
use std::convert::TryInto;
use crate::FieldId;

View File

@ -1,16 +1,18 @@
mod beu32_str_codec;
pub mod facet;
mod field_id_word_count_codec;
mod obkv_codec;
mod roaring_bitmap;
mod roaring_bitmap_length;
mod str_level_position_codec;
mod str_str_u8_codec;
mod field_id_word_count_codec;
pub mod facet;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
pub use self::roaring_bitmap_length::{
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
};
pub use self::str_level_position_codec::StrLevelPositionCodec;
pub use self::str_str_u8_codec::StrStrU8Codec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;

View File

@ -1,4 +1,5 @@
use std::borrow::Cow;
use obkv::{KvReader, KvWriter};
pub struct ObkvCodec;

View File

@ -75,7 +75,9 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
#[cfg(test)]
mod tests {
use std::iter::FromIterator;
use heed::{BytesEncode, BytesDecode};
use heed::{BytesDecode, BytesEncode};
use super::*;
#[test]

View File

@ -1,4 +1,5 @@
use std::borrow::Cow;
use roaring::RoaringBitmap;
pub struct RoaringBitmapCodec;

View File

@ -1,7 +1,7 @@
use std::io::{self, Read, BufRead};
use std::io::{self, BufRead, Read};
use std::mem;
use byteorder::{ReadBytesExt, LittleEndian};
use byteorder::{LittleEndian, ReadBytesExt};
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u16 = 12347;
@ -16,20 +16,14 @@ impl RoaringBitmapLenCodec {
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
(bytes.read_u32::<LittleEndian>()? as usize, true)
} else if (cookie as u16) == SERIAL_COOKIE {
return Err(io::Error::new(
io::ErrorKind::Other,
"run containers are unsupported",
));
return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported"));
} else {
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
}
};
if size > u16::max_value() as usize + 1 {
return Err(io::Error::new(
io::ErrorKind::Other,
"size is greater than supported",
));
return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
}
let mut description_bytes = vec![0u8; size * 4];
@ -67,12 +61,12 @@ impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
#[cfg(test)]
mod tests {
use super::*;
use crate::heed_codec::RoaringBitmapCodec;
use heed::BytesEncode;
use roaring::RoaringBitmap;
use super::*;
use crate::heed_codec::RoaringBitmapCodec;
#[test]
fn deserialize_roaring_bitmap_length() {
let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect();

View File

@ -13,7 +13,9 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let footer_len = size_of::<u8>() + size_of::<u32>() * 2;
if bytes.len() < footer_len { return None }
if bytes.len() < footer_len {
return None;
}
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
let word = str::from_utf8(word).ok()?;

View File

@ -3,23 +3,22 @@ use std::collections::{HashMap, HashSet};
use std::path::Path;
use chrono::{DateTime, Utc};
use heed::{Database, PolyDatabase, RoTxn, RwTxn};
use heed::types::*;
use heed::{Database, PolyDatabase, RoTxn, RwTxn};
use roaring::RoaringBitmap;
use crate::error::{UserError, FieldIdMapMissingEntry, InternalError};
use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search};
use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result};
use crate::{
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
FieldIdWordCountCodec,
};
use crate::heed_codec::facet::{
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FacetValueStringCodec, FacetLevelValueF64Codec,
};
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
use crate::fields_ids_map::FieldsIdsMap;
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
FieldDocIdFacetStringCodec,
};
use crate::{
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldId, FieldIdWordCountCodec,
FieldsDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search,
StrLevelPositionCodec, StrStrU8Codec, BEU32,
};
pub mod main_key {
pub const CRITERIA_KEY: &str = "criteria";
@ -114,14 +113,17 @@ impl Index {
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
let word_prefix_pair_proximity_docids =
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?;
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
let word_prefix_level_position_docids = env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?;
let word_prefix_level_position_docids =
env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?;
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let field_id_docid_facet_strings =
env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let documents = env.create_database(Some(DOCUMENTS))?;
Index::initialize_creation_dates(&env, main)?;
@ -184,18 +186,26 @@ impl Index {
/* documents ids */
/// Writes the documents ids that corresponds to the user-ids-documents-ids FST.
pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> {
pub(crate) fn put_documents_ids(
&self,
wtxn: &mut RwTxn,
docids: &RoaringBitmap,
) -> heed::Result<()> {
self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids)
}
/// Returns the internal documents ids.
pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?.unwrap_or_default())
Ok(self
.main
.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?
.unwrap_or_default())
}
/// Returns the number of documents indexed in the database.
pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result<u64> {
let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?;
let count =
self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?;
Ok(count.unwrap_or_default())
}
@ -224,21 +234,30 @@ impl Index {
&self,
wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>,
) -> heed::Result<()>
{
) -> heed::Result<()> {
let ExternalDocumentsIds { hard, soft } = external_documents_ids;
let hard = hard.as_fst().as_bytes();
let soft = soft.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?;
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?;
self.main.put::<_, Str, ByteSlice>(
wtxn,
main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY,
hard,
)?;
self.main.put::<_, Str, ByteSlice>(
wtxn,
main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY,
soft,
)?;
Ok(())
}
/// Returns the external documents ids map which associate the external ids
/// with the internal ids (i.e. `u32`).
pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> {
let hard = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?;
let soft = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?;
let hard =
self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?;
let soft =
self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?;
let hard = match hard {
Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?,
None => fst::Map::default().map_data(Cow::Owned)?,
@ -254,42 +273,62 @@ impl Index {
/// Writes the fields ids map which associate the documents keys with an internal field id
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> {
pub(crate) fn put_fields_ids_map(
&self,
wtxn: &mut RwTxn,
map: &FieldsIdsMap,
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map)
}
/// Returns the fields ids map which associate the documents keys with an internal field id
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result<FieldsIdsMap> {
Ok(self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(
rtxn,
main_key::FIELDS_IDS_MAP_KEY,
)?.unwrap_or_default())
Ok(self
.main
.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, main_key::FIELDS_IDS_MAP_KEY)?
.unwrap_or_default())
}
/* fields distribution */
/// Writes the fields distribution which associates every field name with
/// the number of times it occurs in the documents.
pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution)
pub(crate) fn put_fields_distribution(
&self,
wtxn: &mut RwTxn,
distribution: &FieldsDistribution,
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(
wtxn,
main_key::FIELDS_DISTRIBUTION_KEY,
distribution,
)
}
/// Returns the fields distribution which associates every field name with
/// the number of times it occurs in the documents.
pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>(
rtxn,
main_key::FIELDS_DISTRIBUTION_KEY,
)?.unwrap_or_default())
Ok(self
.main
.get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)?
.unwrap_or_default())
}
/* displayed fields */
/// Writes the fields that must be displayed in the defined order.
/// There must be not be any duplicate field id.
pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields)
pub(crate) fn put_displayed_fields(
&self,
wtxn: &mut RwTxn,
fields: &[&str],
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<&[&str]>>(
wtxn,
main_key::DISPLAYED_FIELDS_KEY,
&fields,
)
}
/// Deletes the displayed fields ids, this will make the engine to display
@ -313,14 +352,17 @@ impl Index {
for name in fields.into_iter() {
match fields_ids_map.id(name) {
Some(field_id) => fields_ids.push(field_id),
None => return Err(FieldIdMapMissingEntry::FieldName {
None => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: name.to_string(),
process: "Index::displayed_fields_ids",
}.into()),
}
.into())
}
}
}
Ok(Some(fields_ids))
},
}
None => Ok(None),
}
}
@ -328,8 +370,16 @@ impl Index {
/* searchable fields */
/// Writes the searchable fields, when this list is specified, only these are indexed.
pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields)
pub(crate) fn put_searchable_fields(
&self,
wtxn: &mut RwTxn,
fields: &[&str],
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<&[&str]>>(
wtxn,
main_key::SEARCHABLE_FIELDS_KEY,
&fields,
)
}
/// Deletes the searchable fields, when no fields are specified, all fields are indexed.
@ -352,14 +402,17 @@ impl Index {
for name in fields {
match fields_ids_map.id(name) {
Some(field_id) => fields_ids.push(field_id),
None => return Err(FieldIdMapMissingEntry::FieldName {
None => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: name.to_string(),
process: "Index::searchable_fields_ids",
}.into()),
}
.into())
}
}
}
Ok(Some(fields_ids))
},
}
None => Ok(None),
}
}
@ -367,7 +420,11 @@ impl Index {
/* filterable fields */
/// Writes the filterable fields names in the database.
pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> {
pub(crate) fn put_filterable_fields(
&self,
wtxn: &mut RwTxn,
fields: &HashSet<String>,
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields)
}
@ -378,10 +435,10 @@ impl Index {
/// Returns the filterable fields names.
pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
Ok(self.main.get::<_, Str, SerdeJson<_>>(
rtxn,
main_key::FILTERABLE_FIELDS_KEY,
)?.unwrap_or_default())
Ok(self
.main
.get::<_, Str, SerdeJson<_>>(rtxn, main_key::FILTERABLE_FIELDS_KEY)?
.unwrap_or_default())
}
/// Identical to `filterable_fields`, but returns ids instead.
@ -394,11 +451,14 @@ impl Index {
match fields_ids_map.id(&name) {
Some(field_id) => {
fields_ids.insert(field_id);
},
None => return Err(FieldIdMapMissingEntry::FieldName {
}
None => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: name,
process: "Index::filterable_fields_ids",
}.into()),
}
.into())
}
}
}
@ -413,9 +473,8 @@ impl Index {
pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> {
let filterable_fields = self.filterable_fields(rtxn)?;
let distinct_field = self.distinct_field(rtxn)?;
let asc_desc_fields = self.criteria(rtxn)?
.into_iter()
.filter_map(|criterion| match criterion {
let asc_desc_fields =
self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion {
Criterion::Asc(field) | Criterion::Desc(field) => Some(field),
_otherwise => None,
});
@ -439,11 +498,14 @@ impl Index {
match fields_ids_map.id(&name) {
Some(field_id) => {
fields_ids.insert(field_id);
},
None => return Err(FieldIdMapMissingEntry::FieldName {
}
None => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: name,
process: "Index::faceted_fields_ids",
}.into()),
}
.into())
}
}
}
@ -458,8 +520,7 @@ impl Index {
wtxn: &mut RwTxn,
field_id: FieldId,
docids: &RoaringBitmap,
) -> heed::Result<()>
{
) -> heed::Result<()> {
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
@ -472,8 +533,7 @@ impl Index {
&self,
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap>
{
) -> heed::Result<RoaringBitmap> {
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
@ -490,8 +550,7 @@ impl Index {
wtxn: &mut RwTxn,
field_id: FieldId,
docids: &RoaringBitmap,
) -> heed::Result<()>
{
) -> heed::Result<()> {
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
@ -504,8 +563,7 @@ impl Index {
&self,
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap>
{
) -> heed::Result<RoaringBitmap> {
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
@ -518,7 +576,11 @@ impl Index {
/* distinct field */
pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> {
pub(crate) fn put_distinct_field(
&self,
wtxn: &mut RwTxn,
distinct_field: &str,
) -> heed::Result<()> {
self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field)
}
@ -532,7 +594,11 @@ impl Index {
/* criteria */
pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> {
pub(crate) fn put_criteria(
&self,
wtxn: &mut RwTxn,
criteria: &[Criterion],
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria)
}
@ -550,7 +616,11 @@ impl Index {
/* words fst */
/// Writes the FST which is the words dictionary of the engine.
pub(crate) fn put_words_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
pub(crate) fn put_words_fst<A: AsRef<[u8]>>(
&self,
wtxn: &mut RwTxn,
fst: &fst::Set<A>,
) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes())
}
@ -564,7 +634,11 @@ impl Index {
/* stop words */
pub(crate) fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
pub(crate) fn put_stop_words<A: AsRef<[u8]>>(
&self,
wtxn: &mut RwTxn,
fst: &fst::Set<A>,
) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes())
}
@ -585,8 +659,7 @@ impl Index {
&self,
wtxn: &mut RwTxn,
synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
) -> heed::Result<()>
{
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)
}
@ -595,15 +668,17 @@ impl Index {
}
pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?.unwrap_or_default())
Ok(self
.main
.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?
.unwrap_or_default())
}
pub fn words_synonyms<S: AsRef<str>>(
&self,
rtxn: &RoTxn,
words: &[S],
) -> heed::Result<Option<Vec<Vec<String>>>>
{
) -> heed::Result<Option<Vec<Vec<String>>>> {
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
Ok(self.synonyms(rtxn)?.remove(&words))
}
@ -611,8 +686,16 @@ impl Index {
/* words prefixes fst */
/// Writes the FST which is the words prefixes dictionnary of the engine.
pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes())
pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>(
&self,
wtxn: &mut RwTxn,
fst: &fst::Set<A>,
) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(
wtxn,
main_key::WORDS_PREFIXES_FST_KEY,
fst.as_fst().as_bytes(),
)
}
/// Returns the FST which is the words prefixes dictionnary of the engine.
@ -637,13 +720,14 @@ impl Index {
pub fn documents<'t>(
&self,
rtxn: &'t RoTxn,
ids: impl IntoIterator<Item=DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>>
{
ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> {
let mut documents = Vec::new();
for id in ids {
let kv = self.documents.get(rtxn, &BEU32::new(id))?
let kv = self
.documents
.get(rtxn, &BEU32::new(id))?
.ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?;
documents.push((id, kv));
}
@ -673,7 +757,8 @@ impl Index {
/// Returns the index creation time.
pub fn created_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> {
Ok(self.main
Ok(self
.main
.get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::CREATED_AT_KEY)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN,
@ -683,7 +768,8 @@ impl Index {
/// Returns the index last updated time.
pub fn updated_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> {
Ok(self.main
Ok(self
.main
.get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::UPDATED_AT_KEY)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN,
@ -691,7 +777,11 @@ impl Index {
})?)
}
pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime<Utc>) -> heed::Result<()> {
pub(crate) fn set_updated_at(
&self,
wtxn: &mut RwTxn,
time: &DateTime<Utc>,
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<DateTime<Utc>>>(wtxn, main_key::UPDATED_AT_KEY, &time)
}
}
@ -704,8 +794,8 @@ pub(crate) mod tests {
use maplit::hashmap;
use tempfile::TempDir;
use crate::Index;
use crate::update::{IndexDocuments, UpdateFormat};
use crate::Index;
pub(crate) struct TempIndex {
inner: Index,
@ -728,10 +818,7 @@ pub(crate) mod tests {
options.map_size(100 * 4096);
let _tempdir = TempDir::new_in(".").unwrap();
let inner = Index::new(options, _tempdir.path()).unwrap();
Self {
inner,
_tempdir
}
Self { inner, _tempdir }
}
}
@ -756,10 +843,13 @@ pub(crate) mod tests {
let rtxn = index.read_txn().unwrap();
let fields_distribution = index.fields_distribution(&rtxn).unwrap();
assert_eq!(fields_distribution, hashmap! {
assert_eq!(
fields_distribution,
hashmap! {
"id".to_string() => 2,
"name".to_string() => 2,
"age".to_string() => 1,
});
}
);
}
}

View File

@ -1,14 +1,15 @@
#[macro_use] extern crate pest_derive;
#[macro_use]
extern crate pest_derive;
mod criterion;
mod error;
mod external_documents_ids;
mod fields_ids_map;
mod search;
pub mod facet;
mod fields_ids_map;
pub mod heed_codec;
pub mod index;
pub mod proximity;
mod search;
pub mod tree_level;
pub mod update;
@ -20,15 +21,17 @@ use std::result::Result as StdResult;
use fxhash::{FxHasher32, FxHasher64};
use serde_json::{Map, Value};
pub use self::criterion::{Criterion, default_criteria};
pub use self::criterion::{default_criteria, Criterion};
pub use self::error::Error;
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
pub use self::heed_codec::{
BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
};
pub use self::index::Index;
pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords};
pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult};
pub use self::tree_level::TreeLevel;
pub type Result<T> = std::result::Result<T, error::Error>;
@ -54,9 +57,9 @@ pub fn obkv_to_json(
displayed_fields: &[FieldId],
fields_ids_map: &FieldsIdsMap,
obkv: obkv::KvReader,
) -> Result<Map<String, Value>>
{
displayed_fields.iter()
) -> Result<Map<String, Value>> {
displayed_fields
.iter()
.copied()
.flat_map(|id| obkv.get(id).map(|value| (id, value)))
.map(|(id, value)| {
@ -72,7 +75,6 @@ pub fn obkv_to_json(
/// Transform a JSON value into a string that can be indexed.
pub fn json_to_string(value: &Value) -> Option<String> {
fn inner(value: &Value, output: &mut String) -> bool {
use std::fmt::Write;
match value {
@ -90,7 +92,7 @@ pub fn json_to_string(value: &Value) -> Option<String> {
}
// check that at least one value was written
count != 0
},
}
Value::Object(object) => {
let mut buffer = String::new();
let mut count = 0;
@ -107,7 +109,7 @@ pub fn json_to_string(value: &Value) -> Option<String> {
}
// check that at least one value was written
count != 0
},
}
}
}
@ -121,9 +123,10 @@ pub fn json_to_string(value: &Value) -> Option<String> {
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
use super::*;
#[test]
fn json_to_string_object() {
let value = json!({

View File

@ -1,4 +1,5 @@
use std::cmp;
use crate::{Attribute, Position};
const ONE_ATTRIBUTE: u32 = 1000;
@ -15,8 +16,11 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
let (lhs_attr, lhs_index) = extract_position(lhs);
let (rhs_attr, rhs_index) = extract_position(rhs);
if lhs_attr != rhs_attr { MAX_DISTANCE }
else { index_proximity(lhs_index, rhs_index) }
if lhs_attr != rhs_attr {
MAX_DISTANCE
} else {
index_proximity(lhs_index, rhs_index)
}
}
pub fn extract_position(position: Position) -> (Attribute, Position) {

View File

@ -5,12 +5,12 @@ use log::debug;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use super::{Criterion, CriterionParameters, CriterionResult};
use crate::error::FieldIdMapMissingEntry;
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
use crate::search::facet::FacetIter;
use crate::search::query_tree::Operation;
use crate::{FieldId, Index, Result};
use super::{Criterion, CriterionParameters, CriterionResult};
/// Threshold on the number of candidates that will make
/// the system to choose between one algorithm or another.
@ -57,9 +57,8 @@ impl<'t> AscDesc<'t> {
ascending: bool,
) -> Result<Self> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let field_id = fields_ids_map
.id(&field_name)
.ok_or_else(|| FieldIdMapMissingEntry::FieldName {
let field_id =
fields_ids_map.id(&field_name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: field_name.clone(),
process: "AscDesc::new",
})?;
@ -101,17 +100,21 @@ impl<'t> Criterion for AscDesc<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => {
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
self.query_tree = query_tree;
let mut candidates = match (&self.query_tree, candidates) {
(_, Some(candidates)) => candidates,
(Some(qt), None) => {
let context = CriteriaBuilder::new(&self.rtxn, &self.index)?;
resolve_query_tree(&context, qt, params.wdcache)?
},
}
(None, None) => self.index.documents_ids(self.rtxn)?,
};
@ -136,9 +139,8 @@ impl<'t> Criterion for AscDesc<'t> {
self.ascending,
candidates & &self.faceted_candidates,
)?;
},
None => return Ok(None),
}
None => return Ok(None),
},
Some(mut candidates) => {
candidates -= params.excluded_candidates;
@ -170,11 +172,8 @@ fn facet_ordered<'t>(
let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
} else {
let facet_fn = if ascending {
FacetIter::new_reducing
} else {
FacetIter::new_reverse_reducing
};
let facet_fn =
if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing };
let iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
}
@ -194,9 +193,7 @@ fn iterative_facet_ordered_iter<'t>(
for docid in candidates.iter() {
let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX);
let mut iter = index
.field_id_docid_facet_f64s
.range(rtxn, &(left..=right))?;
let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value)));

View File

@ -1,15 +1,16 @@
use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap};
use std::collections::{BTreeMap, HashMap, btree_map};
use std::borrow::Cow;
use std::cmp::{self, Ordering};
use std::collections::binary_heap::PeekMut;
use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap};
use std::mem::take;
use roaring::RoaringBitmap;
use crate::{TreeLevel, Result, search::build_dfa};
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::Query;
use crate::search::query_tree::{Operation, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache};
use super::{Criterion, CriterionParameters, CriterionResult, Context, resolve_query_tree};
use crate::search::{build_dfa, word_derivations, WordDerivationsCache};
use crate::{Result, TreeLevel};
/// To be able to divide integers by the number of words in the query
/// we want to find a multiplier that allow us to divide by any number between 1 and 10.
@ -63,15 +64,19 @@ impl<'t> Criterion for Attribute<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
}
Some((query_tree, flattened_query_tree, mut allowed_candidates)) => {
let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD {
let current_buckets = match self.current_buckets.as_mut() {
Some(current_buckets) => current_buckets,
None => {
let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?;
let new_buckets = linear_compute_candidates(
self.ctx,
&flattened_query_tree,
&allowed_candidates,
)?;
self.current_buckets.get_or_insert(new_buckets.into_iter())
},
}
};
match current_buckets.next() {
@ -83,10 +88,15 @@ impl<'t> Criterion for Attribute<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
}
}
} else {
match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? {
match set_compute_candidates(
self.ctx,
&flattened_query_tree,
&allowed_candidates,
params.wdcache,
)? {
Some(candidates) => candidates,
None => {
return Ok(Some(CriterionResult {
@ -95,13 +105,14 @@ impl<'t> Criterion for Attribute<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
}
}
};
allowed_candidates -= &found_candidates;
self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
self.state =
Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
@ -109,13 +120,20 @@ impl<'t> Criterion for Attribute<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
};
if let Some(filtered_candidates) = filtered_candidates {
@ -131,17 +149,21 @@ impl<'t> Criterion for Attribute<'t> {
self.state = Some((query_tree, flattened_query_tree, candidates));
self.current_buckets = None;
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
None => return Ok(None),
}
None => return Ok(None),
},
}
}
@ -152,7 +174,9 @@ impl<'t> Criterion for Attribute<'t> {
/// it will begin at the first non-empty interval and will return every interval without
/// jumping over empty intervals.
struct WordLevelIterator<'t, 'q> {
inner: Box<dyn Iterator<Item =heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't>,
inner: Box<
dyn Iterator<Item = heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't,
>,
level: TreeLevel,
interval_size: u32,
word: Cow<'q, str>,
@ -162,50 +186,81 @@ struct WordLevelIterator<'t, 'q> {
}
impl<'t, 'q> WordLevelIterator<'t, 'q> {
fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> {
fn new(
ctx: &'t dyn Context<'t>,
word: Cow<'q, str>,
in_prefix_cache: bool,
) -> heed::Result<Option<Self>> {
match ctx.word_position_last_level(&word, in_prefix_cache)? {
Some(level) => {
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32);
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None }))
},
let inner =
ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
Ok(Some(Self {
inner,
level,
interval_size,
word,
in_prefix_cache,
inner_next: None,
current_interval: None,
}))
}
None => Ok(None),
}
}
fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> {
fn dig(
&self,
ctx: &'t dyn Context<'t>,
level: &TreeLevel,
left_interval: Option<u32>,
) -> heed::Result<Self> {
let level = *level.min(&self.level);
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32);
let word = self.word.clone();
let in_prefix_cache = self.in_prefix_cache;
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
let inner =
ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None})
Ok(Self {
inner,
level,
interval_size,
word,
in_prefix_cache,
inner_next: None,
current_interval: None,
})
}
fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left }
fn is_next_interval(last_right: u32, next_left: u32) -> bool {
last_right + 1 == next_left
}
let inner_next = match self.inner_next.take() {
Some(inner_next) => Some(inner_next),
None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)),
None => self
.inner
.next()
.transpose()?
.map(|((_, _, left, right), docids)| (left, right, docids)),
};
match inner_next {
Some((left, right, docids)) => {
match self.current_interval {
Some((left, right, docids)) => match self.current_interval {
Some((last_left, last_right)) if !is_next_interval(last_right, left) => {
let blank_left = last_left + self.interval_size;
let blank_right = last_right + self.interval_size;
self.current_interval = Some((blank_left, blank_right));
self.inner_next = Some((left, right, docids));
Ok(Some((blank_left, blank_right, RoaringBitmap::new())))
},
}
_ => {
self.current_interval = Some((left, right));
Ok(Some((left, right, docids)))
}
}
},
None => Ok(None),
}
@ -228,30 +283,37 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
ctx: &'t dyn Context<'t>,
queries: &'q [Query],
wdcache: &mut WordDerivationsCache,
) -> Result<Option<Self>>
{
) -> Result<Option<Self>> {
let mut inner = Vec::with_capacity(queries.len());
for query in queries {
match &query.kind {
QueryKind::Exact { word, .. } => {
if !query.prefix || ctx.in_prefix_cache(&word) {
let word = Cow::Borrowed(query.kind.word());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? {
if let Some(word_level_iterator) =
WordLevelIterator::new(ctx, word, query.prefix)?
{
inner.push(word_level_iterator);
}
} else {
for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? {
for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?
{
let word = Cow::Owned(word.to_owned());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
if let Some(word_level_iterator) =
WordLevelIterator::new(ctx, word, false)?
{
inner.push(word_level_iterator);
}
}
}
},
}
QueryKind::Tolerant { typo, word } => {
for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? {
for (word, _) in
word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?
{
let word = Cow::Owned(word.to_owned());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)?
{
inner.push(word_level_iterator);
}
}
@ -284,17 +346,28 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
Some(parent) => {
let parent = parent.dig(ctx)?;
(parent.level.min(self.level), Some(Box::new(parent)))
},
}
None => (self.level.saturating_sub(1), None),
};
let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten();
let left_interval = self
.accumulator
.get(self.interval_to_skip)
.map(|opt| opt.as_ref().map(|(left, _, _)| *left))
.flatten();
let mut inner = Vec::with_capacity(self.inner.len());
for word_level_iterator in self.inner.iter() {
inner.push(word_level_iterator.dig(ctx, &level, left_interval)?);
}
Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0})
Ok(Self {
parent,
inner,
level,
accumulator: vec![],
parent_accumulator: vec![],
interval_to_skip: 0,
})
}
fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
@ -306,11 +379,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32);
for _ in 0..accumulated_count {
if let Some((next_left, _, next_docids)) = wli.next()? {
accumulated = match accumulated.take(){
accumulated = match accumulated.take() {
Some((acc_left, acc_right, mut acc_docids)) => {
acc_docids |= next_docids;
Some((acc_left, acc_right, acc_docids))
},
}
None => Some((next_left, next_left + interval_size, next_docids)),
};
}
@ -322,7 +395,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
/// return the next meta-interval created from inner WordLevelIterators,
/// and from eventual chainned QueryLevelIterator.
fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
fn next(
&mut self,
allowed_candidates: &RoaringBitmap,
tree_level: TreeLevel,
) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
let parent_result = match self.parent.as_mut() {
Some(parent) => Some(parent.next(allowed_candidates, tree_level)?),
None => None,
@ -335,22 +412,30 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
&self.parent_accumulator,
&self.accumulator,
self.interval_to_skip,
allowed_candidates
allowed_candidates,
);
self.accumulator.push(inner_next);
self.parent_accumulator.push(parent_next);
let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None;
for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) {
for current in self
.accumulator
.iter()
.rev()
.zip(self.parent_accumulator.iter())
.skip(self.interval_to_skip)
{
if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current {
match merged_interval.as_mut() {
Some((_, _, merged_docids)) => *merged_docids |= a & b,
None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)),
None => {
merged_interval = Some((left_a + left_b, right_a + right_b, a & b))
}
}
}
}
Ok(merged_interval)
},
}
None => {
let level = self.level;
match self.inner_next(level)? {
@ -358,12 +443,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
self.accumulator = vec![Some((left, right, RoaringBitmap::new()))];
candidates &= allowed_candidates;
Ok(Some((left, right, candidates)))
},
}
None => {
self.accumulator = vec![None];
Ok(None)
},
}
}
}
}
@ -379,16 +463,18 @@ fn interval_to_skip(
already_skiped: usize,
allowed_candidates: &RoaringBitmap,
) -> usize {
parent_accumulator.iter()
parent_accumulator
.iter()
.zip(current_accumulator.iter())
.skip(already_skiped)
.take_while(|(parent, current)| {
let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
let skip_current = current
.as_ref()
.map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
skip_parent && skip_current
})
.count()
}
/// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
@ -410,7 +496,7 @@ impl<'t, 'q> Branch<'t, 'q> {
self.last_result = last_result;
self.tree_level = tree_level;
Ok(true)
},
}
None => Ok(false),
}
}
@ -477,7 +563,6 @@ fn initialize_query_level_iterators<'t, 'q>(
allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache,
) -> Result<BinaryHeap<Branch<'t, 'q>>> {
let mut positions = BinaryHeap::with_capacity(branches.len());
for branch in branches {
let mut branch_positions = Vec::with_capacity(branch.len());
@ -488,19 +573,20 @@ fn initialize_query_level_iterators<'t, 'q>(
// the branch seems to be invalid, so we skip it.
branch_positions.clear();
break;
},
}
}
}
// QueryLevelIterator need to be sorted by level and folded in descending order.
branch_positions.sort_unstable_by_key(|qli| qli.level);
let folded_query_level_iterators = branch_positions
.into_iter()
.fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold {
let folded_query_level_iterators =
branch_positions.into_iter().fold(None, |fold: Option<QueryLevelIterator>, mut qli| {
match fold {
Some(fold) => {
qli.parent(fold);
Some(qli)
},
}
None => Some(qli),
}
});
if let Some(mut folded_query_level_iterators) = folded_query_level_iterators {
@ -526,9 +612,9 @@ fn set_compute_candidates<'t>(
branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache,
) -> Result<Option<RoaringBitmap>>
{
let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?;
) -> Result<Option<RoaringBitmap>> {
let mut branches_heap =
initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?;
let lowest_level = TreeLevel::min_value();
let mut final_candidates: Option<(u32, RoaringBitmap)> = None;
let mut allowed_candidates = allowed_candidates.clone();
@ -539,15 +625,18 @@ fn set_compute_candidates<'t>(
// if current is worst than best we break to return
// candidates that correspond to the best rank
if let Some((best_rank, _)) = final_candidates {
if branch_rank > best_rank { break }
if branch_rank > best_rank {
break;
}
}
let _left = branch.last_result.0;
let candidates = take(&mut branch.last_result.2);
if candidates.is_empty() {
// we don't have candidates, get next interval.
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
if !branch.next(&allowed_candidates)? {
PeekMut::pop(branch);
}
else if is_lowest_level {
} else if is_lowest_level {
// we have candidates, but we can't dig deeper.
allowed_candidates -= &candidates;
final_candidates = match final_candidates.take() {
@ -556,19 +645,20 @@ fn set_compute_candidates<'t>(
best_candidates |= candidates;
branch.lazy_next();
Some((best_rank, best_candidates))
},
}
// we take current candidates as best candidates
None => {
branch.lazy_next();
Some((branch_rank, candidates))
},
}
};
} else {
// we have candidates, lets dig deeper in levels.
branch.dig(ctx)?;
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
if !branch.next(&allowed_candidates)? {
PeekMut::pop(branch);
}
}
}
Ok(final_candidates.map(|(_rank, candidates)| candidates))
@ -578,9 +668,11 @@ fn linear_compute_candidates(
ctx: &dyn Context,
branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap,
) -> Result<BTreeMap<u64, RoaringBitmap>>
{
fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
) -> Result<BTreeMap<u64, RoaringBitmap>> {
fn compute_candidate_rank(
branches: &FlattenedQueryTree,
words_positions: HashMap<String, RoaringBitmap>,
) -> u64 {
let mut min_rank = u64::max_value();
for branch in branches {
let branch_len = branch.len();
@ -593,17 +685,20 @@ fn linear_compute_candidates(
QueryKind::Exact { word, .. } => {
if *prefix {
word_derivations(word, true, 0, &words_positions)
.flat_map(|positions| positions.iter().next()).min()
.flat_map(|positions| positions.iter().next())
.min()
} else {
words_positions.get(word)
words_positions
.get(word)
.map(|positions| positions.iter().next())
.flatten()
}
},
}
QueryKind::Tolerant { typo, word } => {
word_derivations(word, *prefix, *typo, &words_positions)
.flat_map(|positions| positions.iter().next()).min()
},
.flat_map(|positions| positions.iter().next())
.min()
}
};
match (position, current_position) {
@ -627,9 +722,11 @@ fn linear_compute_candidates(
branch_rank.sort_unstable();
// because several words in same query can't match all a the position 0,
// we substract the word index to the position.
let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
let branch_rank: u64 =
branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
// here we do the means of the words of the branch
min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
min_rank =
min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
}
}
@ -641,8 +738,7 @@ fn linear_compute_candidates(
is_prefix: bool,
max_typo: u8,
words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap>
{
) -> impl Iterator<Item = &'a RoaringBitmap> {
let dfa = build_dfa(word, max_typo, is_prefix);
words_positions.iter().filter_map(move |(document_word, positions)| {
use levenshtein_automata::Distance;
@ -680,25 +776,26 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
}
}
out
},
}
None => recurse(head),
}
}
fn recurse(op: &Operation) -> FlattenedQueryTree {
match op {
And(ops) => {
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
},
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)),
Or(_, ops) => {
if ops.iter().all(|op| op.query().is_some()) {
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
} else {
ops.iter().map(recurse).flatten().collect()
},
}
}
Phrase(words) => {
let queries = words.iter().map(|word| {
vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}]
}).collect();
let queries = words
.iter()
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
.collect();
vec![queries]
}
Operation::Query(query) => vec![vec![vec![query.clone()]]],
@ -712,12 +809,14 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
mod tests {
use big_s::S;
use crate::search::criteria::QueryKind;
use super::*;
use crate::search::criteria::QueryKind;
#[test]
fn simple_flatten_query_tree() {
let query_tree = Operation::Or(false, vec![
let query_tree = Operation::Or(
false,
vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }),
@ -725,15 +824,28 @@ mod tests {
]),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }),
Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact(S("thefish")),
}),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact(S("the")),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact(S("fish")),
}),
]),
],
),
]),
]),
]);
],
);
let expected = vec![
vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]],

View File

@ -2,19 +2,15 @@ use std::convert::TryFrom;
use std::mem::take;
use std::ops::BitOr;
use itertools::Itertools;
use log::debug;
use roaring::RoaringBitmap;
use itertools::Itertools;
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::search::criteria::{
Context,
Criterion,
CriterionParameters,
CriterionResult,
resolve_query_tree,
resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
};
use crate::{TreeLevel, Result};
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::{Result, TreeLevel};
pub struct Exactness<'t> {
ctx: &'t dyn Context<'t>,
@ -26,7 +22,11 @@ pub struct Exactness<'t> {
}
impl<'t> Exactness<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>, primitive_query: &[PrimitiveQueryPart]) -> heed::Result<Self> {
pub fn new(
ctx: &'t dyn Context<'t>,
parent: Box<dyn Criterion + 't>,
primitive_query: &[PrimitiveQueryPart],
) -> heed::Result<Self> {
let mut query: Vec<_> = Vec::with_capacity(primitive_query.len());
for part in primitive_query {
query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?);
@ -59,7 +59,7 @@ impl<'t> Criterion for Exactness<'t> {
// reset state
self.state = None;
self.query_tree = None;
},
}
Some(state) => {
let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?;
self.state = state;
@ -70,13 +70,20 @@ impl<'t> Criterion for Exactness<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
};
if let Some(filtered_candidates) = filtered_candidates {
@ -90,17 +97,21 @@ impl<'t> Criterion for Exactness<'t> {
self.state = Some(State::new(candidates));
self.query_tree = Some(query_tree);
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
None => return Ok(None),
}
None => return Ok(None),
},
}
}
@ -125,9 +136,9 @@ impl State {
fn difference_with(&mut self, lhs: &RoaringBitmap) {
match self {
Self::ExactAttribute(candidates) |
Self::AttributeStartsWith(candidates) |
Self::ExactWords(candidates) => *candidates -= lhs,
Self::ExactAttribute(candidates)
| Self::AttributeStartsWith(candidates)
| Self::ExactWords(candidates) => *candidates -= lhs,
Self::Remainings(candidates_array) => {
candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs);
candidates_array.retain(|candidates| !candidates.is_empty());
@ -137,9 +148,9 @@ impl State {
fn is_empty(&self) -> bool {
match self {
Self::ExactAttribute(candidates) |
Self::AttributeStartsWith(candidates) |
Self::ExactWords(candidates) => candidates.is_empty(),
Self::ExactAttribute(candidates)
| Self::AttributeStartsWith(candidates)
| Self::ExactWords(candidates) => candidates.is_empty(),
Self::Remainings(candidates_array) => {
candidates_array.iter().all(RoaringBitmap::is_empty)
}
@ -158,8 +169,7 @@ fn resolve_state(
ctx: &dyn Context,
state: State,
query: &[ExactQueryPart],
) -> Result<(RoaringBitmap, Option<State>)>
{
) -> Result<(RoaringBitmap, Option<State>)> {
use State::*;
match state {
ExactAttribute(mut allowed_candidates) => {
@ -167,8 +177,11 @@ fn resolve_state(
if let Ok(query_len) = u8::try_from(query.len()) {
let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids {
if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? {
let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
if let Some(attribute_allowed_docids) =
ctx.field_id_word_count_docids(id, query_len)?
{
let mut attribute_candidates_array =
attribute_start_with_docids(ctx, id as u32, query)?;
attribute_candidates_array.push(attribute_allowed_docids);
candidates |= intersection_of(attribute_candidates_array.iter().collect());
}
@ -181,12 +194,13 @@ fn resolve_state(
}
Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
},
}
AttributeStartsWith(mut allowed_candidates) => {
let mut candidates = RoaringBitmap::new();
let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids {
let attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
let attribute_candidates_array =
attribute_start_with_docids(ctx, id as u32, query)?;
candidates |= intersection_of(attribute_candidates_array.iter().collect());
}
@ -195,7 +209,7 @@ fn resolve_state(
// remove current candidates from allowed candidates
allowed_candidates -= &candidates;
Ok((candidates, Some(ExactWords(allowed_candidates))))
},
}
ExactWords(mut allowed_candidates) => {
let number_of_part = query.len();
let mut parts_candidates_array = Vec::with_capacity(number_of_part);
@ -210,7 +224,7 @@ fn resolve_state(
candidates |= synonym_candidates;
}
}
},
}
// compute intersection on pair of words with a proximity of 0.
Phrase(phrase) => {
let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1));
@ -220,8 +234,8 @@ fn resolve_state(
Some(docids) => bitmaps.push(docids),
None => {
bitmaps.clear();
break
},
break;
}
}
}
}
@ -261,7 +275,7 @@ fn resolve_state(
candidates_array.reverse();
Ok((all_exact_candidates, Some(Remainings(candidates_array))))
},
}
// pop remainings candidates until the emptiness
Remainings(mut candidates_array) => {
let candidates = candidates_array.pop().unwrap_or_default();
@ -270,12 +284,15 @@ fn resolve_state(
} else {
Ok((candidates, None))
}
},
}
}
}
fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[ExactQueryPart]) -> heed::Result<Vec<RoaringBitmap>> {
fn attribute_start_with_docids(
ctx: &dyn Context,
attribute_id: u32,
query: &[ExactQueryPart],
) -> heed::Result<Vec<RoaringBitmap>> {
let lowest_level = TreeLevel::min_value();
let mut attribute_candidates_array = Vec::new();
// start from attribute first position
@ -293,7 +310,7 @@ fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[Ex
}
attribute_candidates_array.push(synonyms_candidates);
pos += 1;
},
}
Phrase(phrase) => {
for word in phrase {
let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?;
@ -325,24 +342,30 @@ pub enum ExactQueryPart {
}
impl ExactQueryPart {
fn from_primitive_query_part(ctx: &dyn Context, part: &PrimitiveQueryPart) -> heed::Result<Self> {
fn from_primitive_query_part(
ctx: &dyn Context,
part: &PrimitiveQueryPart,
) -> heed::Result<Self> {
let part = match part {
PrimitiveQueryPart::Word(word, _) => {
match ctx.synonyms(word)? {
Some(synonyms) => {
let mut synonyms: Vec<_> = synonyms.into_iter().filter_map(|mut array| {
let mut synonyms: Vec<_> = synonyms
.into_iter()
.filter_map(|mut array| {
// keep 1 word synonyms only.
match array.pop() {
Some(word) if array.is_empty() => Some(word),
_ => None,
}
}).collect();
})
.collect();
synonyms.push(word.clone());
ExactQueryPart::Synonyms(synonyms)
},
}
None => ExactQueryPart::Synonyms(vec![word.clone()]),
}
},
}
PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()),
};

View File

@ -1,10 +1,10 @@
use log::debug;
use roaring::RoaringBitmap;
use crate::Result;
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::query_tree::Operation;
use crate::search::WordDerivationsCache;
use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context};
use crate::Result;
/// The result of a call to the fetcher.
#[derive(Debug, Clone, PartialEq)]
@ -26,7 +26,12 @@ pub struct Final<'t> {
impl<'t> Final<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> {
Final { ctx, parent, wdcache: WordDerivationsCache::new(), returned_candidates: RoaringBitmap::new() }
Final {
ctx,
parent,
wdcache: WordDerivationsCache::new(),
returned_candidates: RoaringBitmap::new(),
}
}
#[logging_timer::time("Final::{}")]
@ -40,10 +45,17 @@ impl<'t> Final<'t> {
};
match self.parent.next(&mut criterion_parameters)? {
Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => {
Some(CriterionResult {
query_tree,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
let mut candidates = match (candidates, query_tree.as_ref()) {
(Some(candidates), _) => candidates,
(None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates,
(None, Some(qt)) => {
resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates
}
(None, None) => self.ctx.documents_ids()? - excluded_candidates,
};
@ -56,7 +68,7 @@ impl<'t> Final<'t> {
self.returned_candidates |= &candidates;
Ok(Some(FinalResult { query_tree, candidates, bucket_candidates }))
},
}
None => Ok(None),
}
}

View File

@ -1,15 +1,18 @@
use roaring::RoaringBitmap;
use crate::Result;
use super::{Criterion, CriterionParameters, CriterionResult};
use crate::search::query_tree::Operation;
use super::{Criterion, CriterionResult, CriterionParameters};
use crate::Result;
pub struct Initial {
answer: Option<CriterionResult>
answer: Option<CriterionResult>,
}
impl Initial {
pub fn new(query_tree: Option<Operation>, filtered_candidates: Option<RoaringBitmap>) -> Initial {
pub fn new(
query_tree: Option<Operation>,
filtered_candidates: Option<RoaringBitmap>,
) -> Initial {
let answer = CriterionResult {
query_tree,
candidates: None,

View File

@ -1,29 +1,28 @@
use std::collections::HashMap;
use std::borrow::Cow;
use std::collections::HashMap;
use roaring::RoaringBitmap;
use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
use crate::{Index, DocumentId, Result};
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
use self::asc_desc::AscDesc;
use self::attribute::Attribute;
use self::exactness::Exactness;
use self::r#final::Final;
use self::initial::Initial;
use self::proximity::Proximity;
use self::r#final::Final;
use self::typo::Typo;
use self::words::Words;
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache};
use crate::{DocumentId, FieldId, Index, Result, TreeLevel};
mod asc_desc;
mod attribute;
mod exactness;
pub mod r#final;
mod initial;
mod proximity;
mod typo;
mod words;
pub mod r#final;
pub trait Criterion {
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>>;
@ -55,7 +54,7 @@ pub struct CriterionParameters<'a> {
#[derive(Debug)]
enum Candidates {
Allowed(RoaringBitmap),
Forbidden(RoaringBitmap)
Forbidden(RoaringBitmap),
}
impl Default for Candidates {
@ -68,17 +67,55 @@ pub trait Context<'c> {
fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
fn word_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>>;
fn word_prefix_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>>;
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
fn in_prefix_cache(&self, word: &str) -> bool;
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>;
fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option<u32>, right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>;
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>;
fn docid_words_positions(
&self,
docid: DocumentId,
) -> heed::Result<HashMap<String, RoaringBitmap>>;
fn word_position_iterator(
&self,
word: &str,
level: TreeLevel,
in_prefix_cache: bool,
left: Option<u32>,
right: Option<u32>,
) -> heed::Result<
Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c,
>,
>;
fn word_position_last_level(
&self,
word: &str,
in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>>;
fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>>;
fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result<Option<RoaringBitmap>>;
fn field_id_word_count_docids(
&self,
field_id: FieldId,
word_count: u8,
) -> heed::Result<Option<RoaringBitmap>>;
fn word_level_position_docids(
&self,
word: &str,
level: TreeLevel,
left: u32,
right: u32,
) -> heed::Result<Option<RoaringBitmap>>;
}
pub struct CriteriaBuilder<'t> {
@ -101,12 +138,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
self.index.word_prefix_docids.get(self.rtxn, &word)
}
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
fn word_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (left, right, proximity);
self.index.word_pair_proximity_docids.get(self.rtxn, &key)
}
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
fn word_prefix_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (left, right, proximity);
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)
}
@ -119,7 +166,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
self.words_prefixes_fst.contains(word)
}
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
fn docid_words_positions(
&self,
docid: DocumentId,
) -> heed::Result<HashMap<String, RoaringBitmap>> {
let mut words_positions = HashMap::new();
for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? {
let ((_, word), positions) = result?;
@ -134,9 +184,12 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
level: TreeLevel,
in_prefix_cache: bool,
left: Option<u32>,
right: Option<u32>
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>
{
right: Option<u32>,
) -> heed::Result<
Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c,
>,
> {
let range = {
let left = left.unwrap_or(u32::min_value());
let right = right.unwrap_or(u32::max_value());
@ -152,7 +205,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
Ok(Box::new(db.range(self.rtxn, &range)?))
}
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> {
fn word_position_last_level(
&self,
word: &str,
in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>> {
let range = {
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
@ -164,7 +221,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
};
let last_level = db
.remap_data_type::<heed::types::DecodeIgnore>()
.range(self.rtxn, &range)?.last().transpose()?
.range(self.rtxn, &range)?
.last()
.transpose()?
.map(|((_, level, _, _), _)| level);
Ok(last_level)
@ -181,12 +240,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
}
}
fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>> {
fn field_id_word_count_docids(
&self,
field_id: FieldId,
word_count: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (field_id, word_count);
self.index.field_id_word_count_docids.get(self.rtxn, &key)
}
fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result<Option<RoaringBitmap>> {
fn word_level_position_docids(
&self,
word: &str,
level: TreeLevel,
left: u32,
right: u32,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (word, level, left, right);
self.index.word_level_position_docids.get(self.rtxn, &key)
}
@ -204,13 +273,13 @@ impl<'t> CriteriaBuilder<'t> {
query_tree: Option<Operation>,
primitive_query: Option<Vec<PrimitiveQueryPart>>,
filtered_candidates: Option<RoaringBitmap>,
) -> Result<Final<'t>>
{
) -> Result<Final<'t>> {
use crate::criterion::Criterion as Name;
let primitive_query = primitive_query.unwrap_or_default();
let mut criterion = Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>;
let mut criterion =
Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>;
for name in self.index.criteria(&self.rtxn)? {
criterion = match name {
Name::Typo => Box::new(Typo::new(self, criterion)),
@ -218,8 +287,12 @@ impl<'t> CriteriaBuilder<'t> {
Name::Proximity => Box::new(Proximity::new(self, criterion)),
Name::Attribute => Box::new(Attribute::new(self, criterion)),
Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?),
Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?),
Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?),
Name::Asc(field) => {
Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?)
}
Name::Desc(field) => {
Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?)
}
};
}
@ -231,21 +304,20 @@ pub fn resolve_query_tree<'t>(
ctx: &'t dyn Context,
query_tree: &Operation,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
query_tree: &Operation,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap>
{
use Operation::{And, Phrase, Or, Query};
) -> Result<RoaringBitmap> {
use Operation::{And, Or, Phrase, Query};
match query_tree {
And(ops) => {
let mut ops = ops.iter().map(|op| {
resolve_operation(ctx, op, wdcache)
}).collect::<Result<Vec<_>>>()?;
let mut ops = ops
.iter()
.map(|op| resolve_operation(ctx, op, wdcache))
.collect::<Result<Vec<_>>>()?;
ops.sort_unstable_by_key(|cds| cds.len());
@ -260,7 +332,7 @@ pub fn resolve_query_tree<'t>(
}
}
Ok(candidates)
},
}
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
@ -276,12 +348,12 @@ pub fn resolve_query_tree<'t>(
} else {
candidates &= pair_docids;
}
},
None => return Ok(RoaringBitmap::new())
}
None => return Ok(RoaringBitmap::new()),
}
}
Ok(candidates)
},
}
Or(_, ops) => {
let mut candidates = RoaringBitmap::new();
for op in ops {
@ -289,7 +361,7 @@ pub fn resolve_query_tree<'t>(
candidates.union_with(&docids);
}
Ok(candidates)
},
}
Query(q) => Ok(query_docids(ctx, q, wdcache)?),
}
}
@ -297,18 +369,18 @@ pub fn resolve_query_tree<'t>(
resolve_operation(ctx, query_tree, wdcache)
}
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
ctx: &dyn Context,
left_words: &[(T, u8)],
right_words: &[(U, u8)],
proximity: u8
) -> Result<RoaringBitmap>
{
proximity: u8,
) -> Result<RoaringBitmap> {
let mut docids = RoaringBitmap::new();
for (left, _l_typo) in left_words {
for (right, _r_typo) in right_words {
let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default();
let current_docids = ctx
.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?
.unwrap_or_default();
docids.union_with(&current_docids);
}
}
@ -319,8 +391,7 @@ fn query_docids(
ctx: &dyn Context,
query: &Query,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
match &query.kind {
QueryKind::Exact { word, .. } => {
if query.prefix && ctx.in_prefix_cache(&word) {
@ -336,7 +407,7 @@ fn query_docids(
} else {
Ok(ctx.word_docids(&word)?.unwrap_or_default())
}
},
}
QueryKind::Tolerant { typo, word } => {
let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?;
let mut docids = RoaringBitmap::new();
@ -345,7 +416,7 @@ fn query_docids(
docids.union_with(&current_docids);
}
Ok(docids)
},
}
}
}
@ -355,8 +426,7 @@ fn query_pair_proximity_docids(
right: &Query,
proximity: u8,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
if proximity >= 8 {
let mut candidates = query_docids(ctx, left, wdcache)?;
let right_candidates = query_docids(ctx, right, wdcache)?;
@ -368,20 +438,31 @@ fn query_pair_proximity_docids(
match (&left.kind, &right.kind) {
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
if prefix && ctx.in_prefix_cache(&right) {
Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default())
Ok(ctx
.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
.unwrap_or_default())
} else if prefix {
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
} else {
Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default())
Ok(ctx
.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
.unwrap_or_default())
}
}
},
(QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => {
let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
let l_words =
word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
if prefix && ctx.in_prefix_cache(&right) {
let mut docids = RoaringBitmap::new();
for (left, _) in l_words {
let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default();
let current_docids = ctx
.word_prefix_pair_proximity_docids(
left.as_ref(),
right.as_ref(),
proximity,
)?
.unwrap_or_default();
docids.union_with(&current_docids);
}
Ok(docids)
@ -391,28 +472,36 @@ fn query_pair_proximity_docids(
} else {
all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
}
},
}
(QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
},
(QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => {
let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
}
(
QueryKind::Tolerant { typo: l_typo, word: left },
QueryKind::Tolerant { typo: r_typo, word: right },
) => {
let l_words =
word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
},
}
}
}
#[cfg(test)]
pub mod test {
use maplit::hashmap;
use rand::{Rng, SeedableRng, rngs::StdRng};
use super::*;
use std::collections::HashMap;
fn s(s: &str) -> String { s.to_string() }
use maplit::hashmap;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use super::*;
fn s(s: &str) -> String {
s.to_string()
}
pub struct TestContext<'t> {
words_fst: fst::Set<Cow<'t, [u8]>>,
word_docids: HashMap<String, RoaringBitmap>,
@ -435,12 +524,22 @@ pub mod test {
Ok(self.word_prefix_docids.get(&word.to_string()).cloned())
}
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
fn word_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (left.to_string(), right.to_string(), proximity.into());
Ok(self.word_pair_proximity_docids.get(&key).cloned())
}
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
fn word_prefix_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (left.to_string(), right.to_string(), proximity.into());
Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
}
@ -453,24 +552,44 @@ pub mod test {
self.word_prefix_docids.contains_key(&word.to_string())
}
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
fn docid_words_positions(
&self,
docid: DocumentId,
) -> heed::Result<HashMap<String, RoaringBitmap>> {
if let Some(docid_words) = self.docid_words.get(&docid) {
Ok(docid_words
.iter()
.enumerate()
.map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))))
.collect()
)
.map(|(i, w)| {
(w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))
})
.collect())
} else {
Ok(HashMap::new())
}
}
fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option<u32>, _right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> {
fn word_position_iterator(
&self,
_word: &str,
_level: TreeLevel,
_in_prefix_cache: bool,
_left: Option<u32>,
_right: Option<u32>,
) -> heed::Result<
Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>>
+ 'c,
>,
> {
todo!()
}
fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> {
fn word_position_last_level(
&self,
_word: &str,
_in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>> {
todo!()
}
@ -482,11 +601,21 @@ pub mod test {
todo!()
}
fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> heed::Result<Option<RoaringBitmap>> {
fn word_level_position_docids(
&self,
_word: &str,
_level: TreeLevel,
_left: u32,
_right: u32,
) -> heed::Result<Option<RoaringBitmap>> {
todo!()
}
fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result<Option<RoaringBitmap>> {
fn field_id_word_count_docids(
&self,
_field_id: FieldId,
_word_count: u8,
) -> heed::Result<Option<RoaringBitmap>> {
todo!()
}
}
@ -506,7 +635,7 @@ pub mod test {
RoaringBitmap::from_sorted_iter(values.into_iter())
}
let word_docids = hashmap!{
let word_docids = hashmap! {
s("hello") => random_postings(rng, 1500),
s("hi") => random_postings(rng, 4000),
s("word") => random_postings(rng, 2500),
@ -530,7 +659,7 @@ pub mod test {
}
}
let word_prefix_docids = hashmap!{
let word_prefix_docids = hashmap! {
s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")],
s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")],
s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")],
@ -540,7 +669,9 @@ pub mod test {
let mut word_prefix_pair_proximity_docids = HashMap::new();
for (lword, lcandidates) in &word_docids {
for (rword, rcandidates) in &word_docids {
if lword == rword { continue }
if lword == rword {
continue;
}
let candidates = lcandidates & rcandidates;
for candidate in candidates {
if let Some(docid_words) = docid_words.get(&candidate) {
@ -551,24 +682,31 @@ pub mod test {
} else {
(s(lword), s(rword), (lposition - rposition + 1) as i32)
};
let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
let docids = word_pair_proximity_docids
.entry(key)
.or_insert(RoaringBitmap::new());
docids.push(candidate);
}
}
}
for (pword, pcandidates) in &word_prefix_docids {
if lword.starts_with(pword) { continue }
if lword.starts_with(pword) {
continue;
}
let candidates = lcandidates & pcandidates;
for candidate in candidates {
if let Some(docid_words) = docid_words.get(&candidate) {
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
let rposition =
docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
let key = if lposition < rposition {
(s(lword), s(pword), (rposition - lposition) as i32)
} else {
(s(lword), s(pword), (lposition - rposition + 1) as i32)
};
let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
let docids = word_prefix_pair_proximity_docids
.entry(key)
.or_insert(RoaringBitmap::new());
docids.push(candidate);
}
}

View File

@ -2,22 +2,16 @@ use std::collections::btree_map::{self, BTreeMap};
use std::collections::hash_map::HashMap;
use std::mem::take;
use roaring::RoaringBitmap;
use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::{maximum_proximity, Operation, Query};
use crate::search::{build_dfa, WordDerivationsCache};
use crate::search::{query_tree::QueryKind};
use crate::{DocumentId, Position, Result};
use super::{
Context,
Criterion,
CriterionParameters,
CriterionResult,
query_docids,
query_pair_proximity_docids,
resolve_query_tree,
query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion,
CriterionParameters, CriterionResult,
};
use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind};
use crate::search::{build_dfa, WordDerivationsCache};
use crate::{DocumentId, Position, Result};
type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;
@ -63,28 +57,33 @@ impl<'t> Criterion for Proximity<'t> {
}
loop {
debug!("Proximity at iteration {} (max prox {:?}) ({:?})",
debug!(
"Proximity at iteration {} (max prox {:?}) ({:?})",
self.proximity,
self.state.as_ref().map(|(mp, _, _)| mp),
self.state.as_ref().map(|(_, _, cd)| cd),
);
match &mut self.state {
Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => {
Some((max_prox, _, allowed_candidates))
if allowed_candidates.is_empty() || self.proximity > *max_prox =>
{
self.state = None; // reset state
},
}
Some((_, query_tree, allowed_candidates)) => {
let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD {
let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD
&& self.proximity > PROXIMITY_THRESHOLD
{
if let Some(cache) = self.plane_sweep_cache.as_mut() {
match cache.next() {
Some((p, candidates)) => {
self.proximity = p;
candidates
},
}
None => {
self.state = None; // reset state
continue
},
continue;
}
}
} else {
let cache = resolve_plane_sweep_candidates(
@ -95,9 +94,10 @@ impl<'t> Criterion for Proximity<'t> {
)?;
self.plane_sweep_cache = Some(cache.into_iter());
continue
continue;
}
} else { // use set theory based algorithm
} else {
// use set theory based algorithm
resolve_candidates(
self.ctx,
&query_tree,
@ -117,13 +117,20 @@ impl<'t> Criterion for Proximity<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
};
if let Some(filtered_candidates) = filtered_candidates {
@ -139,17 +146,21 @@ impl<'t> Criterion for Proximity<'t> {
self.state = Some((maximum_proximity as u8, query_tree, candidates));
self.proximity = 0;
self.plane_sweep_cache = None;
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
None => return Ok(None),
}
None => return Ok(None),
},
}
}
@ -162,32 +173,33 @@ fn resolve_candidates<'t>(
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>>
{
use Operation::{And, Phrase, Or};
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
use Operation::{And, Or, Phrase};
let result = match query_tree {
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
Phrase(words) => if proximity == 0 {
let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
Phrase(words) => {
if proximity == 0 {
let most_left = words
.first()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words
.last()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let mut candidates = None;
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
match candidates.as_mut() {
Some(pair_docids) => match candidates.as_mut() {
Some(candidates) => *candidates &= pair_docids,
None => candidates = Some(pair_docids),
}
},
None => {
candidates = None;
@ -201,7 +213,8 @@ fn resolve_candidates<'t>(
}
} else {
Default::default()
},
}
}
Or(_, ops) => {
let mut output = Vec::new();
for op in ops {
@ -209,13 +222,15 @@ fn resolve_candidates<'t>(
output.extend(result);
}
output
},
Operation::Query(q) => if proximity == 0 {
}
Operation::Query(q) => {
if proximity == 0 {
let candidates = query_docids(ctx, q, wdcache)?;
vec![(q.clone(), q.clone(), candidates)]
} else {
Default::default()
},
}
}
};
Ok(result)
@ -228,8 +243,7 @@ fn resolve_candidates<'t>(
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>>
{
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
(0..=mana.min(left_max)).map(move |m| (m, mana - m))
}
@ -257,7 +271,8 @@ fn resolve_candidates<'t>(
for (ll, lr, lcandidates) in lefts {
for (rl, rr, rcandidates) in rights {
let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
let mut candidates =
query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
if lcandidates.len() < rcandidates.len() {
candidates.intersect_with(lcandidates);
candidates.intersect_with(rcandidates);
@ -282,22 +297,26 @@ fn resolve_candidates<'t>(
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>>
{
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
// Extract the first two elements but gives the tail
// that is just after the first element.
let next = branches.split_first().map(|(h1, t)| {
(h1, t.split_first().map(|(h2, _)| (h2, t)))
});
let next =
branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t))));
match next {
Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache, wdcache),
Some((head1, Some((head2, [_])))) => {
mdfs_pair(ctx, head1, head2, proximity, cache, wdcache)
}
Some((head1, Some((head2, tail)))) => {
let mut output = Vec::new();
for p in 0..=proximity {
for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache, wdcache)? {
for (lhead, _, head_candidates) in
mdfs_pair(ctx, head1, head2, p, cache, wdcache)?
{
if !head_candidates.is_empty() {
for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? {
for (_, rtail, mut candidates) in
mdfs(ctx, tail, proximity - p, cache, wdcache)?
{
candidates.intersect_with(&head_candidates);
if !candidates.is_empty() {
output.push((lhead.clone(), rtail, candidates));
@ -307,7 +326,7 @@ fn resolve_candidates<'t>(
}
}
Ok(output)
},
}
Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
None => Ok(Default::default()),
}
@ -325,47 +344,48 @@ fn resolve_plane_sweep_candidates(
query_tree: &Operation,
allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache,
) -> Result<BTreeMap<u8, RoaringBitmap>>
{
) -> Result<BTreeMap<u8, RoaringBitmap>> {
/// FIXME may be buggy with query like "new new york"
fn plane_sweep(
groups_positions: Vec<Vec<(Position, u8, Position)>>,
consecutive: bool,
) -> Result<Vec<(Position, u8, Position)>>
{
) -> Result<Vec<(Position, u8, Position)>> {
fn compute_groups_proximity(
groups: &[(usize, (Position, u8, Position))],
consecutive: bool,
) -> Option<(Position, u8, Position)>
{
) -> Option<(Position, u8, Position)> {
// take the inner proximity of the first group as initial
let (_, (_, mut proximity, _)) = groups.first()?;
let (_, (left_most_pos, _, _)) = groups.first()?;
let (_, (_, _, right_most_pos)) = groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
let (_, (_, _, right_most_pos)) =
groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
for pair in groups.windows(2) {
if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair {
// if two positions are equal, meaning that they share at least a word, we return None
if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 {
return None
return None;
}
let pair_proximity = {
// if intervals are disjoint [..].(..)
if lpos2 > rpos1 { lpos2 - rpos1 }
if lpos2 > rpos1 {
lpos2 - rpos1
}
// if the second interval is a subset of the first [.(..).]
else if rpos2 < rpos1 { (lpos2 - lpos1).min(rpos1 - rpos2) }
else if rpos2 < rpos1 {
(lpos2 - lpos1).min(rpos1 - rpos2)
}
// if intervals overlaps [.(..].)
else { (lpos2 - lpos1).min(rpos2 - rpos1) }
else {
(lpos2 - lpos1).min(rpos2 - rpos1)
}
};
// if groups are in the good order (query order) we remove 1 to the proximity
// the proximity is clamped to 7
let pair_proximity = if i1 < i2 {
(pair_proximity - 1).min(7)
} else {
pair_proximity.min(7)
};
let pair_proximity =
if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) };
proximity += pair_proximity as u8 + prox2;
}
@ -381,7 +401,8 @@ fn resolve_plane_sweep_candidates(
let groups_len = groups_positions.len();
let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
let mut groups_positions: Vec<_> =
groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
// Pop top elements of each list.
let mut current = Vec::with_capacity(groups_len);
@ -452,9 +473,8 @@ fn resolve_plane_sweep_candidates(
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Position, u8, Position)>>
{
use Operation::{And, Phrase, Or};
) -> Result<Vec<(Position, u8, Position)>> {
use Operation::{And, Or, Phrase};
if let Some(result) = rocache.get(query_tree) {
return Ok(result.clone());
@ -464,11 +484,18 @@ fn resolve_plane_sweep_candidates(
And(ops) => {
let mut groups_positions = Vec::with_capacity(ops.len());
for operation in ops {
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
let positions = resolve_operation(
ctx,
operation,
docid,
rocache,
words_positions,
wdcache,
)?;
groups_positions.push(positions);
}
plane_sweep(groups_positions, false)?
},
}
Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len());
for word in words {
@ -479,16 +506,23 @@ fn resolve_plane_sweep_candidates(
groups_positions.push(positions);
}
plane_sweep(groups_positions, true)?
},
}
Or(_, ops) => {
let mut result = Vec::new();
for op in ops {
result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?)
result.extend(resolve_operation(
ctx,
op,
docid,
rocache,
words_positions,
wdcache,
)?)
}
result.sort_unstable();
result
},
}
Operation::Query(Query { prefix, kind }) => {
let mut result = Vec::new();
match kind {
@ -500,7 +534,7 @@ fn resolve_plane_sweep_candidates(
} else if let Some(positions) = words_positions.get(word) {
result.extend(positions.iter().map(|p| (p, 0, p)));
}
},
}
QueryKind::Tolerant { typo, word } => {
let iter = word_derivations(word, *prefix, *typo, &words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
@ -522,8 +556,7 @@ fn resolve_plane_sweep_candidates(
is_prefix: bool,
max_typo: u8,
words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap>
{
) -> impl Iterator<Item = &'a RoaringBitmap> {
let dfa = build_dfa(word, max_typo, is_prefix);
words_positions.iter().filter_map(move |(document_word, positions)| {
use levenshtein_automata::Distance;

View File

@ -1,20 +1,17 @@
use std::{borrow::Cow, collections::HashMap, mem::take};
use std::borrow::Cow;
use std::collections::HashMap;
use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use super::{
query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
CriterionResult,
};
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache};
use crate::Result;
use super::{
Candidates,
Context,
Criterion,
CriterionParameters,
CriterionResult,
query_docids,
resolve_query_tree,
};
/// Maximum number of typo for a word of any length.
const MAX_TYPOS_PER_WORD: u8 = 2;
@ -54,7 +51,8 @@ impl<'t> Criterion for Typo<'t> {
}
loop {
debug!("Typo at iteration {} (max typos {:?}) ({:?})",
debug!(
"Typo at iteration {} (max typos {:?}) ({:?})",
self.typos,
self.state.as_ref().map(|(mt, _, _)| mt),
self.state.as_ref().map(|(_, _, cd)| cd),
@ -63,29 +61,42 @@ impl<'t> Criterion for Typo<'t> {
match self.state.as_mut() {
Some((max_typos, _, _)) if self.typos > *max_typos => {
self.state = None; // reset state
},
}
Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => {
self.state = None; // reset state
},
}
Some((_, query_tree, candidates_authorization)) => {
let fst = self.ctx.words_fst();
let new_query_tree = match self.typos {
typos if typos < MAX_TYPOS_PER_WORD => {
alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?
},
typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree(
&fst,
query_tree.clone(),
self.typos,
params.wdcache,
)?,
MAX_TYPOS_PER_WORD => {
// When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible,
// we keep the altered query tree
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?;
*query_tree = alterate_query_tree(
&fst,
query_tree.clone(),
self.typos,
params.wdcache,
)?;
// we compute the allowed candidates
let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
let query_tree_allowed_candidates =
resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
// we assign the allowed candidates to the candidates authorization.
*candidates_authorization = match take(candidates_authorization) {
Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates),
Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates),
Allowed(allowed_candidates) => {
Allowed(query_tree_allowed_candidates & allowed_candidates)
}
Forbidden(forbidden_candidates) => {
Allowed(query_tree_allowed_candidates - forbidden_candidates)
}
};
query_tree.clone()
},
}
_otherwise => query_tree.clone(),
};
@ -101,11 +112,11 @@ impl<'t> Criterion for Typo<'t> {
Allowed(allowed_candidates) => {
candidates &= &*allowed_candidates;
*allowed_candidates -= &candidates;
},
}
Forbidden(forbidden_candidates) => {
candidates -= &*forbidden_candidates;
*forbidden_candidates |= &candidates;
},
}
}
let bucket_candidates = match self.bucket_candidates.as_mut() {
@ -121,35 +132,45 @@ impl<'t> Criterion for Typo<'t> {
filtered_candidates: None,
bucket_candidates: Some(bucket_candidates),
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) {
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
self.bucket_candidates =
match (self.bucket_candidates.take(), bucket_candidates) {
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
(self_bc, parent_bc) => self_bc.or(parent_bc),
};
let candidates = match candidates.or(filtered_candidates) {
Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates),
Some(candidates) => {
Candidates::Allowed(candidates - params.excluded_candidates)
}
None => Candidates::Forbidden(params.excluded_candidates.clone()),
};
let maximum_typos = maximum_typo(&query_tree) as u8;
self.state = Some((maximum_typos, query_tree, candidates));
self.typos = 0;
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
None => return Ok(None),
}
None => return Ok(None),
},
}
}
@ -164,21 +185,19 @@ fn alterate_query_tree(
mut query_tree: Operation,
number_typos: u8,
wdcache: &mut WordDerivationsCache,
) -> Result<Operation>
{
) -> Result<Operation> {
fn recurse(
words_fst: &fst::Set<Cow<[u8]>>,
operation: &mut Operation,
number_typos: u8,
wdcache: &mut WordDerivationsCache,
) -> Result<()>
{
use Operation::{And, Phrase, Or};
) -> Result<()> {
use Operation::{And, Or, Phrase};
match operation {
And(ops) | Or(_, ops) => {
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
},
}
// Because Phrases don't allow typos, no alteration can be done.
Phrase(_words) => return Ok(()),
Operation::Query(q) => {
@ -193,19 +212,25 @@ fn alterate_query_tree(
} else {
let typo = *typo.min(&number_typos);
let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?;
let queries = words.iter().map(|(word, typo)| {
let queries = words
.iter()
.map(|(word, typo)| {
Operation::Query(Query {
prefix: false,
kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() },
kind: QueryKind::Exact {
original_typo: *typo,
word: word.to_string(),
},
})
}).collect();
})
.collect();
*operation = Operation::or(false, queries);
}
}
Ok(())
},
}
}
}
@ -219,22 +244,18 @@ fn resolve_candidates<'t>(
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap>
{
use Operation::{And, Phrase, Or, Query};
) -> Result<RoaringBitmap> {
use Operation::{And, Or, Phrase, Query};
match query_tree {
And(ops) => {
mdfs(ctx, ops, number_typos, cache, wdcache)
},
And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache),
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
@ -250,12 +271,12 @@ fn resolve_candidates<'t>(
} else {
candidates &= pair_docids;
}
},
None => return Ok(RoaringBitmap::new())
}
None => return Ok(RoaringBitmap::new()),
}
}
Ok(candidates)
},
}
Or(_, ops) => {
let mut candidates = RoaringBitmap::new();
for op in ops {
@ -263,12 +284,14 @@ fn resolve_candidates<'t>(
candidates.union_with(&docids);
}
Ok(candidates)
},
Query(q) => if q.kind.typo() == number_typos {
}
Query(q) => {
if q.kind.typo() == number_typos {
Ok(query_docids(ctx, q, wdcache)?)
} else {
Ok(RoaringBitmap::new())
},
}
}
}
}
@ -278,8 +301,7 @@ fn resolve_candidates<'t>(
mana: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
match branches.split_first() {
Some((head, [])) => {
let cache_key = (head.clone(), mana);
@ -290,7 +312,7 @@ fn resolve_candidates<'t>(
cache.insert(cache_key, candidates.clone());
Ok(candidates)
}
},
}
Some((head, tail)) => {
let mut candidates = RoaringBitmap::new();
@ -313,7 +335,7 @@ fn resolve_candidates<'t>(
}
Ok(candidates)
},
}
None => Ok(RoaringBitmap::new()),
}
}
@ -323,9 +345,9 @@ fn resolve_candidates<'t>(
#[cfg(test)]
mod test {
use super::*;
use super::super::initial::Initial;
use super::super::test::TestContext;
use super::*;
#[test]
fn initial_placeholder_no_facets() {
@ -348,13 +370,23 @@ mod test {
#[test]
fn initial_query_tree_no_facets() {
let context = TestContext::default();
let query_tree = Operation::Or(false, vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }),
])
]);
let query_tree = Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let facet_candidates = None;
@ -369,13 +401,23 @@ mod test {
& context.word_docids("this").unwrap().unwrap()
& context.word_docids("world").unwrap().unwrap();
let expected_1 = CriterionResult {
query_tree: Some(Operation::Or(false, vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
]),
])),
query_tree: Some(Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
])],
)),
candidates: Some(candidates_1.clone()),
bucket_candidates: Some(candidates_1),
filtered_candidates: None,
@ -383,22 +425,37 @@ mod test {
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
let candidates_2 = (
context.word_docids("split").unwrap().unwrap()
let candidates_2 = (context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap()
& context.word_docids("word").unwrap().unwrap()
) - context.word_docids("world").unwrap().unwrap();
& context.word_docids("word").unwrap().unwrap())
- context.word_docids("world").unwrap().unwrap();
let expected_2 = CriterionResult {
query_tree: Some(Operation::Or(false, vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
]),
]),
])),
query_tree: Some(Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact_with_typo(1, "word".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
],
),
])],
)),
candidates: Some(candidates_2.clone()),
bucket_candidates: Some(candidates_2),
filtered_candidates: None,
@ -437,17 +494,26 @@ mod test {
#[test]
fn initial_query_tree_with_facets() {
let context = TestContext::default();
let query_tree = Operation::Or(false, vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }),
])
]);
let query_tree = Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
let mut criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
@ -459,13 +525,23 @@ mod test {
& context.word_docids("this").unwrap().unwrap()
& context.word_docids("world").unwrap().unwrap();
let expected_1 = CriterionResult {
query_tree: Some(Operation::Or(false, vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
]),
])),
query_tree: Some(Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
])],
)),
candidates: Some(&candidates_1 & &facet_candidates),
bucket_candidates: Some(&candidates_1 & &facet_candidates),
filtered_candidates: None,
@ -473,22 +549,37 @@ mod test {
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
let candidates_2 = (
context.word_docids("split").unwrap().unwrap()
let candidates_2 = (context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap()
& context.word_docids("word").unwrap().unwrap()
) - context.word_docids("world").unwrap().unwrap();
& context.word_docids("word").unwrap().unwrap())
- context.word_docids("world").unwrap().unwrap();
let expected_2 = CriterionResult {
query_tree: Some(Operation::Or(false, vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
]),
]),
])),
query_tree: Some(Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact_with_typo(1, "word".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
],
),
])],
)),
candidates: Some(&candidates_2 & &facet_candidates),
bucket_candidates: Some(&candidates_2 & &facet_candidates),
filtered_candidates: None,

View File

@ -3,9 +3,9 @@ use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::query_tree::Operation;
use crate::Result;
use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree};
pub struct Words<'t> {
ctx: &'t dyn Context<'t>,
@ -44,11 +44,12 @@ impl<'t> Criterion for Words<'t> {
Some(query_tree) => {
let candidates = match self.candidates.as_mut() {
Some(allowed_candidates) => {
let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
let mut candidates =
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
candidates &= &*allowed_candidates;
*allowed_candidates -= &candidates;
Some(candidates)
},
}
None => None,
};
@ -63,29 +64,38 @@ impl<'t> Criterion for Words<'t> {
filtered_candidates: self.filtered_candidates.clone(),
bucket_candidates,
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
self.query_trees = explode_query_tree(query_tree);
self.candidates = candidates;
self.filtered_candidates = filtered_candidates;
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) {
self.bucket_candidates =
match (self.bucket_candidates.take(), bucket_candidates) {
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
(self_bc, parent_bc) => self_bc.or(parent_bc),
};
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
None => return Ok(None),
}
None => return Ok(None),
},
}
}

View File

@ -3,11 +3,11 @@ use std::mem::size_of;
use heed::types::ByteSlice;
use roaring::RoaringBitmap;
use super::{Distinct, DocIter};
use crate::error::InternalError;
use crate::heed_codec::facet::*;
use crate::index::db_name;
use crate::{DocumentId, FieldId, Index, Result};
use super::{Distinct, DocIter};
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
@ -28,11 +28,7 @@ pub struct FacetDistinct<'a> {
impl<'a> FacetDistinct<'a> {
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
Self {
distinct,
index,
txn,
}
Self { distinct, index, txn }
}
}
@ -47,16 +43,12 @@ pub struct FacetDistinctIter<'a> {
impl<'a> FacetDistinctIter<'a> {
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
self.index
.facet_id_string_docids
.get(self.txn, &(self.distinct, key))
self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key))
}
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
// get facet docids on level 0
self.index
.facet_id_f64_docids
.get(self.txn, &(self.distinct, 0, key, key))
self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key))
}
fn distinct_string(&mut self, id: DocumentId) -> Result<()> {
@ -64,9 +56,8 @@ impl<'a> FacetDistinctIter<'a> {
for item in iter {
let ((_, _, value), _) = item?;
let facet_docids = self
.facet_string_docids(value)?
.ok_or(InternalError::DatabaseMissingEntry {
let facet_docids =
self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::FACET_ID_STRING_DOCIDS,
key: None,
})?;
@ -83,9 +74,8 @@ impl<'a> FacetDistinctIter<'a> {
for item in iter {
let ((_, _, value), _) = item?;
let facet_docids = self
.facet_number_docids(value)?
.ok_or(InternalError::DatabaseMissingEntry {
let facet_docids =
self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::FACET_ID_F64_DOCIDS,
key: None,
})?;

View File

@ -1,11 +1,11 @@
mod facet_distinct;
mod noop_distinct;
pub use facet_distinct::FacetDistinct;
pub use noop_distinct::NoopDistinct;
use roaring::RoaringBitmap;
use crate::{DocumentId, Result};
pub use facet_distinct::FacetDistinct;
pub use noop_distinct::NoopDistinct;
/// A trait implemented by document interators that are returned by calls to `Distinct::distinct`.
/// It provides a way to get back the ownership to the excluded set.
@ -29,13 +29,15 @@ mod test {
use std::collections::HashSet;
use once_cell::sync::Lazy;
use rand::{seq::SliceRandom, Rng};
use rand::seq::SliceRandom;
use rand::Rng;
use roaring::RoaringBitmap;
use serde_json::{json, Value};
use crate::index::{Index, tests::TempIndex};
use crate::index::tests::TempIndex;
use crate::index::Index;
use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use crate::{BEU32, FieldId, DocumentId};
use crate::{DocumentId, FieldId, BEU32};
static JSON: Lazy<Value> = Lazy::new(generate_json);
@ -89,9 +91,7 @@ mod test {
addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
addition.update_format(UpdateFormat::Json);
addition
.execute(JSON.to_string().as_bytes(), |_, _| ())
.unwrap();
addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap();
let fields_map = index.fields_ids_map(&txn).unwrap();
let fid = fields_map.id(&distinct).unwrap();
@ -103,7 +103,6 @@ mod test {
(index, fid, map)
}
/// Checks that all the candidates are distinct, and returns the candidates number.
pub(crate) fn validate_distinct_candidates(
candidates: impl Iterator<Item = crate::Result<DocumentId>>,
@ -117,7 +116,7 @@ mod test {
let s = value.to_string();
assert!(seen.insert(s));
}
Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))}
Value::Array(values) => values.into_iter().for_each(|value| test(seen, value)),
}
}

View File

@ -1,7 +1,8 @@
use roaring::{RoaringBitmap, bitmap::IntoIter};
use roaring::bitmap::IntoIter;
use roaring::RoaringBitmap;
use super::{Distinct, DocIter};
use crate::{DocumentId, Result};
use super::{DocIter, Distinct};
/// A distinct implementer that does not perform any distinct,
/// and simply returns an iterator to the candidates.
@ -30,10 +31,7 @@ impl Distinct for NoopDistinct {
type Iter = NoopDistinctIter;
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
NoopDistinctIter {
candidates: candidates.into_iter(),
excluded,
}
NoopDistinctIter { candidates: candidates.into_iter(), excluded }
}
}

View File

@ -1,16 +1,16 @@
use std::collections::{HashSet, BTreeMap};
use std::collections::{BTreeMap, HashSet};
use std::ops::Bound::Unbounded;
use std::{cmp, fmt};
use heed::{Database, BytesDecode};
use heed::types::{ByteSlice, Unit};
use heed::{BytesDecode, Database};
use roaring::RoaringBitmap;
use crate::error::FieldIdMapMissingEntry;
use crate::facet::FacetType;
use crate::heed_codec::facet::FacetValueStringCodec;
use crate::search::facet::{FacetIter, FacetRange};
use crate::{Index, FieldId, DocumentId, Result};
use crate::{DocumentId, FieldId, Index, Result};
/// The default number of values by facets that will
/// be fetched from the key-value store.
@ -43,7 +43,7 @@ impl<'a> FacetDistribution<'a> {
}
}
pub fn facets<I: IntoIterator<Item=A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self {
pub fn facets<I: IntoIterator<Item = A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self {
self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect());
self
}
@ -66,8 +66,7 @@ impl<'a> FacetDistribution<'a> {
facet_type: FacetType,
candidates: &RoaringBitmap,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
{
) -> heed::Result<()> {
fn fetch_facet_values<'t, KC, K: 't>(
rtxn: &'t heed::RoTxn,
db: Database<KC, Unit>,
@ -102,7 +101,7 @@ impl<'a> FacetDistribution<'a> {
FacetType::Number => {
let db = self.index.field_id_docid_facet_f64s;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
},
}
FacetType::String => {
let db = self.index.field_id_docid_facet_strings;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
@ -117,11 +116,9 @@ impl<'a> FacetDistribution<'a> {
field_id: FieldId,
candidates: &RoaringBitmap,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
{
let iter = FacetIter::new_non_reducing(
self.rtxn, self.index, field_id, candidates.clone(),
)?;
) -> heed::Result<()> {
let iter =
FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
for result in iter {
let (value, mut docids) = result?;
@ -142,8 +139,7 @@ impl<'a> FacetDistribution<'a> {
fn facet_values_from_raw_facet_database(
&self,
field_id: FieldId,
) -> heed::Result<BTreeMap<String, u64>>
{
) -> heed::Result<BTreeMap<String, u64>> {
let mut distribution = BTreeMap::new();
let db = self.index.facet_id_f64_docids;
@ -157,7 +153,8 @@ impl<'a> FacetDistribution<'a> {
}
}
let iter = self.index
let iter = self
.index
.facet_id_string_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &[field_id])?
@ -182,11 +179,30 @@ impl<'a> FacetDistribution<'a> {
// to those candidates. We also enter here for facet strings for performance reasons.
let mut distribution = BTreeMap::new();
if candidates.len() <= CANDIDATES_THRESHOLD {
self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
self.facet_distribution_from_documents(
field_id,
Number,
candidates,
&mut distribution,
)?;
self.facet_distribution_from_documents(
field_id,
String,
candidates,
&mut distribution,
)?;
} else {
self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
self.facet_numbers_distribution_from_facet_levels(
field_id,
candidates,
&mut distribution,
)?;
self.facet_distribution_from_documents(
field_id,
String,
candidates,
&mut distribution,
)?;
}
Ok(distribution)
@ -201,7 +217,8 @@ impl<'a> FacetDistribution<'a> {
let mut distribution = BTreeMap::new();
for name in filterable_fields {
let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
let fid =
fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: name.clone(),
process: "FacetDistribution::execute",
})?;
@ -215,13 +232,7 @@ impl<'a> FacetDistribution<'a> {
impl fmt::Debug for FacetDistribution<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let FacetDistribution {
facets,
candidates,
max_values_by_facet,
rtxn: _,
index: _,
} = self;
let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self;
f.debug_struct("FacetDistribution")
.field("facets", facets)

View File

@ -1,6 +1,6 @@
use std::collections::HashSet;
use std::fmt::Debug;
use std::ops::Bound::{self, Included, Excluded};
use std::ops::Bound::{self, Excluded, Included};
use std::result::Result as StdResult;
use std::str::FromStr;
@ -12,16 +12,13 @@ use pest::iterators::{Pair, Pairs};
use pest::Parser;
use roaring::RoaringBitmap;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec, Result};
use super::FacetRange;
use super::parser::Rule;
use super::parser::{PREC_CLIMBER, FilterParser};
use self::FilterCondition::*;
use self::Operator::*;
use super::parser::{FilterParser, Rule, PREC_CLIMBER};
use super::FacetRange;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec};
use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result};
#[derive(Debug, Clone, PartialEq)]
pub enum Operator {
@ -63,8 +60,9 @@ impl FilterCondition {
index: &Index,
array: I,
) -> Result<Option<FilterCondition>>
where I: IntoIterator<Item=Either<J, B>>,
J: IntoIterator<Item=A>,
where
I: IntoIterator<Item = Either<J, B>>,
J: IntoIterator<Item = A>,
A: AsRef<str>,
B: AsRef<str>,
{
@ -88,7 +86,7 @@ impl FilterCondition {
None => Some(rule),
};
}
},
}
Either::Right(rule) => {
let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?;
ands = match ands.take() {
@ -106,11 +104,11 @@ impl FilterCondition {
rtxn: &heed::RoTxn,
index: &Index,
expression: &str,
) -> Result<FilterCondition>
{
) -> Result<FilterCondition> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let filterable_fields = index.filterable_fields_ids(rtxn)?;
let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?;
let lexed =
FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?;
FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed)
}
@ -118,8 +116,7 @@ impl FilterCondition {
fim: &FieldsIdsMap,
ff: &HashSet<FieldId>,
expression: Pairs<Rule>,
) -> Result<Self>
{
) -> Result<Self> {
PREC_CLIMBER.climb(
expression,
|pair: Pair<Rule>| match pair.as_rule() {
@ -135,12 +132,10 @@ impl FilterCondition {
Rule::term => Self::from_pairs(fim, ff, pair.into_inner()),
_ => unreachable!(),
},
|lhs: Result<Self>, op: Pair<Rule>, rhs: Result<Self>| {
match op.as_rule() {
|lhs: Result<Self>, op: Pair<Rule>, rhs: Result<Self>| match op.as_rule() {
Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))),
Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))),
_ => unreachable!(),
}
},
)
}
@ -160,8 +155,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> Result<FilterCondition>
{
) -> Result<FilterCondition> {
let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?;
@ -179,8 +173,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> Result<FilterCondition>
{
) -> Result<FilterCondition> {
let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?;
@ -196,8 +189,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> Result<FilterCondition>
{
) -> Result<FilterCondition> {
let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?;
@ -213,8 +205,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> Result<FilterCondition>
{
) -> Result<FilterCondition> {
let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?;
@ -230,8 +221,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> Result<FilterCondition>
{
) -> Result<FilterCondition> {
let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?;
@ -247,8 +237,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> Result<FilterCondition>
{
) -> Result<FilterCondition> {
let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?;
@ -272,13 +261,14 @@ impl FilterCondition {
left: Bound<f64>,
right: Bound<f64>,
output: &mut RoaringBitmap,
) -> Result<()>
{
) -> Result<()> {
match (left, right) {
// If the request is an exact value we must go directly to the deepest level.
(Included(l), Included(r)) if l == r && level > 0 => {
return Self::explore_facet_number_levels(rtxn, db, field_id, 0, left, right, output);
},
return Self::explore_facet_number_levels(
rtxn, db, field_id, 0, left, right, output,
);
}
// lower TO upper when lower > upper must return no result
(Included(l), Included(r)) if l > r => return Ok(()),
(Included(l), Excluded(r)) if l >= r => return Ok(()),
@ -301,7 +291,9 @@ impl FilterCondition {
debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len());
output.union_with(&docids);
// We save the leftest and rightest bounds we actually found at this level.
if i == 0 { left_found = Some(l); }
if i == 0 {
left_found = Some(l);
}
right_found = Some(r);
}
@ -318,20 +310,50 @@ impl FilterCondition {
// If the bound is satisfied we avoid calling this function again.
if !matches!(left, Included(l) if l == left_found) {
let sub_right = Excluded(left_found);
debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level);
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, sub_right, output)?;
debug!(
"calling left with {:?} to {:?} (level {})",
left, sub_right, deeper_level
);
Self::explore_facet_number_levels(
rtxn,
db,
field_id,
deeper_level,
left,
sub_right,
output,
)?;
}
if !matches!(right, Included(r) if r == right_found) {
let sub_left = Excluded(right_found);
debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level);
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, sub_left, right, output)?;
debug!(
"calling right with {:?} to {:?} (level {})",
sub_left, right, deeper_level
);
Self::explore_facet_number_levels(
rtxn,
db,
field_id,
deeper_level,
sub_left,
right,
output,
)?;
}
}
},
None => {
// If we found nothing at this level it means that we must find
// the same bounds but at a deeper, more precise level.
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, right, output)?;
},
Self::explore_facet_number_levels(
rtxn,
db,
field_id,
deeper_level,
left,
right,
output,
)?;
}
}
Ok(())
@ -344,8 +366,7 @@ impl FilterCondition {
strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: FieldId,
operator: &Operator,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
// Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the
// field id and the level.
@ -358,13 +379,21 @@ impl FilterCondition {
Some(n) => {
let n = Included(*n);
let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?;
Self::explore_facet_number_levels(
rtxn,
numbers_db,
field_id,
0,
n,
n,
&mut output,
)?;
output
},
}
None => RoaringBitmap::new(),
};
return Ok(string_docids | number_docids);
},
}
NotEqual(number, string) => {
let all_numbers_ids = if number.is_some() {
index.number_faceted_documents_ids(rtxn, field_id)?
@ -373,9 +402,11 @@ impl FilterCondition {
};
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Equal(*number, string.clone());
let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?;
let docids = Self::evaluate_operator(
rtxn, index, numbers_db, strings_db, field_id, &operator,
)?;
return Ok((all_numbers_ids | all_strings_ids) - docids);
},
}
LowerThan(val) => (Included(f64::MIN), Excluded(*val)),
LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)),
Between(left, right) => (Included(*left), Included(*right)),
@ -391,36 +422,39 @@ impl FilterCondition {
match biggest_level {
Some(level) => {
let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?;
Self::explore_facet_number_levels(
rtxn,
numbers_db,
field_id,
level,
left,
right,
&mut output,
)?;
Ok(output)
},
}
None => Ok(RoaringBitmap::new()),
}
}
pub fn evaluate(
&self,
rtxn: &heed::RoTxn,
index: &Index,
) -> Result<RoaringBitmap>
{
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
match self {
Operator(fid, op) => {
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op)
},
}
Or(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?;
let rhs = rhs.evaluate(rtxn, index)?;
Ok(lhs | rhs)
},
}
And(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?;
let rhs = rhs.evaluate(rtxn, index)?;
Ok(lhs & rhs)
},
}
}
}
}
@ -434,14 +468,14 @@ fn field_id(
fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>,
items: &mut Pairs<Rule>,
) -> StdResult<FieldId, PestError<Rule>>
{
) -> StdResult<FieldId, PestError<Rule>> {
// lexing ensures that we at least have a key
let key = items.next().unwrap();
let field_id = match fields_ids_map.id(key.as_str()) {
Some(field_id) => field_id,
None => return Err(PestError::new_from_span(
None => {
return Err(PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` not found, available attributes are: {}",
@ -450,7 +484,8 @@ fn field_id(
),
},
key.as_span(),
)),
))
}
};
if !filterable_fields.contains(&field_id) {
@ -459,9 +494,11 @@ fn field_id(
message: format!(
"attribute `{}` is not filterable, available filterable attributes are: {}",
key.as_str(),
filterable_fields.iter().flat_map(|id| {
fields_ids_map.name(*id)
}).collect::<Vec<_>>().join(", "),
filterable_fields
.iter()
.flat_map(|id| { fields_ids_map.name(*id) })
.collect::<Vec<_>>()
.join(", "),
),
},
key.as_span(),
@ -476,7 +513,8 @@ fn field_id(
///
/// Returns the parsing error associated with the span if the conversion fails.
fn pest_parse<T>(pair: Pair<Rule>) -> (StdResult<T, pest::error::Error<Rule>>, String)
where T: FromStr,
where
T: FromStr,
T::Err: ToString,
{
let result = match pair.as_str().parse::<T>() {
@ -492,11 +530,12 @@ where T: FromStr,
#[cfg(test)]
mod tests {
use super::*;
use crate::update::Settings;
use big_s::S;
use heed::EnvOpenOptions;
use maplit::hashset;
use big_s::S;
use super::*;
use crate::update::Settings;
#[test]
fn string() {
@ -508,7 +547,7 @@ mod tests {
// Set the filterable fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_filterable_fields(hashset!{ S("channel") });
builder.set_filterable_fields(hashset! { S("channel") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -537,7 +576,7 @@ mod tests {
// Set the filterable fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_filterable_fields(hashset!{ "timestamp".into() });
builder.set_filterable_fields(hashset! { "timestamp".into() });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -548,10 +587,8 @@ mod tests {
assert_eq!(condition, expected);
let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
let expected = Or(
Box::new(Operator(0, LowerThan(22.0))),
Box::new(Operator(0, GreaterThan(44.0))),
);
let expected =
Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0))));
assert_eq!(condition, expected);
}
@ -566,29 +603,33 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") });
builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FilterCondition::from_str(
&rtxn, &index,
&rtxn,
&index,
"channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)",
).unwrap();
)
.unwrap();
let expected = Or(
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(And(
Box::new(Operator(1, Between(22.0, 44.0))),
Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))),
))
)),
);
assert_eq!(condition, expected);
let condition = FilterCondition::from_str(
&rtxn, &index,
&rtxn,
&index,
"channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)",
).unwrap();
)
.unwrap();
let expected = Or(
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(Or(
@ -613,20 +654,28 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") });
builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FilterCondition::from_array(
&rtxn, &index,
vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])],
).unwrap().unwrap();
&rtxn,
&index,
vec![
Either::Right("channel = gotaga"),
Either::Left(vec!["timestamp = 44", "channel != ponce"]),
],
)
.unwrap()
.unwrap();
let expected = FilterCondition::from_str(
&rtxn, &index,
&rtxn,
&index,
"channel = gotaga AND (timestamp = 44 OR channel != ponce)",
).unwrap();
)
.unwrap();
assert_eq!(condition, expected);
}
}

View File

@ -1,20 +1,19 @@
use std::ops::Bound::{self, Included, Excluded, Unbounded};
use std::ops::Bound::{self, Excluded, Included, Unbounded};
use either::Either::{self, Left, Right};
use heed::types::{DecodeIgnore, ByteSlice};
use heed::{Database, RoRange, RoRevRange, LazyDecode};
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{Database, LazyDecode, RoRange, RoRevRange};
use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::{Index, FieldId};
pub use self::facet_distribution::FacetDistribution;
pub use self::filter_condition::{FilterCondition, Operator};
pub(crate) use self::parser::Rule as ParserRule;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{FieldId, Index};
mod filter_condition;
mod facet_distribution;
mod filter_condition;
mod parser;
pub struct FacetRange<'t> {
@ -30,8 +29,7 @@ impl<'t> FacetRange<'t> {
level: u8,
left: Bound<f64>,
right: Bound<f64>,
) -> heed::Result<FacetRange<'t>>
{
) -> heed::Result<FacetRange<'t>> {
let left_bound = match left {
Included(left) => Included((field_id, level, left, f64::MIN)),
Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
@ -62,7 +60,7 @@ impl<'t> Iterator for FacetRange<'t> {
} else {
None
}
},
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
@ -82,8 +80,7 @@ impl<'t> FacetRevRange<'t> {
level: u8,
left: Bound<f64>,
right: Bound<f64>,
) -> heed::Result<FacetRevRange<'t>>
{
) -> heed::Result<FacetRevRange<'t>> {
let left_bound = match left {
Included(left) => Included((field_id, level, left, f64::MIN)),
Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
@ -114,7 +111,7 @@ impl<'t> Iterator for FacetRevRange<'t> {
}
}
continue;
},
}
Some(Err(e)) => return Some(Err(e)),
None => return None,
}
@ -139,11 +136,11 @@ impl<'t> FacetIter<'t> {
index: &'t Index,
field_id: FieldId,
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
) -> heed::Result<FacetIter<'t>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let highest_iter =
FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))];
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
}
@ -156,11 +153,11 @@ impl<'t> FacetIter<'t> {
index: &'t Index,
field_id: FieldId,
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
) -> heed::Result<FacetIter<'t>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let highest_iter =
FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Right(highest_iter))];
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
}
@ -174,11 +171,11 @@ impl<'t> FacetIter<'t> {
index: &'t Index,
field_id: FieldId,
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
) -> heed::Result<FacetIter<'t>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let highest_iter =
FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))];
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false })
}
@ -187,12 +184,13 @@ impl<'t> FacetIter<'t> {
rtxn: &'t heed::RoTxn,
db: Database<FacetLevelValueF64Codec, X>,
fid: FieldId,
) -> heed::Result<Option<u8>>
{
let level = db.remap_types::<ByteSlice, DecodeIgnore>()
) -> heed::Result<Option<u8>> {
let level = db
.remap_types::<ByteSlice, DecodeIgnore>()
.prefix_iter(rtxn, &[fid][..])?
.remap_key_type::<FacetLevelValueF64Codec>()
.last().transpose()?
.last()
.transpose()?
.map(|((_, level, _, _), _)| level);
Ok(level)
}
@ -215,7 +213,6 @@ impl<'t> Iterator for FacetIter<'t> {
match result {
Ok(((_fid, level, left, right), mut docids)) => {
docids.intersect_with(&documents_ids);
if !docids.is_empty() {
if self.must_reduce {
@ -242,11 +239,11 @@ impl<'t> Iterator for FacetIter<'t> {
Ok(iter) => {
self.level_iters.push((docids, iter));
continue 'outer;
},
}
Err(e) => return Some(Err(e)),
}
}
},
}
Err(e) => return Some(Err(e)),
}
}

View File

@ -1,5 +1,5 @@
use once_cell::sync::Lazy;
use pest::prec_climber::{Operator, Assoc, PrecClimber};
use pest::prec_climber::{Assoc, Operator, PrecClimber};
pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| {
use Assoc::*;

View File

@ -1,13 +1,11 @@
use std::collections::HashSet;
use std::cmp::{min, Reverse};
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashSet};
use std::ops::{Index, IndexMut};
use levenshtein_automata::{DFA, Distance};
use crate::search::query_tree::{Operation, Query};
use levenshtein_automata::{Distance, DFA};
use super::build_dfa;
use crate::search::query_tree::{Operation, Query};
type IsPrefix = bool;
@ -28,7 +26,9 @@ impl MatchingWords {
.collect();
// Sort word by len in DESC order prioritizing the longuest word,
// in order to highlight the longuest part of the matched word.
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len()));
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| {
Reverse(query_word.len())
});
Self { dfas }
}
@ -37,12 +37,13 @@ impl MatchingWords {
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) {
Distance::Exact(t) if t <= *typo => {
if *is_prefix {
let (_dist, len) = prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes());
let (_dist, len) =
prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes());
Some(len)
} else {
Some(word.len())
}
},
}
_otherwise => None,
})
}
@ -54,11 +55,11 @@ fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
match tree {
Operation::Or(_, ops) | Operation::And(ops) => {
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
},
}
Operation::Query(Query { prefix, kind }) => {
let typo = if kind.is_exact() { 0 } else { kind.typo() };
out.insert((kind.word(), typo, *prefix));
},
}
Operation::Phrase(words) => {
for word in words {
out.insert((word, 0, false));
@ -80,10 +81,7 @@ struct N2Array<T> {
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array {
y_size: y,
buf: vec![value; x * y],
}
N2Array { y_size: y, buf: vec![value; x * y] }
}
}
@ -178,9 +176,8 @@ fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
#[cfg(test)]
mod tests {
use super::*;
use crate::MatchingWords;
use crate::search::query_tree::{Operation, Query, QueryKind};
use crate::MatchingWords;
#[test]
fn matched_length() {
@ -194,13 +191,23 @@ mod tests {
#[test]
fn matching_words() {
let query_tree = Operation::Or(false, vec![
Operation::And(vec![
Operation::Query(Query { prefix: true, kind: QueryKind::exact("split".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) }),
]),
]);
let query_tree = Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: true,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let matching_words = MatchingWords::from_query_tree(&query_tree);

View File

@ -6,6 +6,7 @@ use std::result::Result as StdResult;
use std::str::Utf8Error;
use std::time::Instant;
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use log::debug;
@ -13,16 +14,13 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub(crate) use self::facet::ParserRule;
pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator};
pub use self::matching_words::MatchingWords;
use self::query_tree::QueryTreeBuilder;
use crate::error::FieldIdMapMissingEntry;
use crate::search::criteria::r#final::{Final, FinalResult};
use crate::{Index, DocumentId, Result};
pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator};
pub use self::matching_words::MatchingWords;
pub(crate) use self::facet::ParserRule;
use self::query_tree::QueryTreeBuilder;
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
use crate::{DocumentId, Index, Result};
// Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@ -32,8 +30,8 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
mod criteria;
mod distinct;
mod facet;
mod query_tree;
mod matching_words;
mod query_tree;
pub struct Search<'a> {
query: Option<String>,
@ -117,7 +115,7 @@ impl<'a> Search<'a> {
let result = analyzer.analyze(query);
let tokens = result.tokens();
builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq)))
},
}
None => (None, None),
};
@ -144,7 +142,8 @@ impl<'a> Search<'a> {
None => self.perform_sort(NoopDistinct, matching_words, criteria),
Some(name) => {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
let id =
field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: name.to_string(),
process: "distinct attribute",
})?;
@ -159,14 +158,15 @@ impl<'a> Search<'a> {
mut distinct: D,
matching_words: MatchingWords,
mut criteria: Final,
) -> Result<SearchResult>
{
) -> Result<SearchResult> {
let mut offset = self.offset;
let mut initial_candidates = RoaringBitmap::new();
let mut excluded_candidates = RoaringBitmap::new();
let mut documents_ids = Vec::with_capacity(self.limit);
while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_candidates)? {
while let Some(FinalResult { candidates, bucket_candidates, .. }) =
criteria.next(&excluded_candidates)?
{
debug!("Number of candidates found {}", candidates.len());
let excluded = take(&mut excluded_candidates);
@ -183,7 +183,9 @@ impl<'a> Search<'a> {
for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) {
documents_ids.push(candidate?);
}
if documents_ids.len() == self.limit { break }
if documents_ids.len() == self.limit {
break;
}
excluded_candidates = candidates.into_excluded();
}
@ -247,7 +249,7 @@ pub fn word_derivations<'c>(
}
Ok(entry.insert(derived_words))
},
}
}
}

View File

@ -1,4 +1,4 @@
use std::{fmt, cmp, mem};
use std::{cmp, fmt, mem};
use fst::Set;
use meilisearch_tokenizer::token::SeparatorKind;
@ -28,18 +28,18 @@ impl fmt::Debug for Operation {
Operation::And(children) => {
writeln!(f, "{:1$}AND", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
}
Operation::Phrase(children) => {
writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
},
}
Operation::Or(true, children) => {
writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
}
Operation::Or(false, children) => {
writeln!(f, "{:1$}OR", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
}
Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
}
}
@ -136,10 +136,12 @@ impl fmt::Debug for Query {
match kind {
QueryKind::Exact { word, .. } => {
f.debug_struct(&(prefix + "Exact")).field("word", &word).finish()
},
QueryKind::Tolerant { typo, word } => {
f.debug_struct(&(prefix + "Tolerant")).field("word", &word).field("max typo", &typo).finish()
},
}
QueryKind::Tolerant { typo, word } => f
.debug_struct(&(prefix + "Tolerant"))
.field("word", &word)
.field("max typo", &typo)
.finish(),
}
}
}
@ -223,7 +225,12 @@ impl<'a> QueryTreeBuilder<'a> {
let stop_words = self.index.stop_words(self.rtxn)?;
let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
if !primitive_query.is_empty() {
let qt = create_query_tree(self, self.optional_words, self.authorize_typos, &primitive_query)?;
let qt = create_query_tree(
self,
self.optional_words,
self.authorize_typos,
&primitive_query,
)?;
Ok(Some((qt, primitive_query)))
} else {
Ok(None)
@ -248,12 +255,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
}
}
Ok(best.map(|(_, left, right)| Operation::Phrase(
vec![
left.to_string(),
right.to_string()
]
)))
Ok(best.map(|(_, left, right)| Operation::Phrase(vec![left.to_string(), right.to_string()])))
}
/// Return the `QueryKind` of a word depending on `authorize_typos`
@ -276,12 +278,18 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
let synonyms = ctx.synonyms(word)?;
Ok(synonyms.map(|synonyms| {
synonyms.into_iter().map(|synonym| {
let words = synonym.into_iter().map(|word| {
synonyms
.into_iter()
.map(|synonym| {
let words = synonym
.into_iter()
.map(|word| {
Operation::Query(Query { prefix: false, kind: QueryKind::exact(word) })
}).collect();
})
.collect();
Operation::and(words)
}).collect()
})
.collect()
}))
}
@ -291,15 +299,13 @@ fn create_query_tree(
optional_words: bool,
authorize_typos: bool,
query: &[PrimitiveQueryPart],
) -> Result<Operation>
{
) -> Result<Operation> {
/// Matches on the `PrimitiveQueryPart` and create an operation from it.
fn resolve_primitive_part(
ctx: &impl Context,
authorize_typos: bool,
part: PrimitiveQueryPart,
) -> Result<Operation>
{
) -> Result<Operation> {
match part {
// 1. try to split word in 2
// 2. try to fetch synonyms
@ -310,13 +316,12 @@ fn create_query_tree(
if let Some(child) = split_best_frequency(ctx, &word)? {
children.push(child);
}
children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
children
.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
Ok(Operation::or(false, children))
},
}
// create a CONSECUTIVE operation wrapping all word in the phrase
PrimitiveQueryPart::Phrase(words) => {
Ok(Operation::phrase(words))
},
PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)),
}
}
@ -325,8 +330,7 @@ fn create_query_tree(
ctx: &impl Context,
authorize_typos: bool,
query: &[PrimitiveQueryPart],
) -> Result<Operation>
{
) -> Result<Operation> {
const MAX_NGRAM: usize = 3;
let mut op_children = Vec::new();
@ -341,21 +345,26 @@ fn create_query_tree(
match group {
[part] => {
let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?;
let operation =
resolve_primitive_part(ctx, authorize_typos, part.clone())?;
and_op_children.push(operation);
},
}
words => {
let is_prefix = words.last().map_or(false, |part| part.is_prefix());
let words: Vec<_> = words.iter().filter_map(|part| {
let words: Vec<_> = words
.iter()
.filter_map(|part| {
if let PrimitiveQueryPart::Word(word, _) = part {
Some(word.as_str())
} else {
None
}
}).collect();
})
.collect();
let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
let concat = words.concat();
let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
let query =
Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
operations.push(Operation::Query(query));
and_op_children.push(Operation::or(false, operations));
}
@ -379,15 +388,16 @@ fn create_query_tree(
ctx: &impl Context,
authorize_typos: bool,
query: PrimitiveQuery,
) -> Result<Operation>
{
) -> Result<Operation> {
let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
let mut operation_children = Vec::new();
let start = number_phrases + (number_phrases == 0) as usize;
for len in start..=query.len() {
let mut word_count = len - number_phrases;
let query: Vec<_> = query.iter().filter(|p| {
let query: Vec<_> = query
.iter()
.filter(|p| {
if p.is_phrase() {
true
} else if word_count != 0 {
@ -434,7 +444,11 @@ impl PrimitiveQueryPart {
/// Create primitive query from tokenized query string,
/// the primitive query is an intermediate state to build the query tree.
fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, words_limit: Option<usize>) -> PrimitiveQuery {
fn create_primitive_query(
query: TokenStream,
stop_words: Option<Set<&[u8]>>,
words_limit: Option<usize>,
) -> PrimitiveQuery {
let mut primitive_query = Vec::new();
let mut phrase = Vec::new();
let mut quoted = false;
@ -444,7 +458,9 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
let mut peekable = query.peekable();
while let Some(token) = peekable.next() {
// early return if word limit is exceeded
if primitive_query.len() >= parts_limit { return primitive_query }
if primitive_query.len() >= parts_limit {
return primitive_query;
}
match token.kind {
TokenKind::Word | TokenKind::StopWord => {
@ -454,13 +470,17 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
if quoted {
phrase.push(token.word.to_string());
} else if peekable.peek().is_some() {
if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.word.as_ref())) {
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false));
if !stop_words
.as_ref()
.map_or(false, |swords| swords.contains(token.word.as_ref()))
{
primitive_query
.push(PrimitiveQueryPart::Word(token.word.to_string(), false));
}
} else {
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
}
},
}
TokenKind::Separator(separator_kind) => {
let quote_count = token.word.chars().filter(|&s| s == '"').count();
// swap quoted state if we encounter a double quote
@ -468,10 +488,11 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
quoted = !quoted;
}
// if there is a quote or a hard separator we close the phrase.
if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) {
if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
{
primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
}
},
}
_ => (),
}
}
@ -486,7 +507,7 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
/// Returns the maximum number of typos that this Operation allows.
pub fn maximum_typo(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Phrase};
use Operation::{And, Or, Phrase, Query};
match operation {
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
@ -498,13 +519,12 @@ pub fn maximum_typo(operation: &Operation) -> usize {
/// Returns the maximum proximity that this Operation allows.
pub fn maximum_proximity(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Phrase};
use Operation::{And, Or, Phrase, Query};
match operation {
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
And(ops) => {
ops.iter().map(maximum_proximity).sum::<usize>()
+ ops.len().saturating_sub(1) * 7
},
ops.iter().map(maximum_proximity).sum::<usize>() + ops.len().saturating_sub(1) * 7
}
Query(_) | Phrase(_) => 0,
}
}
@ -515,7 +535,8 @@ mod test {
use maplit::hashmap;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use rand::{Rng, SeedableRng, rngs::StdRng};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use super::*;
@ -532,11 +553,11 @@ mod test {
authorize_typos: bool,
words_limit: Option<usize>,
query: TokenStream,
) -> Result<Option<(Operation, PrimitiveQuery)>>
{
) -> Result<Option<(Operation, PrimitiveQuery)>> {
let primitive_query = create_primitive_query(query, None, words_limit);
if !primitive_query.is_empty() {
let qt = create_query_tree(self, optional_words, authorize_typos, &primitive_query)?;
let qt =
create_query_tree(self, optional_words, authorize_typos, &primitive_query)?;
Ok(Some((qt, primitive_query)))
} else {
Ok(None)
@ -571,7 +592,7 @@ mod test {
}
TestContext {
synonyms: hashmap!{
synonyms: hashmap! {
vec![String::from("hello")] => vec![
vec![String::from("hi")],
vec![String::from("good"), String::from("morning")],
@ -594,7 +615,7 @@ mod test {
vec![String::from("new"), String::from("york")],
],
},
postings: hashmap!{
postings: hashmap! {
String::from("hello") => random_postings(rng, 1500),
String::from("hi") => random_postings(rng, 4000),
String::from("word") => random_postings(rng, 2500),
@ -620,15 +641,28 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(false, vec![
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "friends".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "friends".to_string()),
}),
]),
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }),
]);
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(2, "heyfriends".to_string()),
}),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -640,15 +674,28 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(false, vec![
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friends".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friends".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }),
]);
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(2, "heyfriends".to_string()),
}),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -660,26 +707,60 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(false, vec![
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hi".to_string()) }),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hi".to_string()),
}),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("morning".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("good".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("morning".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "hello".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "hello".to_string()),
}),
],
),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("earth".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nature".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
],
),
]),
Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("earth".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("nature".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }),
]),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }),
]);
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(2, "helloworld".to_string()),
}),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -691,40 +772,95 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(false, vec![
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }),
Operation::Or(false, vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("new".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "yorkcity".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("york".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "yorkcity".to_string()),
}),
],
),
]),
Operation::And(vec![
Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nyc".to_string()),
}),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("new".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("york".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "newyork".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "newyork".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }),
]),
Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nyc".to_string()),
}),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("new".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("york".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "newyorkcity".to_string()) }),
]),
]);
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(2, "newyorkcity".to_string()),
}),
],
),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -736,15 +872,28 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(false, vec![
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("n".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "grams".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("n".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "grams".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }),
]);
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "ngrams".to_string()),
}),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -756,21 +905,34 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(false, vec![
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Or(false, vec![
Operation::Phrase(vec![
"word".to_string(),
"split".to_string(),
Operation::Or(
false,
vec![
Operation::Phrase(vec!["word".to_string(), "split".to_string()]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(2, "wordsplit".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("fish".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("fish".to_string()) })
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }),
]);
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(2, "wordsplitfish".to_string()),
}),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -783,14 +945,12 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Phrase(vec![
"hey".to_string(),
"friends".to_string(),
]),
Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
]);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -803,17 +963,12 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Phrase(vec![
"hey".to_string(),
"friends".to_string(),
]),
Operation::Phrase(vec![
"wooop".to_string(),
"wooop".to_string(),
]),
Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
Operation::Phrase(vec!["wooop".to_string(), "wooop".to_string()]),
]);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -825,34 +980,80 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(true, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Or(false, vec![
let expected = Operation::Or(
true,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }),
]),
Operation::Or(false, vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "heymy".to_string()),
}),
],
),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Or(false, vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friend".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "myfriend".to_string()) })
])
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "myfriend".to_string()),
}),
],
),
]),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "heymy".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friend".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }),
]),
]);
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(2, "heymyfriend".to_string()),
}),
],
),
],
);
let (query_tree, _) =
TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -864,11 +1065,9 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Phrase(vec![
"hey".to_string(),
"my".to_string(),
]);
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]);
let (query_tree, _) =
TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -880,29 +1079,66 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(true, vec![
let expected = Operation::Or(
true,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Or(false, vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("good".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "mygood".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "mygood".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }),
]),
]);
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
],
);
let (query_tree, _) =
TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -914,14 +1150,27 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Or(false, vec![
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friends".to_string()),
}),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }),
]);
let (query_tree, _) = TestContext::default().build(false, false, None, tokens).unwrap().unwrap();
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("heyfriends".to_string()),
}),
],
);
let (query_tree, _) =
TestContext::default().build(false, false, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}
@ -934,14 +1183,12 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Phrase(vec![
"hey".to_string(),
"my".to_string(),
]),
Operation::Phrase(vec!["hey".to_string(), "my".to_string()]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
]);
let (query_tree, _) = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap();
let (query_tree, _) =
TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
}

View File

@ -1,6 +1,7 @@
use std::iter::{Chain, FromIterator};
use std::ops::RangeInclusive;
use roaring::bitmap::{RoaringBitmap, IntoIter};
use roaring::bitmap::{IntoIter, RoaringBitmap};
pub struct AvailableDocumentsIds {
iter: Chain<IntoIter, RangeInclusive<u32>>,
@ -18,16 +19,12 @@ impl AvailableDocumentsIds {
None => 1..=0, // empty range iterator
};
AvailableDocumentsIds {
iter: available.into_iter().chain(iter),
AvailableDocumentsIds { iter: available.into_iter().chain(iter) }
}
},
None => {
let empty = RoaringBitmap::new().into_iter();
AvailableDocumentsIds {
iter: empty.chain(0..=u32::max_value()),
AvailableDocumentsIds { iter: empty.chain(0..=u32::max_value()) }
}
},
}
}
}

View File

@ -1,7 +1,7 @@
use chrono::Utc;
use roaring::RoaringBitmap;
use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result};
use crate::{ExternalDocumentsIds, FieldsDistribution, Index, Result};
pub struct ClearDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -13,9 +13,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64
update_id: u64,
) -> ClearDocuments<'t, 'u, 'i> {
ClearDocuments { wtxn, index, _update_id: update_id }
}
@ -80,8 +79,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
mod tests {
use heed::EnvOpenOptions;
use crate::update::{IndexDocuments, UpdateFormat};
use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
#[test]
fn clear_documents() {

View File

@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use chrono::Utc;
use fst::IntoStreamer;
@ -7,11 +7,11 @@ use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap;
use serde_json::Value;
use crate::error::{InternalError, FieldIdMapMissingEntry, UserError};
use super::ClearDocuments;
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key};
use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result};
use super::ClearDocuments;
use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -26,11 +26,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> Result<DeleteDocuments<'t, 'u, 'i>>
{
let external_documents_ids = index
.external_documents_ids(wtxn)?
.into_static();
) -> Result<DeleteDocuments<'t, 'u, 'i>> {
let external_documents_ids = index.external_documents_ids(wtxn)?.into_static();
Ok(DeleteDocuments {
wtxn,
@ -84,11 +81,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
key: Some(main_key::PRIMARY_KEY_KEY),
}
})?;
let id_field = fields_ids_map.id(primary_key).ok_or_else(|| {
FieldIdMapMissingEntry::FieldName {
let id_field =
fields_ids_map.id(primary_key).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: primary_key.to_string(),
process: "DeleteDocuments::execute",
}
})?;
let Index {
@ -130,7 +126,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let external_id = match serde_json::from_slice(content).unwrap() {
Value::String(string) => SmallString32::from(string.as_str()),
Value::Number(number) => SmallString32::from(number.to_string()),
document_id => return Err(UserError::InvalidDocumentId { document_id }.into()),
document_id => {
return Err(UserError::InvalidDocumentId { document_id }.into())
}
};
external_ids.push(external_id);
}
@ -160,7 +158,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
match entry.get().checked_sub(count_diff) {
Some(0) | None => entry.remove(),
Some(count) => entry.insert(count)
Some(count) => entry.insert(count),
};
}
}
@ -206,9 +204,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
}
// We construct an FST set that contains the words to delete from the words FST.
let words_to_delete = words.iter().filter_map(|(word, must_remove)| {
if *must_remove { Some(word.as_ref()) } else { None }
});
let words_to_delete =
words.iter().filter_map(
|(word, must_remove)| {
if *must_remove {
Some(word.as_ref())
} else {
None
}
},
);
let words_to_delete = fst::Set::from_iter(words_to_delete)?;
let new_words_fst = {
@ -285,7 +290,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// We delete the documents ids that are under the pairs of words,
// it is faster and use no memory to iterate over all the words pairs than
// to compute the cartesian product of every words of the deleted documents.
let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
let mut iter =
word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
@ -300,7 +306,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the documents ids that are under the word level position docids.
let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
let mut iter =
word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
@ -315,7 +322,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the documents ids that are under the word prefix level position docids.
let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
let mut iter =
word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
@ -397,12 +405,11 @@ fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>(
convert: F,
) -> heed::Result<()>
where
C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>,
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
F: Fn(K) -> DocumentId,
{
let mut iter = db.remap_key_type::<ByteSlice>()
.prefix_iter_mut(wtxn, &[field_id])?
.remap_key_type::<C>();
let mut iter =
db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>();
while let Some(result) = iter.next() {
let (key, ()) = result?;
@ -441,8 +448,8 @@ where
mod tests {
use heed::EnvOpenOptions;
use crate::update::{IndexDocuments, UpdateFormat};
use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
#[test]
fn delete_documents_with_numbers_as_primary_key() {

View File

@ -3,17 +3,18 @@ use std::fs::File;
use std::num::NonZeroUsize;
use chrono::Utc;
use grenad::{CompressionType, Reader, Writer, FileFuse};
use grenad::{CompressionType, FileFuse, Reader, Writer};
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesEncode, Error};
use log::debug;
use roaring::RoaringBitmap;
use crate::error::InternalError;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::{
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
};
use crate::{Index, Result};
pub struct Facets<'t, 'u, 'i> {
@ -32,8 +33,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> Facets<'t, 'u, 'i>
{
) -> Facets<'t, 'u, 'i> {
Facets {
wtxn,
index,
@ -72,11 +72,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
)?;
// Clear the facet number levels.
clear_field_number_levels(
self.wtxn,
self.index.facet_id_f64_docids,
field_id,
)?;
clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?;
// Compute and store the faceted numbers documents ids.
let number_documents_ids = compute_faceted_documents_ids(
@ -96,8 +92,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
field_id,
)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?;
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?;
self.index.put_string_faceted_documents_ids(
self.wtxn,
field_id,
&string_documents_ids,
)?;
self.index.put_number_faceted_documents_ids(
self.wtxn,
field_id,
&number_documents_ids,
)?;
write_into_lmdb_database(
self.wtxn,
@ -112,12 +116,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
}
}
fn clear_field_number_levels<'t, >(
fn clear_field_number_levels<'t>(
wtxn: &'t mut heed::RwTxn,
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
field_id: u8,
) -> heed::Result<()>
{
) -> heed::Result<()> {
let left = (field_id, 1, f64::MIN, f64::MIN);
let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
let range = left..=right;
@ -133,8 +136,7 @@ fn compute_facet_number_levels<'t>(
level_group_size: NonZeroUsize,
min_level_size: NonZeroUsize,
field_id: u8,
) -> Result<Reader<FileFuse>>
{
) -> Result<Reader<FileFuse>> {
let first_level_size = db
.remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &[field_id])?
@ -143,9 +145,8 @@ fn compute_facet_number_levels<'t>(
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
// therefore we write the facet levels entries into a grenad file before transfering them.
let mut writer = tempfile::tempfile().and_then(|file| {
create_writer(compression_type, compression_level, file)
})?;
let mut writer = tempfile::tempfile()
.and_then(|file| create_writer(compression_type, compression_level, file))?;
let level_0_range = {
let left = (field_id, 0, f64::MIN, f64::MIN);
@ -196,8 +197,7 @@ fn compute_faceted_documents_ids(
rtxn: &heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &[field_id])? {
@ -215,8 +215,7 @@ fn write_number_entry(
left: f64,
right: f64,
ids: &RoaringBitmap,
) -> Result<()>
{
) -> Result<()> {
let key = (field_id, level, left, right);
let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?;
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;

View File

@ -1,7 +1,7 @@
use std::borrow::Cow;
use std::collections::HashSet;
use std::fs::File;
use std::io::{self, Seek, SeekFrom, BufReader, BufRead};
use std::io::{self, BufRead, BufReader, Seek, SeekFrom};
use std::num::{NonZeroU32, NonZeroUsize};
use std::result::Result as StdResult;
use std::str;
@ -10,28 +10,26 @@ use std::time::Instant;
use bstr::ByteSlice as _;
use chrono::Utc;
use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType};
use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer};
use heed::types::ByteSlice;
use log::{debug, info, error};
use log::{debug, error, info};
use memmap::Mmap;
use rayon::prelude::*;
use rayon::ThreadPool;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use crate::error::{Error, InternalError};
use crate::{Index, Result};
use crate::update::{
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
WordPrefixPairProximityDocids,
};
use self::store::{Store, Readers};
pub use self::merge_function::{
fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
};
use self::store::{Readers, Store};
pub use self::transform::{Transform, TransformOutput};
use crate::MergeFn;
use super::UpdateBuilder;
use crate::error::{Error, InternalError};
use crate::update::{
Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
WordsLevelPositions, WordsPrefixesFst,
};
use crate::{Index, MergeFn, Result};
mod merge_function;
mod store;
@ -48,7 +46,11 @@ pub enum WriteMethod {
GetMergePut,
}
pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
pub fn create_writer(
typ: CompressionType,
level: Option<u32>,
file: File,
) -> io::Result<Writer<File>> {
let mut builder = Writer::builder();
builder.compression_type(typ);
if let Some(level) = level {
@ -64,8 +66,7 @@ pub fn create_sorter<E>(
chunk_fusing_shrink_size: Option<u64>,
max_nb_chunks: Option<usize>,
max_memory: Option<usize>,
) -> Sorter<MergeFn<E>>
{
) -> Sorter<MergeFn<E>> {
let mut builder = Sorter::builder(merge);
if let Some(shrink_size) = chunk_fusing_shrink_size {
builder.file_fusing_shrink_size(shrink_size);
@ -83,7 +84,10 @@ pub fn create_sorter<E>(
builder.build()
}
pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Result<Reader<FileFuse>> {
pub fn writer_into_reader(
writer: Writer<File>,
shrink_size: Option<u64>,
) -> Result<Reader<FileFuse>> {
let mut file = writer.into_inner()?;
file.seek(SeekFrom::Start(0))?;
let file = if let Some(shrink_size) = shrink_size {
@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Res
pub fn merge_readers<E>(
sources: Vec<Reader<FileFuse>>,
merge: MergeFn<E>,
) -> Merger<FileFuse, MergeFn<E>>
{
) -> Merger<FileFuse, MergeFn<E>> {
let mut builder = Merger::builder(merge);
builder.extend(sources);
builder.build()
@ -118,13 +121,7 @@ where
let before = Instant::now();
let merger = merge_readers(sources, merge);
merger_iter_into_lmdb_database(
wtxn,
database,
merger.into_merge_iter()?,
merge,
method,
)?;
merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?;
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
Ok(())
@ -149,7 +146,7 @@ where
while let Some((k, v)) = reader.next()? {
out_iter.append(k, v)?;
}
},
}
WriteMethod::GetMergePut => {
while let Some((k, v)) = reader.next()? {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
@ -158,11 +155,11 @@ where
let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..];
let val = merge(k, &vals)?;
iter.put_current(k, &val)?;
},
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
},
}
}
}
}
@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database<E>(
) -> Result<()>
where
Error: From<E>,
Error: From<grenad::Error<E>>
Error: From<grenad::Error<E>>,
{
debug!("Writing MTBL sorter...");
let before = Instant::now();
merger_iter_into_lmdb_database(
wtxn,
database,
sorter.into_iter()?,
merge,
method,
)?;
merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?;
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
Ok(())
@ -214,7 +205,7 @@ where
while let Some((k, v)) = sorter.next()? {
out_iter.append(k, v)?;
}
},
}
WriteMethod::GetMergePut => {
while let Some((k, v)) = sorter.next()? {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
@ -226,14 +217,14 @@ where
InternalError::IndexingMergingKeys { process: "get-put-merge" }
})?;
iter.put_current(k, &val)?;
},
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
},
}
}
},
}
}
}
Ok(())
@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// Early return when there is no document to add
if reader.buffer().is_empty() {
return Ok(DocumentAdditionResult {
nb_documents: 0,
})
return Ok(DocumentAdditionResult { nb_documents: 0 });
}
self.index.set_updated_at(self.wtxn, &Utc::now())?;
@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let output = match self.update_format {
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?,
UpdateFormat::JsonStream => {
transform.output_from_json_stream(reader, &progress_callback)?
}
};
let nb_documents = output.documents_count;
@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()>
where
F: Fn(UpdateIndexingStep) + Sync
F: Fn(UpdateIndexingStep) + Sync,
{
let before_indexing = Instant::now();
@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
&backup_pool
},
}
};
let readers = pool.install(|| {
@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
let contains_documents = !documents_ids.is_empty();
let write_method = if contains_documents {
WriteMethod::GetMergePut
} else {
WriteMethod::Append
};
let write_method =
if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append };
debug!("Writing using the write method: {:?}", write_method);
@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
*self.index.docid_word_positions.as_polymorph(),
docid_word_positions_readers,
keep_first,
write_method
write_method,
)?;
database_count += 1;
@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
*self.index.documents.as_polymorph(),
documents_readers,
keep_first,
write_method
write_method,
)?;
database_count += 1;
@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
fst_merge,
WriteMethod::GetMergePut,
)?;
},
}
DatabaseType::WordDocids => {
debug!("Writing the words docids into LMDB on disk...");
let db = *self.index.word_docids.as_polymorph();
@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::FacetLevel0NumbersDocids => {
debug!("Writing the facet numbers docids into LMDB on disk...");
let db = *self.index.facet_id_f64_docids.as_polymorph();
@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
cbo_roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::FieldIdWordCountDocids => {
debug!("Writing the field id word count docids into LMDB on disk...");
let db = *self.index.field_id_word_count_docids.as_polymorph();
@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
cbo_roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::WordLevel0PositionDocids => {
debug!("Writing the word level 0 positions docids into LMDB on disk...");
let db = *self.index.word_level_position_docids.as_polymorph();
@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
#[cfg(test)]
mod tests {
use super::*;
use heed::EnvOpenOptions;
use super::*;
#[test]
fn simple_document_replacement() {
let path = tempfile::tempdir().unwrap();
@ -1053,9 +1042,8 @@ mod tests {
assert_eq!(count, 3);
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
let (kevin_id, _) = docs.iter().find(|(_, d)| {
d.get(0).unwrap() == br#""updated kevin""#
}).unwrap();
let (kevin_id, _) =
docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
let (id, doc) = docs[*kevin_id as usize];
assert_eq!(id, *kevin_id);

View File

@ -8,25 +8,29 @@ use std::{cmp, iter};
use bstr::ByteSlice as _;
use fst::Set;
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
use heed::BytesEncode;
use linked_hash_map::LinkedHashMap;
use log::{debug, info};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
use meilisearch_tokenizer::token::SeparatorKind;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value;
use tempfile::tempfile;
use super::merge_function::{
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
};
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
use crate::error::{Error, InternalError, SerializationError};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
FieldDocIdFacetStringCodec,
};
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge};
use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32};
const LMDB_MAX_KEY_LENGTH: usize = 511;
const ONE_KILOBYTE: usize = 1024 * 1024;
@ -56,7 +60,8 @@ pub struct Store<'s, A> {
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize,
field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids:
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids_limit: usize,
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
chunk_compression_level: Option<u32>,
chunk_fusing_shrink_size: Option<u64>,
stop_words: Option<&'s Set<A>>,
) -> Result<Self>
{
) -> Result<Self> {
// We divide the max memory by the number of sorter the Store have.
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Some(1024 * 1024 * 1024), // 1MB
);
let documents_writer = tempfile().and_then(|f| {
create_writer(chunk_compression_type, chunk_compression_level, f)
})?;
let docid_word_positions_writer = tempfile().and_then(|f| {
create_writer(chunk_compression_type, chunk_compression_level, f)
})?;
let documents_writer = tempfile()
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
let docid_word_positions_writer = tempfile()
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
let mut config = AnalyzerConfig::default();
if let Some(stop_words) = stop_words {
@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> {
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.word_docids.get_refresh(word.as_bytes()) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
let word_vec = SmallVec32::from(word.as_bytes());
// A newly inserted element is append at the end of the linked hash map.
@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
field_id: FieldId,
value: OrderedFloat<f64>,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
let sorter = &mut self.field_id_docid_facet_numbers_sorter;
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_number_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is append at the end of the linked hash map.
self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
field_id: FieldId,
value: String,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
let sorter = &mut self.field_id_docid_facet_strings_sorter;
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_string_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is append at the end of the linked hash map.
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
@ -309,10 +315,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// Save the documents ids under the words pairs proximities that it contains.
fn insert_words_pairs_proximities_docids<'a>(
&mut self,
words_pairs_proximities: impl IntoIterator<Item=((&'a str, &'a str), u8)>,
words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
for ((w1, w2), prox) in words_pairs_proximities {
let w1 = SmallVec32::from(w1.as_bytes());
let w2 = SmallVec32::from(w2.as_bytes());
@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// if get_refresh finds the element it is assured
// to be at the end of the linked hash map.
match self.words_pairs_proximities_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is append at the end of the linked hash map.
let ids = RoaringBitmap::from_iter(Some(id));
@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// Removing front elements is equivalent to removing the LRUs.
let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front());
iter.take(overflow).for_each(|x| lrus.push(x));
Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?;
Self::write_words_pairs_proximities(
&mut self.words_pairs_proximities_docids_sorter,
lrus,
)?;
}
Ok(())
@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
record: &[u8],
) -> Result<()>
{
) -> Result<()> {
// We compute the list of words pairs proximities (self-join) and write it directly to disk.
let words_pair_proximities = compute_words_pair_proximities(&words_positions);
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?;
Self::write_docid_word_positions(
&mut self.docid_word_positions_writer,
document_id,
words_positions,
)?;
Self::write_word_position_docids(
&mut self.word_level_position_docids_sorter,
document_id,
words_positions,
)?;
words_positions.clear();
@ -387,7 +404,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn write_words_pairs_proximities<E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: impl IntoIterator<Item=((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
iter: impl IntoIterator<Item = ((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
) -> Result<()>
where
Error: From<E>,
@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
writer: &mut Writer<File>,
id: DocumentId,
words_positions: &HashMap<String, SmallVec32<Position>>,
) -> Result<()>
{
) -> Result<()> {
// We prefix the words by the document id.
let mut key = id.to_be_bytes().to_vec();
let mut buffer = Vec::new();
@ -484,12 +500,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_facet_field_string_docids<I, E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: I,
) -> Result<()>
fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=((FieldId, String), RoaringBitmap)>,
I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>,
Error: From<E>,
{
let mut key_buffer = Vec::new();
@ -510,12 +523,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_facet_field_number_docids<I, E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: I,
) -> Result<()>
fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
Error: From<E>,
{
let mut data_buffer = Vec::new();
@ -579,7 +589,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=(SmallVec32<u8>, RoaringBitmap)>,
I: IntoIterator<Item = (SmallVec32<u8>, RoaringBitmap)>,
Error: From<E>,
{
let mut key = Vec::new();
@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
log_every_n: Option<usize>,
mut progress_callback: F,
) -> Result<Readers>
where F: FnMut(UpdateIndexingStep),
where
F: FnMut(UpdateIndexingStep),
{
debug!("{:?}: Indexing in a Store...", thread_index);
@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
if count % num_threads == thread_index {
// This is a log routine that we do every `log_every_n` documents.
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
info!(
"We have seen {} documents so far ({:.02?}).",
format_count(count),
before.elapsed()
);
progress_callback(UpdateIndexingStep::IndexDocuments {
documents_seen: count,
total_documents: documents_count,
@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}
for (attr, content) in document.iter() {
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) {
let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr)
{
let value =
serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
let (facet_numbers, facet_strings) = extract_facet_values(&value);
facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers);
facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings);
facet_numbers_values
.entry(attr)
.or_insert_with(Vec::new)
.extend(facet_numbers);
facet_strings_values
.entry(attr)
.or_insert_with(Vec::new)
.extend(facet_strings);
if self.searchable_fields.contains(&attr) {
let content = match json_to_string(&value) {
@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
last_pos = Some(pos);
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
words_positions
.entry(token.text().to_string())
.or_insert_with(SmallVec32::new)
.push(position);
}
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
let key = (attr, last_pos as u8 + 1);
self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id);
self.field_id_word_count_docids
.entry(key)
.or_insert_with(RoaringBitmap::new)
.insert(document_id);
}
}
}
@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
self.facet_field_string_docids,
)?;
let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut word_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut builder = fst::SetBuilder::memory();
let mut iter = self.word_docids_sorter.into_iter()?;
@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.main_sorter.write_into(&mut main_wtr)?;
let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
let mut words_pairs_proximities_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.words_pairs_proximities_docids_sorter
.write_into(&mut words_pairs_proximities_docids_wtr)?;
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut word_level_position_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut field_id_word_count_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut facet_field_numbers_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut facet_field_strings_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_numbers_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter
.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?;
let mut field_id_docid_facet_strings_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter
.write_into(&mut field_id_docid_facet_strings_wtr)?;
let main = writer_into_reader(main_wtr, shrink_size)?;
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let words_pairs_proximities_docids =
writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids =
writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let field_id_word_count_docids =
writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
let facet_field_numbers_docids =
writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids =
writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers =
writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings =
writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions =
writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
Ok(Readers {
@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
/// close to each other.
fn compute_words_pair_proximities(
word_positions: &HashMap<String, SmallVec32<Position>>,
) -> HashMap<(&str, &str), u8>
{
) -> HashMap<(&str, &str), u8> {
use itertools::Itertools;
let mut words_pair_proximities = HashMap::new();
@ -828,7 +875,9 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool {
/// take an iterator on tokens and compute their relative position depending on separator kinds
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
/// else we keep the standart proximity of 1 between words.
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
fn process_tokens<'a>(
tokens: impl Iterator<Item = Token<'a>>,
) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens
.skip_while(|token| token.is_separator().is_some())
.scan((0, None), |(offset, prev_kind), token| {
@ -845,7 +894,8 @@ fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<
*prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
{
*prev_kind = Some(token.kind);
}
_ => (),
@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
match value {
Value::Null => (),
Value::Bool(b) => output_strings.push(b.to_string()),
Value::Number(number) => if let Some(float) = number.as_f64() {
Value::Number(number) => {
if let Some(float) = number.as_f64() {
output_numbers.push(float);
},
}
}
Value::String(string) => {
let string = string.trim().to_lowercase();
output_strings.push(string);
},
Value::Array(values) => if can_recurse {
}
Value::Array(values) => {
if can_recurse {
for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings);
}
},
}
}
Value::Object(_) => (),
}
}

View File

@ -10,14 +10,15 @@ use log::info;
use roaring::RoaringBitmap;
use serde_json::{Map, Value};
use crate::error::{Error, UserError, InternalError};
use crate::index::db_name;
use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
use crate::{Index, Result};
use super::merge_function::merge_two_obkvs;
use super::{create_writer, create_sorter, IndexDocumentsMethod};
use super::{create_sorter, create_writer, IndexDocumentsMethod};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32,
};
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
@ -64,7 +65,11 @@ impl Transform<'_, '_> {
self.output_from_generic_json(reader, false, progress_callback)
}
pub fn output_from_json_stream<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
pub fn output_from_json_stream<R, F>(
self,
reader: R,
progress_callback: F,
) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
@ -86,14 +91,16 @@ impl Transform<'_, '_> {
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
// Deserialize the whole batch of documents in memory.
let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream {
let mut documents: Peekable<
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
> = if is_stream {
let iter = serde_json::Deserializer::from_reader(reader).into_iter();
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
} else {
let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
let iter = vec.into_iter().map(Ok);
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
};
@ -104,15 +111,16 @@ impl Transform<'_, '_> {
Err(_) => {
let error = documents.next().unwrap().unwrap_err();
return Err(UserError::SerdeJson(error).into());
},
}
};
let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let alternative_name =
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let (primary_key_id, primary_key) = compute_primary_key_pair(
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids
self.autogenerate_docids,
)?;
if documents.peek().is_none() {
@ -173,9 +181,11 @@ impl Transform<'_, '_> {
Some(value) => match value {
Value::String(string) => Cow::Borrowed(string.as_str()),
Value::Number(number) => Cow::Owned(number.to_string()),
content => return Err(UserError::InvalidDocumentId {
document_id: content.clone(),
}.into()),
content => {
return Err(
UserError::InvalidDocumentId { document_id: content.clone() }.into()
)
}
},
None => {
if !self.autogenerate_docids {
@ -183,7 +193,7 @@ impl Transform<'_, '_> {
}
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
Cow::Borrowed(uuid)
},
}
};
// We iterate in the fields ids ordered.
@ -194,7 +204,8 @@ impl Transform<'_, '_> {
// and this should be the document id we return the one we generated.
if let Some(value) = document.get(name) {
// We serialize the attribute values.
serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?;
serde_json::to_writer(&mut json_buffer, value)
.map_err(InternalError::SerdeJson)?;
writer.insert(field_id, &json_buffer)?;
}
@ -202,7 +213,8 @@ impl Transform<'_, '_> {
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}.into());
}
.into());
}
}
@ -250,7 +262,7 @@ impl Transform<'_, '_> {
Some(primary_key) => {
// The primary key is known so we must find the position in the CSV headers.
headers.iter().position(|h| h == primary_key)
},
}
None => headers.iter().position(is_primary_key),
};
@ -261,7 +273,7 @@ impl Transform<'_, '_> {
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids
self.autogenerate_docids,
)?;
// The primary key field is not present in the header, so we need to create it.
@ -308,18 +320,20 @@ impl Transform<'_, '_> {
// We validate the document id [a-zA-Z0-9\-_].
match validate_document_id(&external_id) {
Some(valid) => valid,
None => return Err(UserError::InvalidDocumentId {
None => {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}.into()),
}
},
.into())
}
}
}
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
};
// When the primary_key_field_id is found in the fields ids list
// we return the generated document id instead of the record field.
let iter = fields_ids.iter()
.map(|(fi, i)| {
let iter = fields_ids.iter().map(|(fi, i)| {
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field)
});
@ -328,7 +342,8 @@ impl Transform<'_, '_> {
for (field_id, field) in iter {
// We serialize the attribute values as JSON strings.
json_buffer.clear();
serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?;
serde_json::to_writer(&mut json_buffer, &field)
.map_err(InternalError::SerdeJson)?;
writer.insert(*field_id, &json_buffer)?;
}
@ -410,26 +425,27 @@ impl Transform<'_, '_> {
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
IndexDocumentsMethod::UpdateDocuments => {
let key = BEU32::new(docid);
let base_obkv = self.index.documents.get(&self.rtxn, &key)?
.ok_or(InternalError::DatabaseMissingEntry {
let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;
},
)?;
let update_obkv = obkv::KvReader::new(update_obkv);
merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
(docid, obkv_buffer.as_slice())
}
}
},
}
None => {
// If this user id is new we add it to the external documents ids map
// for new ids and into the list of new documents.
let new_docid = available_documents_ids.next()
.ok_or(UserError::DocumentLimitReached)?;
let new_docid =
available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?;
new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
new_documents_ids.insert(new_docid);
(new_docid, update_obkv)
},
}
};
// We insert the document under the documents ids map into the final file.
@ -450,7 +466,8 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut writer =
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
// Once we have written all the documents into the final sorter, we write the documents
// into this writer, extract the file and reset the seek to be able to read it again.
@ -485,8 +502,7 @@ impl Transform<'_, '_> {
primary_key: String,
old_fields_ids_map: FieldsIdsMap,
new_fields_ids_map: FieldsIdsMap,
) -> Result<TransformOutput>
{
) -> Result<TransformOutput> {
let fields_distribution = self.index.fields_distribution(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
let documents_ids = self.index.documents_ids(self.rtxn)?;
@ -494,7 +510,8 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut writer =
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut obkv_buffer = Vec::new();
for result in self.index.documents.iter(self.rtxn)? {
@ -561,20 +578,19 @@ fn compute_primary_key_pair(
return Err(UserError::MissingPrimaryKey.into());
}
DEFAULT_PRIMARY_KEY_NAME.to_string()
},
}
};
let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
Ok((id, name))
},
}
}
}
fn validate_document_id(document_id: &str) -> Option<&str> {
let document_id = document_id.trim();
Some(document_id).filter(|id| {
!id.is_empty() && id.chars().all(|c| {
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
})
!id.is_empty()
&& id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
})
}
@ -583,8 +599,7 @@ mod test {
use super::*;
mod compute_primary_key {
use super::compute_primary_key_pair;
use super::FieldsIdsMap;
use super::{compute_primary_key_pair, FieldsIdsMap};
#[test]
fn should_return_primary_key_if_is_some() {
@ -594,7 +609,8 @@ mod test {
Some("toto"),
&mut fields_map,
Some("tata".to_string()),
false);
false,
);
assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
assert_eq!(fields_map.len(), 1);
}
@ -602,11 +618,8 @@ mod test {
#[test]
fn should_return_alternative_if_primary_is_none() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
Some("tata".to_string()),
false);
let result =
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
assert_eq!(fields_map.len(), 1);
}
@ -614,23 +627,15 @@ mod test {
#[test]
fn should_return_default_if_both_are_none() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
None,
true);
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
assert_eq!(result.unwrap(), (0u8, "id".to_string()));
assert_eq!(fields_map.len(), 1);
}
#[test]
fn should_return_err_if_both_are_none_and_recompute_is_false(){
fn should_return_err_if_both_are_none_and_recompute_is_false() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
None,
false);
let result = compute_primary_key_pair(None, &mut fields_map, None, false);
assert!(result.is_err());
assert_eq!(fields_map.len(), 0);
}

View File

@ -2,7 +2,9 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
pub use self::clear_documents::ClearDocuments;
pub use self::delete_documents::DeleteDocuments;
pub use self::facets::Facets;
pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat};
pub use self::index_documents::{
DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat,
};
pub use self::settings::{Setting, Settings};
pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep;

View File

@ -34,17 +34,24 @@ impl<T> Setting<T> {
}
impl<T: Serialize> Serialize for Setting<T> {
fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error> where S: Serializer {
fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error>
where
S: Serializer,
{
match self {
Self::Set(value) => Some(value),
// Usually not_set isn't serialized by setting skip_serializing_if field attribute
Self::NotSet | Self::Reset => None,
}.serialize(serializer)
}
.serialize(serializer)
}
}
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error> where D: Deserializer<'de> {
fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error>
where
D: Deserializer<'de>,
{
Deserialize::deserialize(deserializer).map(|x| match x {
Some(x) => Self::Set(x),
None => Self::Reset, // Reset is forced by sending null value
@ -141,11 +148,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
self.stop_words = if stop_words.is_empty() {
Setting::Reset
} else {
Setting::Set(stop_words)
}
self.stop_words =
if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) }
}
pub fn reset_distinct_field(&mut self) {
@ -161,11 +165,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
self.synonyms = if synonyms.is_empty() {
Setting::Reset
} else {
Setting::Set(synonyms)
}
self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
}
pub fn reset_primary_key(&mut self) {
@ -178,7 +178,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
where
F: Fn(UpdateIndexingStep, u64) + Sync
F: Fn(UpdateIndexingStep, u64) + Sync,
{
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let update_id = self.update_id;
@ -203,7 +203,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
};
// There already has been a document addition, the primary key should be set by now.
let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?;
let primary_key =
self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?;
// We remap the documents fields based on the new `FieldsIdsMap`.
let output = transform.remap_index_documents(
@ -236,21 +237,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
Setting::Set(ref fields) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
// fields are deduplicated, only the first occurrence is taken into account
let names: Vec<_> = fields
.iter()
.unique()
.map(String::as_str)
.collect();
let names: Vec<_> = fields.iter().unique().map(String::as_str).collect();
for name in names.iter() {
fields_ids_map
.insert(name)
.ok_or(UserError::AttributeLimitReached)?;
fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
}
self.index.put_displayed_fields(self.wtxn, &names)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; }
Setting::Reset => {
self.index.delete_displayed_fields(self.wtxn)?;
}
Setting::NotSet => return Ok(false),
}
Ok(true)
@ -260,14 +257,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
match self.distinct_field {
Setting::Set(ref attr) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
fields_ids_map
.insert(attr)
.ok_or(UserError::AttributeLimitReached)?;
fields_ids_map.insert(attr).ok_or(UserError::AttributeLimitReached)?;
self.index.put_distinct_field(self.wtxn, &attr)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; },
Setting::Reset => {
self.index.delete_distinct_field(self.wtxn)?;
}
Setting::NotSet => return Ok(false),
}
Ok(true)
@ -285,30 +282,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let mut new_fields_ids_map = FieldsIdsMap::new();
// fields are deduplicated, only the first occurrence is taken into account
let names = fields
.iter()
.unique()
.map(String::as_str)
.collect::<Vec<_>>();
let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>();
// Add all the searchable attributes to the field map, and then add the
// remaining fields from the old field map to the new one
for name in names.iter() {
new_fields_ids_map
.insert(&name)
.ok_or(UserError::AttributeLimitReached)?;
new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
}
for (_, name) in old_fields_ids_map.iter() {
new_fields_ids_map
.insert(&name)
.ok_or(UserError::AttributeLimitReached)?;
new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
}
self.index.put_searchable_fields(self.wtxn, &names)?;
self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
}
Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; }
Setting::Reset => {
self.index.delete_searchable_fields(self.wtxn)?;
}
Setting::NotSet => return Ok(false),
}
Ok(true)
@ -323,7 +314,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let fst = fst::Set::from_iter(stop_words)?;
// Does the new FST differ from the previous one?
if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
if current
.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes())
{
// we want to re-create our FST.
self.index.put_stop_words(self.wtxn, &fst)?;
Ok(true)
@ -343,9 +336,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
analyzer
.analyze(text)
.tokens()
.filter_map(|token|
if token.is_word() { Some(token.text().to_string()) } else { None }
)
.filter_map(|token| {
if token.is_word() {
Some(token.text().to_string())
} else {
None
}
})
.collect::<Vec<_>>()
}
@ -360,22 +357,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
for (word, synonyms) in synonyms {
// Normalize both the word and associated synonyms.
let normalized_word = normalize(&analyzer, word);
let normalized_synonyms = synonyms
.iter()
.map(|synonym| normalize(&analyzer, synonym));
let normalized_synonyms =
synonyms.iter().map(|synonym| normalize(&analyzer, synonym));
// Store the normalized synonyms under the normalized word,
// merging the possible duplicate words.
let entry = new_synonyms
.entry(normalized_word)
.or_insert_with(Vec::new);
let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
entry.extend(normalized_synonyms);
}
// Make sure that we don't have duplicate synonyms.
new_synonyms
.iter_mut()
.for_each(|(_, synonyms)| {
new_synonyms.iter_mut().for_each(|(_, synonyms)| {
synonyms.sort_unstable();
synonyms.dedup();
});
@ -406,7 +398,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.index.put_filterable_fields(self.wtxn, &new_facets)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; }
Setting::Reset => {
self.index.delete_filterable_fields(self.wtxn)?;
}
Setting::NotSet => (),
}
Ok(())
@ -427,7 +421,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.index.put_criteria(self.wtxn, &new_criteria)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_criteria(self.wtxn)?; }
Setting::Reset => {
self.index.delete_criteria(self.wtxn)?;
}
Setting::NotSet => (),
}
Ok(())
@ -445,7 +441,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} else {
Err(UserError::PrimaryKeyCannotBeChanged.into())
}
},
}
Setting::Reset => {
if self.index.number_of_documents(&self.wtxn)? == 0 {
self.index.delete_primary_key(self.wtxn)?;
@ -453,14 +449,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} else {
Err(UserError::PrimaryKeyCannotBeReset.into())
}
},
}
Setting::NotSet => Ok(()),
}
}
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
where
F: Fn(UpdateIndexingStep, u64) + Sync
F: Fn(UpdateIndexingStep, u64) + Sync,
{
self.index.set_updated_at(self.wtxn, &Utc::now())?;
@ -493,17 +489,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
#[cfg(test)]
mod tests {
use heed::EnvOpenOptions;
use heed::types::ByteSlice;
use maplit::{btreeset, hashmap, hashset};
use big_s::S;
use heed::types::ByteSlice;
use heed::EnvOpenOptions;
use maplit::{btreeset, hashmap, hashset};
use super::*;
use crate::error::Error;
use crate::update::{IndexDocuments, UpdateFormat};
use crate::{Criterion, FilterCondition, SearchResult};
use super::*;
#[test]
fn set_and_reset_searchable_fields() {
let path = tempfile::tempdir().unwrap();
@ -674,7 +669,7 @@ mod tests {
// Set the filterable fields to be the age.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_filterable_fields(hashset!{ S("age") });
builder.set_filterable_fields(hashset! { S("age") });
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
@ -692,12 +687,15 @@ mod tests {
// Check that the displayed fields are correctly set.
let rtxn = index.read_txn().unwrap();
let fields_ids = index.filterable_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashset!{ S("age") });
assert_eq!(fields_ids, hashset! { S("age") });
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids
let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
.prefix_iter(&rtxn, &[0, 0])
.unwrap()
.count();
assert_eq!(count, 3);
drop(rtxn);
@ -718,9 +716,12 @@ mod tests {
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids
let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
.prefix_iter(&rtxn, &[0, 0])
.unwrap()
.count();
assert_eq!(count, 4);
}
@ -969,7 +970,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_displayed_fields(vec!["hello".to_string()]);
builder.set_filterable_fields(hashset!{ S("age"), S("toto") });
builder.set_filterable_fields(hashset! { S("age"), S("toto") });
builder.set_criteria(vec!["asc(toto)".to_string()]);
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();

View File

@ -1,8 +1,8 @@
use grenad::CompressionType;
use rayon::ThreadPool;
use super::{ClearDocuments, DeleteDocuments, Facets, IndexDocuments, Settings};
use crate::{Index, Result};
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
pub struct UpdateBuilder<'a> {
pub(crate) log_every_n: Option<usize>,
@ -67,8 +67,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> ClearDocuments<'t, 'u, 'i>
{
) -> ClearDocuments<'t, 'u, 'i> {
ClearDocuments::new(wtxn, index, self.update_id)
}
@ -76,8 +75,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> Result<DeleteDocuments<'t, 'u, 'i>>
{
) -> Result<DeleteDocuments<'t, 'u, 'i>> {
DeleteDocuments::new(wtxn, index, self.update_id)
}
@ -85,8 +83,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> IndexDocuments<'t, 'u, 'i, 'a>
{
) -> IndexDocuments<'t, 'u, 'i, 'a> {
let mut builder = IndexDocuments::new(wtxn, index, self.update_id);
builder.log_every_n = self.log_every_n;
@ -105,8 +102,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> Settings<'a, 't, 'u, 'i>
{
) -> Settings<'a, 't, 'u, 'i> {
let mut builder = Settings::new(wtxn, index, self.update_id);
builder.log_every_n = self.log_every_n;
@ -125,8 +121,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> Facets<'t, 'u, 'i>
{
) -> Facets<'t, 'u, 'i> {
let mut builder = Facets::new(wtxn, index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type;

View File

@ -1,15 +1,13 @@
use std::str;
use crate::Index;
use fst::Streamer;
use grenad::CompressionType;
use heed::types::ByteSlice;
use crate::Result;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database,
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod,
};
use crate::{Index, Result};
pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -22,7 +20,10 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
}
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordPrefixDocids<'t, 'u, 'i> {
WordPrefixDocids {
wtxn,
index,

View File

@ -1,18 +1,17 @@
use std::str;
use fst::automaton::{Automaton, Str};
use fst::{Streamer, IntoStreamer};
use fst::{IntoStreamer, Streamer};
use grenad::CompressionType;
use heed::BytesEncode;
use heed::types::ByteSlice;
use heed::BytesEncode;
use log::debug;
use crate::{Index, Result};
use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::{
WriteMethod, create_sorter, sorter_into_lmdb_database,
cbo_roaring_bitmap_merge,
cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod,
};
use crate::{Index, Result};
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -28,8 +27,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordPrefixPairProximityDocids<'t, 'u, 'i>
{
) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
WordPrefixPairProximityDocids {
wtxn,
index,

View File

@ -1,25 +1,23 @@
use std::{cmp, str};
use std::convert::TryFrom;
use std::fs::File;
use std::num::NonZeroU32;
use std::{cmp, str};
use fst::automaton::{self, Automaton};
use fst::{Streamer, IntoStreamer};
use grenad::{CompressionType, Reader, Writer, FileFuse};
use fst::{IntoStreamer, Streamer};
use grenad::{CompressionType, FileFuse, Reader, Writer};
use heed::types::{ByteSlice, DecodeIgnore, Str};
use heed::{BytesEncode, Error};
use log::debug;
use roaring::RoaringBitmap;
use crate::error::InternalError;
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
use crate::Result;
use crate::update::index_documents::WriteMethod;
use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec};
use crate::update::index_documents::{
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
cbo_roaring_bitmap_merge, sorter_into_lmdb_database
cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database,
write_into_lmdb_database, writer_into_reader, WriteMethod,
};
use crate::{Index, TreeLevel};
use crate::{Index, Result, TreeLevel};
pub struct WordsLevelPositions<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -34,7 +32,10 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
}
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordsLevelPositions<'t, 'u, 'i> {
WordsLevelPositions {
wtxn,
index,
@ -144,7 +145,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
self.wtxn,
*self.index.word_prefix_level_position_docids.as_polymorph(),
entries,
|_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }),
|_, _| {
Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })
},
WriteMethod::Append,
)?;
@ -176,13 +179,11 @@ fn compute_positions_levels(
shrink_size: Option<u64>,
level_group_size: NonZeroU32,
min_level_size: NonZeroU32,
) -> Result<Reader<FileFuse>>
{
) -> Result<Reader<FileFuse>> {
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
// therefore we write the facet levels entries into a grenad file before transfering them.
let mut writer = tempfile::tempfile().and_then(|file| {
create_writer(compression_type, compression_level, file)
})?;
let mut writer = tempfile::tempfile()
.and_then(|file| create_writer(compression_type, compression_level, file))?;
for result in words_db.iter(rtxn)? {
let (word, ()) = result?;
@ -193,7 +194,8 @@ fn compute_positions_levels(
left..=right
};
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
let first_level_size = words_positions_db
.remap_data_type::<DecodeIgnore>()
.range(rtxn, &level_0_range)?
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
@ -253,8 +255,7 @@ fn write_level_entry(
left: u32,
right: u32,
ids: &RoaringBitmap,
) -> Result<()>
{
) -> Result<()> {
let key = (word, level, left, right);
let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?;
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;

View File

@ -2,7 +2,8 @@ use std::iter::FromIterator;
use std::str;
use fst::Streamer;
use crate::{Index, SmallString32, Result};
use crate::{Index, Result, SmallString32};
pub struct WordsPrefixesFst<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -17,8 +18,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> WordsPrefixesFst<'t, 'u, 'i>
{
) -> WordsPrefixesFst<'t, 'u, 'i> {
WordsPrefixesFst {
wtxn,
index,
@ -55,7 +55,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
for n in 1..=self.max_prefix_length {
let mut current_prefix = SmallString32::new();
let mut current_prefix_count = 0;
let mut builder = fst::SetBuilder::memory();

View File

@ -1,9 +1,8 @@
use milli::{Criterion, Index, DocumentId};
use milli::update::{IndexDocuments, UpdateFormat, Settings};
use big_s::S;
use heed::EnvOpenOptions;
use maplit::{hashmap, hashset};
use milli::update::{IndexDocuments, Settings, UpdateFormat};
use milli::{Criterion, DocumentId, Index};
use serde::Deserialize;
use slice_group_by::GroupBy;
@ -11,7 +10,8 @@ mod query_criteria;
pub const TEST_QUERY: &'static str = "hello world america";
pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"];
pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] =
&["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"];
pub const CONTENT: &str = include_str!("../assets/test_set.ndjson");
@ -27,16 +27,16 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let criteria = criteria.iter().map(|c| c.to_string()).collect();
builder.set_criteria(criteria);
builder.set_filterable_fields(hashset!{
builder.set_filterable_fields(hashset! {
S("tag"),
S("asc_desc_rank"),
});
builder.set_synonyms(hashmap!{
builder.set_synonyms(hashmap! {
S("hello") => vec![S("good morning")],
S("world") => vec![S("earth")],
S("america") => vec![S("the united states")],
});
builder.set_searchable_fields(vec![S("title"),S("description")]);
builder.set_searchable_fields(vec![S("title"), S("description")]);
builder.execute(|_, _| ()).unwrap();
// index documents
@ -53,12 +53,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> {
let mut rtxn = index.read_txn().unwrap();
let docid_map = index.external_documents_ids(&mut rtxn).unwrap();
let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect();
let docid_map: std::collections::HashMap<_, _> =
EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect();
internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect()
}
pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_words: bool) -> Vec<TestDocument> {
let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
pub fn expected_order(
criteria: &[Criterion],
authorize_typo: bool,
optional_words: bool,
) -> Vec<TestDocument> {
let dataset =
serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
let mut groups: Vec<Vec<TestDocument>> = vec![dataset];
for criterion in criteria {
@ -67,32 +73,36 @@ pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_wor
match criterion {
Criterion::Attribute => {
group.sort_by_key(|d| d.attribute_rank);
new_groups.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from));
},
new_groups
.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from));
}
Criterion::Exactness => {
group.sort_by_key(|d| d.exact_rank);
new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from));
},
}
Criterion::Proximity => {
group.sort_by_key(|d| d.proximity_rank);
new_groups.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from));
},
new_groups
.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from));
}
Criterion::Typo => {
group.sort_by_key(|d| d.typo_rank);
new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from));
},
}
Criterion::Words => {
group.sort_by_key(|d| d.word_rank);
new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from));
},
}
Criterion::Asc(field_name) if field_name == "asc_desc_rank" => {
group.sort_by_key(|d| d.asc_desc_rank);
new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
},
new_groups
.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
}
Criterion::Desc(field_name) if field_name == "asc_desc_rank" => {
group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank));
new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
},
new_groups
.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
}
Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()),
}
}

View File

@ -1,9 +1,9 @@
use big_s::S;
use milli::update::Settings;
use milli::{Search, SearchResult, Criterion};
use milli::{Criterion, Search, SearchResult};
use Criterion::*;
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
use Criterion::*;
const ALLOW_TYPOS: bool = true;
const DISALLOW_TYPOS: bool = false;
@ -35,29 +35,54 @@ macro_rules! test_criterion {
}
}
#[rustfmt::skip]
test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS);
#[rustfmt::skip]
test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS);
#[rustfmt::skip]
test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words);
#[rustfmt::skip]
test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute);
#[rustfmt::skip]
test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute);
#[rustfmt::skip]
test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness);
#[rustfmt::skip]
test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness);
#[rustfmt::skip]
test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity);
#[rustfmt::skip]
test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity);
#[rustfmt::skip]
test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank")));
#[rustfmt::skip]
test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank")));
#[rustfmt::skip]
test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank")));
#[rustfmt::skip]
test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank")));
#[rustfmt::skip]
test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field")));
#[rustfmt::skip]
test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field")));
#[rustfmt::skip]
test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field")));
#[rustfmt::skip]
test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field")));
#[test]
fn criteria_mixup() {
use Criterion::*;
let index = search::setup_search_index_with_criteria(&vec![Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo]);
let index = search::setup_search_index_with_criteria(&vec![
Words,
Attribute,
Desc(S("asc_desc_rank")),
Exactness,
Proximity,
Typo,
]);
#[rustfmt::skip]
let criteria_mix = {
// Criterion doesn't implement Copy, we create a new Criterion using a closure
let desc = || Desc(S("asc_desc_rank"));
@ -205,7 +230,8 @@ fn criteria_mixup() {
let SearchResult { documents_ids, .. } = search.execute().unwrap();
let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS)
let expected_external_ids: Vec<_> =
search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS)
.into_iter()
.map(|d| d.id)
.collect();

36
script/pre-commit Executable file
View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
cargo check --workspace --all-targets &>/dev/null
result=$?
if [[ ${result} -ne 0 ]] ; then
cat <<\EOF
The project does not compile. You might want to fix your error before commiting.
If you still want to commit you can do it by appending
--no-verify
at the end of your previous command.
If you are running a variant of bash you can directly paste this command in your terminal:
!! --no-verify
EOF
exit 1
fi
cargo fmt --all -- --check &>/dev/null
result=$?
if [[ ${result} -ne 0 ]] ; then
cat <<\EOF
The project is badly formatted. Please run:
cargo fmt --all
If you want to create your commit without propper formatting you can add
--no-verify
at the end of your commit.
If you are running a variant of bash you can directly paste this command in your terminal:
!! --no-verify
EOF
exit 1
fi

View File

@ -6,10 +6,9 @@ use std::time::Instant;
use byte_unit::Byte;
use heed::EnvOpenOptions;
use log::debug;
use milli::{obkv_to_json, Index};
use structopt::StructOpt;
use milli::{Index, obkv_to_json};
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
@ -86,7 +85,8 @@ fn main() -> anyhow::Result<()> {
}
if opt.print_facet_distribution {
let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?;
let facets =
index.facets_distribution(&rtxn).candidates(result.candidates).execute()?;
serde_json::to_writer(&mut stdout, &facets)?;
let _ = writeln!(&mut stdout);
}