From 9716fb3b361eb76ece836c19fec9589abb650427 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 16 Jun 2021 18:33:33 +0200
Subject: [PATCH] format the whole project

---
 .rustfmt.toml                                 |   5 +
 README.md                                     |  15 +
 benchmarks/benches/songs.rs                   |  58 +-
 benchmarks/benches/utils.rs                   |   6 +-
 benchmarks/benches/wiki.rs                    |  19 +-
 benchmarks/build.rs                           |  19 +-
 helpers/src/main.rs                           |   5 +-
 http-ui/src/main.rs                           | 717 ++++++++---------
 http-ui/src/update_store.rs                   |  69 +-
 infos/src/main.rs                             | 203 +++--
 milli/src/criterion.rs                        |  37 +-
 milli/src/error.rs                            |  33 +-
 milli/src/external_documents_ids.rs           |  10 +-
 milli/src/facet/facet_type.rs                 |   7 +-
 milli/src/facet/facet_value.rs                |   2 +-
 milli/src/facet/value_encoding.rs             |   5 +-
 milli/src/fields_ids_map.rs                   |  16 +-
 .../facet/facet_level_value_f64_codec.rs      |   3 +-
 .../facet/field_doc_id_facet_f64_codec.rs     |   2 +-
 .../facet/field_doc_id_facet_string_codec.rs  |   9 +-
 .../heed_codec/field_id_word_count_codec.rs   |   3 +-
 milli/src/heed_codec/mod.rs                   |  10 +-
 milli/src/heed_codec/obkv_codec.rs            |   1 +
 .../cbo_roaring_bitmap_codec.rs               |   4 +-
 .../roaring_bitmap/roaring_bitmap_codec.rs    |   1 +
 .../roaring_bitmap_len_codec.rs               |  20 +-
 .../heed_codec/str_level_position_codec.rs    |   4 +-
 milli/src/index.rs                            | 298 ++++---
 milli/src/lib.rs                              |  33 +-
 milli/src/proximity.rs                        |   8 +-
 milli/src/search/criteria/asc_desc.rs         |  87 +--
 milli/src/search/criteria/attribute.rs        | 422 ++++++----
 milli/src/search/criteria/exactness.rs        | 163 ++--
 milli/src/search/criteria/final.rs            |  24 +-
 milli/src/search/criteria/initial.rs          |  11 +-
 milli/src/search/criteria/mod.rs              | 314 +++++---
 milli/src/search/criteria/proximity.rs        | 283 ++++---
 milli/src/search/criteria/typo.rs             | 369 +++++----
 milli/src/search/criteria/words.rs            |  52 +-
 milli/src/search/distinct/facet_distinct.rs   |  26 +-
 milli/src/search/distinct/mod.rs              |  21 +-
 milli/src/search/distinct/noop_distinct.rs    |  10 +-
 milli/src/search/facet/facet_distribution.rs  |  71 +-
 milli/src/search/facet/filter_condition.rs    | 279 ++++---
 milli/src/search/facet/mod.rs                 |  57 +-
 milli/src/search/facet/parser.rs              |   2 +-
 milli/src/search/matching_words.rs            |  53 +-
 milli/src/search/mod.rs                       |  40 +-
 milli/src/search/query_tree.rs                | 735 ++++++++++++------
 milli/src/update/available_documents_ids.rs   |  15 +-
 milli/src/update/clear_documents.rs           |   7 +-
 milli/src/update/delete_documents.rs          |  59 +-
 milli/src/update/facets.rs                    |  49 +-
 milli/src/update/index_documents/mod.rs       | 114 ++-
 milli/src/update/index_documents/store.rs     | 260 ++++---
 milli/src/update/index_documents/transform.rs | 143 ++--
 milli/src/update/mod.rs                       |   4 +-
 milli/src/update/settings.rs                  | 149 ++--
 milli/src/update/update_builder.rs            |  17 +-
 milli/src/update/word_prefix_docids.rs        |  11 +-
 .../word_prefix_pair_proximity_docids.rs      |  12 +-
 milli/src/update/words_level_positions.rs     |  39 +-
 milli/src/update/words_prefixes_fst.rs        |   7 +-
 milli/tests/search/mod.rs                     |  54 +-
 milli/tests/search/query_criteria.rs          |  40 +-
 script/pre-commit                             |  36 +
 qc_loop.sh => script/qc_loop.sh               |   0
 search/src/main.rs                            |   6 +-
 68 files changed, 3327 insertions(+), 2336 deletions(-)
 create mode 100644 .rustfmt.toml
 create mode 100755 script/pre-commit
 rename qc_loop.sh => script/qc_loop.sh (100%)

diff --git a/.rustfmt.toml b/.rustfmt.toml
new file mode 100644
index 000000000..250124b77
--- /dev/null
+++ b/.rustfmt.toml
@@ -0,0 +1,5 @@
+unstable_features = true
+
+use_small_heuristics = "max"
+imports_granularity = "Module"
+group_imports = "StdExternalCrate"
diff --git a/README.md b/README.md
index 13d35380a..b1498d0f5 100644
--- a/README.md
+++ b/README.md
@@ -41,3 +41,18 @@ the
`content-type:application/json` and `content-type:application/x-ndjson` head ### Querying the engine via the website You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700). + + +## Contributing + +You can setup a `git-hook` to stop you from making a commit too fast. It'll stop you if: +- Any of the workspaces does not build +- Your code is not well-formatted + +These two things are also checked in the CI, so ignoring the hook won't help you merge your code. +But if you need to, you can still add `--no-verify` when creating your commit to ignore the hook. + +To enable the hook, run the following command from the root of the project: +``` +cp script/pre-commit .git/hooks/pre-commit +``` diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs index e5da16a99..726190f77 100644 --- a/benchmarks/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -6,33 +6,24 @@ use milli::update::Settings; use utils::Conf; fn base_conf(builder: &mut Settings) { - let displayed_fields = [ - "id", "title", "album", "artist", "genre", "country", "released", "duration", - ] - .iter() - .map(|s| s.to_string()) - .collect(); + let displayed_fields = + ["id", "title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); builder.set_displayed_fields(displayed_fields); - let searchable_fields = ["title", "album", "artist"] + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"] .iter() .map(|s| s.to_string()) .collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = [ - "released-timestamp", - "duration-float", - "genre", - "country", - "artist", - ] - .iter() - .map(|s| s.to_string()) - .collect(); builder.set_filterable_fields(faceted_fields); } +#[rustfmt::skip] const BASE_CONF: Conf = Conf { dataset: datasets_paths::SMOL_SONGS, queries: &[ @@ -53,34 +44,25 @@ const BASE_CONF: Conf = Conf { }; fn bench_songs(c: &mut criterion::Criterion) { - let default_criterion: Vec = milli::default_criteria() - .iter() - .map(|criteria| criteria.to_string()) - .collect(); + let default_criterion: Vec = + milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); let default_criterion = default_criterion.iter().map(|s| s.as_str()); - let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)") - .chain(default_criterion.clone()) - .collect(); - let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)") - .chain(default_criterion.clone()) - .collect(); + let asc_default: Vec<&str> = + std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); + let desc_default: Vec<&str> = + std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); let basic_with_quote: Vec = BASE_CONF .queries .iter() .map(|s| { - s.trim() - .split(' ') - .map(|s| format!(r#""{}""#, s)) - .collect::>() - .join(" ") + s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::>().join(" ") }) .collect(); - let basic_with_quote: &[&str] = &basic_with_quote - .iter() - .map(|s| s.as_str()) - .collect::>(); + let basic_with_quote: &[&str] = + &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + #[rustfmt::skip] let confs = &[ /* first we bench each criterion alone */ utils::Conf { diff --git a/benchmarks/benches/utils.rs 
b/benchmarks/benches/utils.rs index d5181849f..fd1df0a90 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -3,10 +3,8 @@ use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; -use milli::{ - update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, - FilterCondition, Index, -}; +use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}; +use milli::{FilterCondition, Index}; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory diff --git a/benchmarks/benches/wiki.rs b/benchmarks/benches/wiki.rs index 11ffe87d5..3d8b6f1d4 100644 --- a/benchmarks/benches/wiki.rs +++ b/benchmarks/benches/wiki.rs @@ -6,16 +6,14 @@ use milli::update::Settings; use utils::Conf; fn base_conf(builder: &mut Settings) { - let displayed_fields = ["title", "body", "url"] - .iter() - .map(|s| s.to_string()) - .collect(); + let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); builder.set_displayed_fields(displayed_fields); let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); builder.set_searchable_fields(searchable_fields); } +#[rustfmt::skip] const BASE_CONF: Conf = Conf { dataset: datasets_paths::SMOL_WIKI_ARTICLES, queries: &[ @@ -37,18 +35,13 @@ fn bench_songs(c: &mut criterion::Criterion) { .queries .iter() .map(|s| { - s.trim() - .split(' ') - .map(|s| format!(r#""{}""#, s)) - .collect::>() - .join(" ") + s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::>().join(" ") }) .collect(); - let basic_with_quote: &[&str] = &basic_with_quote - .iter() - .map(|s| s.as_str()) - .collect::>(); + let basic_with_quote: &[&str] = + &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + #[rustfmt::skip] let confs = &[ /* first we bench each criterion alone */ utils::Conf { diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 58300bab9..b1edd5499 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -1,9 +1,7 @@ +use std::fs::File; +use std::io::{Cursor, Read, Seek, Write}; use std::path::{Path, PathBuf}; use std::{env, fs}; -use std::{ - fs::File, - io::{Cursor, Read, Seek, Write}, -}; use bytes::Bytes; use convert_case::{Case, Casing}; @@ -45,7 +43,10 @@ fn main() -> anyhow::Result<()> { )?; if out_file.exists() { - eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset); + eprintln!( + "The dataset {} already exists on the file system and will not be downloaded again", + dataset + ); continue; } let url = format!("{}/{}.csv.gz", BASE_URL, dataset); @@ -60,12 +61,8 @@ fn main() -> anyhow::Result<()> { } fn download_dataset(url: U) -> anyhow::Result> { - let bytes = reqwest::blocking::Client::builder() - .timeout(None) - .build()? - .get(url) - .send()? 
- .bytes()?; + let bytes = + reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?; Ok(Cursor::new(bytes)) } diff --git a/helpers/src/main.rs b/helpers/src/main.rs index c916d0448..b325aef89 100644 --- a/helpers/src/main.rs +++ b/helpers/src/main.rs @@ -1,9 +1,8 @@ use std::path::PathBuf; use byte_unit::Byte; -use heed::{Env, EnvOpenOptions, CompactionOption}; +use heed::{CompactionOption, Env, EnvOpenOptions}; use structopt::StructOpt; - use Command::*; #[cfg(target_os = "linux")] @@ -65,7 +64,7 @@ fn main() -> anyhow::Result<()> { use CompactionOption::*; let compaction = if enable_compaction { Enabled } else { Disabled }; copy_main_database_to_stdout(env, compaction) - }, + } } } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index e23dddd4c..703861058 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,6 +1,5 @@ mod update_store; -use std::{io, mem}; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; @@ -10,16 +9,19 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use std::time::Instant; +use std::{io, mem}; use askama_warp::Template; use byte_unit::Byte; use either::Either; use flate2::read::GzDecoder; -use futures::{FutureExt, StreamExt}; -use futures::stream; +use futures::{stream, FutureExt, StreamExt}; use grenad::CompressionType; use heed::EnvOpenOptions; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; +use milli::update::UpdateIndexingStep::*; +use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; +use milli::{obkv_to_json, FilterCondition, Index, MatchingWords, SearchResult}; use once_cell::sync::OnceCell; use rayon::ThreadPool; use serde::{Deserialize, Serialize}; @@ -28,12 +30,9 @@ use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; use tokio::sync::broadcast; -use warp::{Filter, http::Response}; use warp::filters::ws::Message; - -use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult}; -use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; -use milli::update::UpdateIndexingStep::*; +use warp::http::Response; +use warp::Filter; use self::update_store::UpdateStore; @@ -149,25 +148,28 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { for (word, token) in analyzed.reconstruct() { if token.is_word() { let to_highlight = matching_words.matching_bytes(token.text()).is_some(); - if to_highlight { string.push_str("") } + if to_highlight { + string.push_str("") + } string.push_str(word); - if to_highlight { string.push_str("") } + if to_highlight { + string.push_str("") + } } else { string.push_str(word); } } Value::String(string) } - Value::Array(values) => { - Value::Array(values.into_iter() - .map(|v| self.highlight_value(v, matching_words)) - .collect()) - } - Value::Object(object) => { - Value::Object(object.into_iter() + Value::Array(values) => Value::Array( + values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(), + ), + Value::Object(object) => Value::Object( + object + .into_iter() .map(|(k, v)| (k, self.highlight_value(v, matching_words))) - .collect()) - } + .collect(), + ), } } @@ -236,12 +238,7 @@ enum UpdateMeta { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type")] enum UpdateMetaProgress { - DocumentsAddition { - step: usize, - total_steps: usize, - current: usize, - total: Option, - }, + DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option 
}, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -342,157 +339,185 @@ async fn main() -> anyhow::Result<()> { update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size); update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type); - update_builder.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); + update_builder + .chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); let before_update = Instant::now(); // we extract the update type and execute the update itself. - let result: anyhow::Result<()> = match meta { - UpdateMeta::DocumentsAddition { method, format, encoding } => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned); + let result: anyhow::Result<()> = + match meta { + UpdateMeta::DocumentsAddition { method, format, encoding } => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned); - match format.as_str() { - "csv" => builder.update_format(UpdateFormat::Csv), - "json" => builder.update_format(UpdateFormat::Json), - "json-stream" => builder.update_format(UpdateFormat::JsonStream), - otherwise => panic!("invalid update format {:?}", otherwise), - }; - - match method.as_str() { - "replace" => builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments), - "update" => builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments), - otherwise => panic!("invalid indexing method {:?}", otherwise), - }; - - let reader = match encoding.as_deref() { - Some("gzip") => Box::new(GzDecoder::new(content)), - None => Box::new(content) as Box, - otherwise => panic!("invalid encoding format {:?}", otherwise), - }; - - let result = builder.execute(reader, |indexing_step, update_id| { - let (current, total) = match indexing_step { - TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), - ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), - IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), - MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), + match format.as_str() { + "csv" => builder.update_format(UpdateFormat::Csv), + "json" => builder.update_format(UpdateFormat::Json), + "json-stream" => builder.update_format(UpdateFormat::JsonStream), + otherwise => panic!("invalid update format {:?}", otherwise), }; - let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { - update_id, - meta: UpdateMetaProgress::DocumentsAddition { - step: indexing_step.step(), - total_steps: indexing_step.number_of_steps(), - current, - total, - }, - }); - }); - match result { - Ok(_) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), - } - } - UpdateMeta::ClearDocuments => { - // We must use the write transaction of the update here. 
- let mut wtxn = index_cloned.write_txn()?; - let builder = update_builder.clear_documents(&mut wtxn, &index_cloned); - - match builder.execute() { - Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), - } - } - UpdateMeta::Settings(settings) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.settings(&mut wtxn, &index_cloned); - - // We transpose the settings JSON struct into a real setting update. - match settings.searchable_attributes { - Setting::Set(searchable_attributes) => builder.set_searchable_fields(searchable_attributes), - Setting::Reset => builder.reset_searchable_fields(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. - match settings.displayed_attributes { - Setting::Set(displayed_attributes) => builder.set_displayed_fields(displayed_attributes), - Setting::Reset => builder.reset_displayed_fields(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. - match settings.filterable_attributes { - Setting::Set(filterable_attributes) => builder.set_filterable_fields(filterable_attributes), - Setting::Reset => builder.reset_filterable_fields(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. - match settings.criteria { - Setting::Set(criteria) => builder.set_criteria(criteria), - Setting::Reset => builder.reset_criteria(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. - match settings.stop_words { - Setting::Set(stop_words) => builder.set_stop_words(stop_words), - Setting::Reset => builder.reset_stop_words(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. 
- match settings.synonyms { - Setting::Set(synonyms) => builder.set_synonyms(synonyms), - Setting::Reset => builder.reset_synonyms(), - Setting::NotSet => () - } - - let result = builder.execute(|indexing_step, update_id| { - let (current, total) = match indexing_step { - TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), - ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), - IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), - MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), + match method.as_str() { + "replace" => builder + .index_documents_method(IndexDocumentsMethod::ReplaceDocuments), + "update" => builder + .index_documents_method(IndexDocumentsMethod::UpdateDocuments), + otherwise => panic!("invalid indexing method {:?}", otherwise), }; - let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { - update_id, - meta: UpdateMetaProgress::DocumentsAddition { - step: indexing_step.step(), - total_steps: indexing_step.number_of_steps(), - current, - total, - }, - }); - }); - match result { - Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), + let reader = match encoding.as_deref() { + Some("gzip") => Box::new(GzDecoder::new(content)), + None => Box::new(content) as Box, + otherwise => panic!("invalid encoding format {:?}", otherwise), + }; + + let result = builder.execute(reader, |indexing_step, update_id| { + let (current, total) = match indexing_step { + TransformFromUserIntoGenericFormat { documents_seen } => { + (documents_seen, None) + } + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + IndexDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + MergeDataIntoFinalDatabase { databases_seen, total_databases } => { + (databases_seen, Some(total_databases)) + } + }; + let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { + update_id, + meta: UpdateMetaProgress::DocumentsAddition { + step: indexing_step.step(), + total_steps: indexing_step.number_of_steps(), + current, + total, + }, + }); + }); + + match result { + Ok(_) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } } - } - UpdateMeta::Facets(levels) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.facets(&mut wtxn, &index_cloned); - if let Some(value) = levels.level_group_size { - builder.level_group_size(value); + UpdateMeta::ClearDocuments => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let builder = update_builder.clear_documents(&mut wtxn, &index_cloned); + + match builder.execute() { + Ok(_count) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } } - if let Some(value) = levels.min_level_size { - builder.min_level_size(value); + UpdateMeta::Settings(settings) => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.settings(&mut wtxn, &index_cloned); + + // We transpose the settings JSON struct into a real setting update. 
+ match settings.searchable_attributes { + Setting::Set(searchable_attributes) => { + builder.set_searchable_fields(searchable_attributes) + } + Setting::Reset => builder.reset_searchable_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.displayed_attributes { + Setting::Set(displayed_attributes) => { + builder.set_displayed_fields(displayed_attributes) + } + Setting::Reset => builder.reset_displayed_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.filterable_attributes { + Setting::Set(filterable_attributes) => { + builder.set_filterable_fields(filterable_attributes) + } + Setting::Reset => builder.reset_filterable_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.criteria { + Setting::Set(criteria) => builder.set_criteria(criteria), + Setting::Reset => builder.reset_criteria(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.stop_words { + Setting::Set(stop_words) => builder.set_stop_words(stop_words), + Setting::Reset => builder.reset_stop_words(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.synonyms { + Setting::Set(synonyms) => builder.set_synonyms(synonyms), + Setting::Reset => builder.reset_synonyms(), + Setting::NotSet => (), + } + + let result = builder.execute(|indexing_step, update_id| { + let (current, total) = match indexing_step { + TransformFromUserIntoGenericFormat { documents_seen } => { + (documents_seen, None) + } + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + IndexDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + MergeDataIntoFinalDatabase { databases_seen, total_databases } => { + (databases_seen, Some(total_databases)) + } + }; + let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { + update_id, + meta: UpdateMetaProgress::DocumentsAddition { + step: indexing_step.step(), + total_steps: indexing_step.number_of_steps(), + current, + total, + }, + }); + }); + + match result { + Ok(_count) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), + UpdateMeta::Facets(levels) => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.facets(&mut wtxn, &index_cloned); + if let Some(value) = levels.level_group_size { + builder.level_group_size(value); + } + if let Some(value) = levels.min_level_size { + builder.min_level_size(value); + } + match builder.execute() { + Ok(()) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } } - } - }; + }; let meta = match result { - Ok(()) => format!("valid update content processed in {:.02?}", before_update.elapsed()), + Ok(()) => { + format!("valid update content processed in {:.02?}", before_update.elapsed()) + } Err(e) => format!("error while processing update content: {:?}", e), }; @@ -500,7 +525,8 @@ async fn main() -> anyhow::Result<()> { let _ = update_status_sender_cloned.send(processed); Ok(meta) - })?; + }, + )?; // The database name will not change. 
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string(); @@ -512,15 +538,11 @@ async fn main() -> anyhow::Result<()> { let db_name_cloned = db_name.clone(); let lmdb_path_cloned = lmdb_path.clone(); let index_cloned = index.clone(); - let dash_html_route = warp::filters::method::get() - .and(warp::filters::path::end()) - .map(move || { + let dash_html_route = + warp::filters::method::get().and(warp::filters::path::end()).map(move || { // We retrieve the database size. - let db_size = File::open(lmdb_path_cloned.clone()) - .unwrap() - .metadata() - .unwrap() - .len() as usize; + let db_size = + File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() as usize; // And the number of documents in the database. let rtxn = index_cloned.read_txn().unwrap(); @@ -537,111 +559,105 @@ async fn main() -> anyhow::Result<()> { .and(warp::path!("updates")) .map(move |header: String| { let update_store = update_store_cloned.clone(); - let mut updates = update_store.iter_metas(|processed, aborted, pending| { - let mut updates = Vec::>::new(); - for result in processed { - let (uid, meta) = result?; - updates.push(UpdateStatus::Processed { update_id: uid.get(), meta }); - } - for result in aborted { - let (uid, meta) = result?; - updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta }); - } - for result in pending { - let (uid, meta) = result?; - updates.push(UpdateStatus::Pending { update_id: uid.get(), meta }); - } - Ok(updates) - }).unwrap(); + let mut updates = update_store + .iter_metas(|processed, aborted, pending| { + let mut updates = Vec::>::new(); + for result in processed { + let (uid, meta) = result?; + updates.push(UpdateStatus::Processed { update_id: uid.get(), meta }); + } + for result in aborted { + let (uid, meta) = result?; + updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta }); + } + for result in pending { + let (uid, meta) = result?; + updates.push(UpdateStatus::Pending { update_id: uid.get(), meta }); + } + Ok(updates) + }) + .unwrap(); updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse()); if header.contains("text/html") { // We retrieve the database size. - let db_size = File::open(lmdb_path_cloned.clone()) - .unwrap() - .metadata() - .unwrap() - .len() as usize; + let db_size = + File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() + as usize; // And the number of documents in the database. 
let rtxn = index_cloned.read_txn().unwrap(); let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize; - let template = UpdatesTemplate { - db_name: db_name.clone(), - db_size, - docs_count, - updates, - }; + let template = + UpdatesTemplate { db_name: db_name.clone(), db_size, docs_count, updates }; Box::new(template) as Box } else { Box::new(warp::reply::json(&updates)) } }); - let dash_bulma_route = warp::filters::method::get() - .and(warp::path!("bulma.min.css")) - .map(|| Response::builder() - .header("content-type", "text/css; charset=utf-8") - .body(include_str!("../public/bulma.min.css")) - ); + let dash_bulma_route = + warp::filters::method::get().and(warp::path!("bulma.min.css")).map(|| { + Response::builder() + .header("content-type", "text/css; charset=utf-8") + .body(include_str!("../public/bulma.min.css")) + }); - let dash_bulma_dark_route = warp::filters::method::get() - .and(warp::path!("bulma-prefers-dark.min.css")) - .map(|| Response::builder() - .header("content-type", "text/css; charset=utf-8") - .body(include_str!("../public/bulma-prefers-dark.min.css")) - ); + let dash_bulma_dark_route = + warp::filters::method::get().and(warp::path!("bulma-prefers-dark.min.css")).map(|| { + Response::builder() + .header("content-type", "text/css; charset=utf-8") + .body(include_str!("../public/bulma-prefers-dark.min.css")) + }); - let dash_style_route = warp::filters::method::get() - .and(warp::path!("style.css")) - .map(|| Response::builder() + let dash_style_route = warp::filters::method::get().and(warp::path!("style.css")).map(|| { + Response::builder() .header("content-type", "text/css; charset=utf-8") .body(include_str!("../public/style.css")) - ); + }); - let dash_jquery_route = warp::filters::method::get() - .and(warp::path!("jquery-3.4.1.min.js")) - .map(|| Response::builder() - .header("content-type", "application/javascript; charset=utf-8") - .body(include_str!("../public/jquery-3.4.1.min.js")) - ); + let dash_jquery_route = + warp::filters::method::get().and(warp::path!("jquery-3.4.1.min.js")).map(|| { + Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + .body(include_str!("../public/jquery-3.4.1.min.js")) + }); - let dash_filesize_route = warp::filters::method::get() - .and(warp::path!("filesize.min.js")) - .map(|| Response::builder() - .header("content-type", "application/javascript; charset=utf-8") - .body(include_str!("../public/filesize.min.js")) - ); + let dash_filesize_route = + warp::filters::method::get().and(warp::path!("filesize.min.js")).map(|| { + Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + .body(include_str!("../public/filesize.min.js")) + }); - let dash_script_route = warp::filters::method::get() - .and(warp::path!("script.js")) - .map(|| Response::builder() + let dash_script_route = warp::filters::method::get().and(warp::path!("script.js")).map(|| { + Response::builder() .header("content-type", "application/javascript; charset=utf-8") .body(include_str!("../public/script.js")) - ); + }); - let updates_script_route = warp::filters::method::get() - .and(warp::path!("updates-script.js")) - .map(|| Response::builder() - .header("content-type", "application/javascript; charset=utf-8") - .body(include_str!("../public/updates-script.js")) - ); + let updates_script_route = + warp::filters::method::get().and(warp::path!("updates-script.js")).map(|| { + Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + 
.body(include_str!("../public/updates-script.js")) + }); - let dash_logo_white_route = warp::filters::method::get() - .and(warp::path!("logo-white.svg")) - .map(|| Response::builder() - .header("content-type", "image/svg+xml") - .body(include_str!("../public/logo-white.svg")) - ); + let dash_logo_white_route = + warp::filters::method::get().and(warp::path!("logo-white.svg")).map(|| { + Response::builder() + .header("content-type", "image/svg+xml") + .body(include_str!("../public/logo-white.svg")) + }); - let dash_logo_black_route = warp::filters::method::get() - .and(warp::path!("logo-black.svg")) - .map(|| Response::builder() - .header("content-type", "image/svg+xml") - .body(include_str!("../public/logo-black.svg")) - ); + let dash_logo_black_route = + warp::filters::method::get().and(warp::path!("logo-black.svg")).map(|| { + Response::builder() + .header("content-type", "image/svg+xml") + .body(include_str!("../public/logo-black.svg")) + }); #[derive(Debug, Deserialize)] #[serde(untagged)] @@ -719,7 +735,8 @@ async fn main() -> anyhow::Result<()> { search.filter(condition); } - let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); + let SearchResult { matching_words, candidates, documents_ids } = + search.execute().unwrap(); let number_of_candidates = candidates.len(); let facets = if query.facet_distribution == Some(true) { @@ -745,17 +762,18 @@ async fn main() -> anyhow::Result<()> { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { - highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight); + highlighter.highlight_record( + &mut object, + &matching_words, + &attributes_to_highlight, + ); } documents.push(object); } - let answer = Answer { - documents, - number_of_candidates, - facets: facets.unwrap_or_default(), - }; + let answer = + Answer { documents, number_of_candidates, facets: facets.unwrap_or_default() }; Response::builder() .header("Content-Type", "application/json") @@ -764,9 +782,8 @@ async fn main() -> anyhow::Result<()> { }); let index_cloned = index.clone(); - let document_route = warp::filters::method::get() - .and(warp::path!("document" / String)) - .map(move |id: String| { + let document_route = warp::filters::method::get().and(warp::path!("document" / String)).map( + move |id: String| { let index = index_cloned.clone(); let rtxn = index.read_txn().unwrap(); @@ -780,30 +797,31 @@ async fn main() -> anyhow::Result<()> { match external_documents_ids.get(&id) { Some(document_id) => { let document_id = document_id as u32; - let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); + let (_, obkv) = + index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); Response::builder() .header("Content-Type", "application/json") .body(serde_json::to_string(&document).unwrap()) } - None => { - Response::builder() - .status(404) - .body(format!("Document with id {:?} not found.", id)) - } + None => Response::builder() + .status(404) + .body(format!("Document with id {:?} not found.", id)), } - }); + }, + ); async fn buf_stream( update_store: Arc>, - update_status_sender: broadcast::Sender>, + update_status_sender: broadcast::Sender< + UpdateStatus, + >, update_method: Option, update_format: UpdateFormat, encoding: Option, - mut stream: impl futures::Stream> + Unpin, - ) -> Result - 
{ + mut stream: impl futures::Stream> + Unpin, + ) -> Result { let file = tokio::task::block_in_place(tempfile::tempfile).unwrap(); let mut file = TFile::from_std(file); @@ -869,9 +887,8 @@ async fn main() -> anyhow::Result<()> { let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); - let clearing_route = warp::filters::method::post() - .and(warp::path!("clear-documents")) - .map(move || { + let clearing_route = + warp::filters::method::post().and(warp::path!("clear-documents")).map(move || { let meta = UpdateMeta::ClearDocuments; let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); @@ -919,9 +936,8 @@ async fn main() -> anyhow::Result<()> { let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); - let abort_pending_updates_route = warp::filters::method::delete() - .and(warp::path!("updates")) - .map(move || { + let abort_pending_updates_route = + warp::filters::method::delete().and(warp::path!("updates")).map(move || { let updates = update_store_cloned.abort_pendings().unwrap(); for (update_id, meta) in updates { let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta }); @@ -930,25 +946,22 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); - let update_ws_route = warp::ws() - .and(warp::path!("updates" / "ws")) - .map(move |ws: warp::ws::Ws| { + let update_ws_route = + warp::ws().and(warp::path!("updates" / "ws")).map(move |ws: warp::ws::Ws| { // And then our closure will be called when it completes... let update_status_receiver = update_status_sender.subscribe(); ws.on_upgrade(|websocket| { // Just echo all updates messages... update_status_receiver .into_stream() - .flat_map(|result| { - match result { - Ok(status) => { - let msg = serde_json::to_string(&status).unwrap(); - stream::iter(Some(Ok(Message::text(msg)))) - } - Err(e) => { - eprintln!("channel error: {:?}", e); - stream::iter(None) - } + .flat_map(|result| match result { + Ok(status) => { + let msg = serde_json::to_string(&status).unwrap(); + stream::iter(Some(Ok(Message::text(msg)))) + } + Err(e) => { + eprintln!("channel error: {:?}", e); + stream::iter(None) } }) .forward(websocket) @@ -988,10 +1001,9 @@ async fn main() -> anyhow::Result<()> { #[cfg(test)] mod tests { - use maplit::{btreeset,hashmap, hashset}; - use serde_test::{assert_tokens, Token}; - + use maplit::{btreeset, hashmap, hashset}; use milli::update::Setting; + use serde_test::{assert_tokens, Token}; use crate::Settings; @@ -1000,50 +1012,53 @@ mod tests { let settings = Settings { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), - filterable_attributes: Setting::Set(hashset!{ "age".to_string() }), + filterable_attributes: Setting::Set(hashset! { "age".to_string() }), criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), - synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] }) + synonyms: Setting::Set(hashmap! 
{ "alex".to_string() => vec!["alexey".to_string()] }), }; - assert_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 6 }, - Token::Str("displayedAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("name"), - Token::SeqEnd, - Token::Str("searchableAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("age"), - Token::SeqEnd, - Token::Str("facetedAttributes"), - Token::Some, - Token::Map { len: Some(1) }, - Token::Str("age"), - Token::Str("integer"), - Token::MapEnd, - Token::Str("criteria"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("asc(age)"), - Token::SeqEnd, - Token::Str("stopWords"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("and"), - Token::SeqEnd, - Token::Str("synonyms"), - Token::Some, - Token::Map { len: Some(1) }, - Token::Str("alex"), - Token::Seq {len: Some(1) }, - Token::Str("alexey"), - Token::SeqEnd, - Token::MapEnd, - Token::StructEnd, - ]); + assert_tokens( + &settings, + &[ + Token::Struct { name: "Settings", len: 6 }, + Token::Str("displayedAttributes"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("name"), + Token::SeqEnd, + Token::Str("searchableAttributes"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("age"), + Token::SeqEnd, + Token::Str("facetedAttributes"), + Token::Some, + Token::Map { len: Some(1) }, + Token::Str("age"), + Token::Str("integer"), + Token::MapEnd, + Token::Str("criteria"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("asc(age)"), + Token::SeqEnd, + Token::Str("stopWords"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("and"), + Token::SeqEnd, + Token::Str("synonyms"), + Token::Some, + Token::Map { len: Some(1) }, + Token::Str("alex"), + Token::Seq { len: Some(1) }, + Token::Str("alexey"), + Token::SeqEnd, + Token::MapEnd, + Token::StructEnd, + ], + ); } #[test] @@ -1057,22 +1072,25 @@ mod tests { synonyms: Setting::Reset, }; - assert_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 6 }, - Token::Str("displayedAttributes"), - Token::None, - Token::Str("searchableAttributes"), - Token::None, - Token::Str("facetedAttributes"), - Token::None, - Token::Str("criteria"), - Token::None, - Token::Str("stopWords"), - Token::None, - Token::Str("synonyms"), - Token::None, - Token::StructEnd, - ]); + assert_tokens( + &settings, + &[ + Token::Struct { name: "Settings", len: 6 }, + Token::Str("displayedAttributes"), + Token::None, + Token::Str("searchableAttributes"), + Token::None, + Token::Str("facetedAttributes"), + Token::None, + Token::Str("criteria"), + Token::None, + Token::Str("stopWords"), + Token::None, + Token::Str("synonyms"), + Token::None, + Token::StructEnd, + ], + ); } #[test] @@ -1086,9 +1104,6 @@ mod tests { synonyms: Setting::NotSet, }; - assert_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 0 }, - Token::StructEnd, - ]); + assert_tokens(&settings, &[Token::Struct { name: "Settings", len: 0 }, Token::StructEnd]); } } diff --git a/http-ui/src/update_store.rs b/http-ui/src/update_store.rs index 122ee6031..b77057fda 100644 --- a/http-ui/src/update_store.rs +++ b/http-ui/src/update_store.rs @@ -4,9 +4,9 @@ use std::path::Path; use std::sync::Arc; use crossbeam_channel::Sender; -use heed::types::{OwnedType, DecodeIgnore, SerdeJson, ByteSlice}; -use heed::{EnvOpenOptions, Env, Database}; -use serde::{Serialize, Deserialize}; +use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson}; +use heed::{Database, Env, EnvOpenOptions}; +use serde::{Deserialize, Serialize}; pub type 
BEU64 = heed::zerocopy::U64; @@ -25,7 +25,9 @@ pub trait UpdateHandler { } impl UpdateHandler for F -where F: FnMut(u64, M, &[u8]) -> heed::Result + Send + 'static { +where + F: FnMut(u64, M, &[u8]) -> heed::Result + Send + 'static, +{ fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result { self(update_id, meta, content) } @@ -82,26 +84,17 @@ impl UpdateStore { /// Returns the new biggest id to use to store the new update. fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result { - let last_pending = self.pending_meta - .remap_data_type::() - .last(txn)? - .map(|(k, _)| k.get()); + let last_pending = + self.pending_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - let last_processed = self.processed_meta - .remap_data_type::() - .last(txn)? - .map(|(k, _)| k.get()); + let last_processed = + self.processed_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - let last_aborted = self.aborted_meta - .remap_data_type::() - .last(txn)? - .map(|(k, _)| k.get()); + let last_aborted = + self.aborted_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - let last_update_id = [last_pending, last_processed, last_aborted] - .iter() - .copied() - .flatten() - .max(); + let last_update_id = + [last_pending, last_processed, last_aborted].iter().copied().flatten().max(); match last_update_id { Some(last_id) => Ok(last_id + 1), @@ -112,7 +105,8 @@ impl UpdateStore { /// Registers the update content in the pending store and the meta /// into the pending-meta store. Returns the new unique update id. pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result - where M: Serialize, + where + M: Serialize, { let mut wtxn = self.env.write_txn()?; @@ -152,9 +146,8 @@ impl UpdateStore { // a reader while processing it, not a writer. match first_meta { Some((first_id, first_meta)) => { - let first_content = self.pending - .get(&rtxn, &first_id)? - .expect("associated update content"); + let first_content = + self.pending.get(&rtxn, &first_id)?.expect("associated update content"); // Process the pending update using the provided user function. let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?; @@ -170,15 +163,16 @@ impl UpdateStore { wtxn.commit()?; Ok(Some((first_id.get(), new_meta))) - }, - None => Ok(None) + } + None => Ok(None), } } /// The id and metadata of the update that is currently being processed, /// `None` if no update is being processed. pub fn processing_update(&self) -> heed::Result> - where M: for<'a> Deserialize<'a>, + where + M: for<'a> Deserialize<'a>, { let rtxn = self.env.read_txn()?; match self.pending_meta.first(&rtxn)? { @@ -242,7 +236,8 @@ impl UpdateStore { /// that as already been processed or which doesn't actually exist, will /// return `None`. pub fn abort_update(&self, update_id: u64) -> heed::Result> - where M: Serialize + for<'a> Deserialize<'a>, + where + M: Serialize + for<'a> Deserialize<'a>, { let mut wtxn = self.env.write_txn()?; let key = BEU64::new(update_id); @@ -269,7 +264,8 @@ impl UpdateStore { /// Aborts all the pending updates, and not the one being currently processed. /// Returns the update metas and ids that were successfully aborted. 
pub fn abort_pendings(&self) -> heed::Result> - where M: Serialize + for<'a> Deserialize<'a>, + where + M: Serialize + for<'a> Deserialize<'a>, { let mut wtxn = self.env.write_txn()?; let mut aborted_updates = Vec::new(); @@ -303,17 +299,19 @@ pub enum UpdateStatusMeta { #[cfg(test)] mod tests { - use super::*; use std::thread; use std::time::{Duration, Instant}; + use super::*; + #[test] fn simple() { let dir = tempfile::tempdir().unwrap(); let options = EnvOpenOptions::new(); - let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| { + let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| { Ok(meta + " processed") - }).unwrap(); + }) + .unwrap(); let meta = String::from("kiki"); let update_id = update_store.register_update(&meta, &[]).unwrap(); @@ -329,10 +327,11 @@ mod tests { fn long_running_update() { let dir = tempfile::tempdir().unwrap(); let options = EnvOpenOptions::new(); - let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| { + let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| { thread::sleep(Duration::from_millis(400)); Ok(meta + " processed") - }).unwrap(); + }) + .unwrap(); let before_register = Instant::now(); diff --git a/infos/src/main.rs b/infos/src/main.rs index b0c304de0..151e8c664 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -1,16 +1,14 @@ use std::fmt::Write as _; use std::path::PathBuf; -use std::{str, io, fmt}; +use std::{fmt, io, str}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; -use structopt::StructOpt; - use milli::facet::FacetType; use milli::index::db_name::*; use milli::{Index, TreeLevel}; - +use structopt::StructOpt; use Command::*; #[cfg(target_os = "linux")] @@ -257,53 +255,55 @@ fn main() -> anyhow::Result<()> { WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), WordsPrefixesDocids { full_display, prefixes } => { words_prefixes_docids(&index, &rtxn, !full_display, prefixes) - }, + } FacetNumbersDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name) - }, + } FacetStringsDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) - }, + } WordsLevelPositionsDocids { full_display, words } => { words_level_positions_docids(&index, &rtxn, !full_display, words) - }, + } WordPrefixesLevelPositionsDocids { full_display, prefixes } => { word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) - }, + } FieldIdWordCountDocids { full_display, field_name } => { field_id_word_count_docids(&index, &rtxn, !full_display, field_name) - }, + } DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) - }, + } FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), - AverageNumberOfPositionsByWord => { - average_number_of_positions_by_word(&index, &rtxn) - }, + AverageNumberOfPositionsByWord => average_number_of_positions_by_word(&index, &rtxn), SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases), DatabaseStats { database } => database_stats(&index, &rtxn, &database), WordPairProximitiesDocids { full_display, word1, word2 } => { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) - }, + } 
ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportDocuments { internal_documents_ids } => { export_documents(&index, &rtxn, internal_documents_ids) - }, + } } } fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { - use std::collections::BinaryHeap; use std::cmp::Reverse; + use std::collections::BinaryHeap; let mut heap = BinaryHeap::with_capacity(limit + 1); for result in index.word_docids.iter(rtxn)? { - if limit == 0 { break } + if limit == 0 { + break; + } let (word, docids) = result?; heap.push((Reverse(docids.len()), word)); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } let stdout = io::stdout(); @@ -323,7 +323,7 @@ fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( rtxn: &'txn heed::RoTxn, db: heed::Database, field_id: u8, -) -> heed::Result> + 'txn>> +) -> heed::Result> + 'txn>> where KC: heed::BytesDecode<'txn>, DC: heed::BytesDecode<'txn>, @@ -347,7 +347,8 @@ fn facet_number_value_to_string(level: u8, left: T, right: T) -> fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { use std::cmp::Reverse; use std::collections::BinaryHeap; - use heed::types::{Str, ByteSlice}; + + use heed::types::{ByteSlice, Str}; let Index { env: _env, @@ -387,71 +388,93 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let words_fst = index.words_fst(rtxn)?; let length = words_fst.as_fst().as_bytes().len(); heap.push(Reverse((length, format!("words-fst"), main_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } // Fetch the word prefix FST let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; let length = words_prefixes_fst.as_fst().as_bytes().len(); heap.push(Reverse((length, format!("words-prefixes-fst"), main_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_docids.remap_data_type::().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_prefix_docids.remap_data_type::().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in docid_word_positions.remap_data_type::().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); heap.push(Reverse((value.len(), key, docid_word_positions_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_pair_proximity_docids.remap_data_type::().iter(rtxn)? { let ((word1, word2, prox), value) = result?; let key = format!("{} {} {}", word1, word2, prox); heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_prefix_pair_proximity_docids.remap_data_type::().iter(rtxn)? 
{ let ((word, prefix, prox), value) = result?; let key = format!("{} {} {}", word, prefix, prox); heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_level_position_docids.remap_data_type::().iter(rtxn)? { let ((word, level, left, right), value) = result?; let key = format!("{} {} {:?}", word, level, left..=right); heap.push(Reverse((value.len(), key, word_level_position_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_prefix_level_position_docids.remap_data_type::().iter(rtxn)? { let ((word, level, left, right), value) = result?; let key = format!("{} {} {:?}", word, level, left..=right); heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in field_id_word_count_docids.remap_data_type::().iter(rtxn)? { let ((field_id, word_count), docids) = result?; let key = format!("{} {}", field_id, word_count); heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } let faceted_fields = index.faceted_fields_ids(rtxn)?; @@ -468,7 +491,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho write!(&mut output, " (level {})", level)?; let key = format!("{} {}", facet_name, output); heap.push(Reverse((value.len(), key, facet_id_f64_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } // List the facet strings of this facet id. @@ -477,14 +502,18 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let ((_fid, fvalue), value) = result?; let key = format!("{} {}", facet_name, fvalue); heap.push(Reverse((value.len(), key, facet_id_string_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } } for result in documents.remap_data_type::().iter(rtxn)? { let (id, value) = result?; heap.push(Reverse((value.len(), id.to_string(), documents_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } } @@ -499,7 +528,12 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho Ok(wtr.flush()?) 
} -fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec) -> anyhow::Result<()> { +fn words_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + words: Vec, +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["word", "documents_ids"])?; @@ -523,8 +557,7 @@ fn words_prefixes_docids( rtxn: &heed::RoTxn, debug: bool, prefixes: Vec, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["prefix", "documents_ids"])?; @@ -561,12 +594,12 @@ fn facet_values_docids( debug: bool, facet_type: FacetType, field_name: String, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; - let field_id = fields_ids_map.id(&field_name) + let field_id = fields_ids_map + .id(&field_name) .with_context(|| format!("field {} not found", field_name))?; if !faceted_fields.contains(&field_id) { @@ -590,7 +623,7 @@ fn facet_values_docids( }; wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; } - }, + } FacetType::String => { wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { @@ -614,8 +647,7 @@ fn words_level_positions_docids( rtxn: &heed::RoTxn, debug: bool, words: Vec, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; @@ -653,8 +685,7 @@ fn word_prefixes_level_positions_docids( rtxn: &heed::RoTxn, debug: bool, prefixes: Vec, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; @@ -691,21 +722,20 @@ fn field_id_word_count_docids( index: &Index, rtxn: &heed::RoTxn, debug: bool, - field_name: String -) -> anyhow::Result<()> -{ + field_name: String, +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["field_name", "word_count", "docids"])?; - let field_id = index.fields_ids_map(rtxn)? + let field_id = index + .fields_ids_map(rtxn)? .id(&field_name) .with_context(|| format!("unknown field name: {}", &field_name))?; let left = (field_id, 0); let right = (field_id, u8::max_value()); - let iter = index.field_id_word_count_docids - .range(rtxn, &(left..=right))?; + let iter = index.field_id_word_count_docids.range(rtxn, &(left..=right))?; for result in iter { let ((_, word_count), docids) = result?; @@ -725,8 +755,7 @@ fn docids_words_positions( rtxn: &heed::RoTxn, debug: bool, internal_ids: Vec, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["document_id", "word", "positions"])?; @@ -734,9 +763,10 @@ fn docids_words_positions( let iter: Box> = if internal_ids.is_empty() { Box::new(index.docid_word_positions.iter(rtxn)?) 
} else { - let vec: heed::Result> = internal_ids.into_iter().map(|id| { - index.docid_word_positions.prefix_iter(rtxn, &(id, "")) - }).collect(); + let vec: heed::Result> = internal_ids + .into_iter() + .map(|id| index.docid_word_positions.prefix_iter(rtxn, &(id, ""))) + .collect(); Box::new(vec?.into_iter().flatten()) }; @@ -757,7 +787,8 @@ fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; - let field_id = fields_ids_map.id(&field_name) + let field_id = fields_ids_map + .id(&field_name) .with_context(|| format!("field {} not found", field_name))?; if !faceted_fields.contains(&field_id) { @@ -808,9 +839,14 @@ fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result< Ok(()) } -fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) -> anyhow::Result<()> { +fn export_documents( + index: &Index, + rtxn: &heed::RoTxn, + internal_ids: Vec, +) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _}; - use milli::{BEU32, obkv_to_json}; + + use milli::{obkv_to_json, BEU32}; let stdout = io::stdout(); let mut out = BufWriter::new(stdout); @@ -819,13 +855,13 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); let iter: Box> = if internal_ids.is_empty() { - Box::new(index.documents.iter(rtxn)?.map(|result| { - result.map(|(_id, obkv)| obkv) - })) + Box::new(index.documents.iter(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv))) } else { - Box::new(internal_ids.into_iter().flat_map(|id| { - index.documents.get(rtxn, &BEU32::new(id)).transpose() - })) + Box::new( + internal_ids + .into_iter() + .flat_map(|id| index.documents.get(rtxn, &BEU32::new(id)).transpose()), + ) }; for result in iter { @@ -842,26 +878,27 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use heed::types::DecodeIgnore; - use milli::{DocumentId, BEU32StrCodec}; + use milli::{BEU32StrCodec, DocumentId}; let mut words_counts = Vec::new(); let mut count = 0; let mut prev = None as Option<(DocumentId, u32)>; - let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; + let iter = + index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; for result in iter { let ((docid, _word), ()) = result?; match prev.as_mut() { Some((prev_docid, prev_count)) if docid == *prev_docid => { *prev_count += 1; - }, + } Some((prev_docid, prev_count)) => { words_counts.push(*prev_count); *prev_docid = docid; *prev_count = 0; count += 1; - }, + } None => prev = Some((docid, 1)), } } @@ -970,16 +1007,15 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> { use heed::types::ByteSlice; - use heed::{Error, BytesDecode}; - use roaring::RoaringBitmap; + use heed::{BytesDecode, Error}; use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; + use roaring::RoaringBitmap; fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>( db: heed::PolyDatabase, rtxn: &'a heed::RoTxn, name: &str, - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { let mut key_size = 0u64; let mut val_size = 0u64; let mut values_length = Vec::new(); @@ -1028,27 
+1064,27 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu WORD_DOCIDS => { let db = index.word_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } WORD_PREFIX_DOCIDS => { let db = index.word_prefix_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } DOCID_WORD_POSITIONS => { let db = index.docid_word_positions.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } WORD_PAIR_PROXIMITY_DOCIDS => { let db = index.word_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => { let db = index.word_prefix_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } FIELD_ID_WORD_COUNT_DOCIDS => { let db = index.field_id_word_count_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } unknown => anyhow::bail!("unknown database {:?}", unknown), } } @@ -1059,8 +1095,7 @@ fn word_pair_proximities_docids( debug: bool, word1: String, word2: String, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { use heed::types::ByteSlice; use milli::RoaringBitmapCodec; @@ -1081,7 +1116,9 @@ fn word_pair_proximities_docids( // Skip keys that are longer than the requested one, // a longer key means that the second word is a prefix of the request word. - if key.len() != prefix.len() + 1 { continue; } + if key.len() != prefix.len() + 1 { + continue; + } let proximity = key.last().unwrap(); let docids = if debug { diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 931cf8588..cc1fca01f 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,15 +1,14 @@ use std::fmt; use std::str::FromStr; -use regex::Regex; -use serde::{Serialize, Deserialize}; use once_cell::sync::Lazy; +use regex::Regex; +use serde::{Deserialize, Serialize}; use crate::error::{Error, UserError}; -static ASC_DESC_REGEX: Lazy = Lazy::new(|| { - Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() -}); +static ASC_DESC_REGEX: Lazy = + Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()); #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { @@ -52,17 +51,21 @@ impl FromStr for Criterion { "attribute" => Ok(Criterion::Attribute), "exactness" => Ok(Criterion::Exactness), text => { - let caps = ASC_DESC_REGEX.captures(text).ok_or_else(|| { - UserError::InvalidCriterionName { name: text.to_string() } - })?; + let caps = ASC_DESC_REGEX + .captures(text) + .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; let order = caps.get(1).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str(); match order { "asc" => Ok(Criterion::Asc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())), - text => return Err(UserError::InvalidCriterionName { name: text.to_string() }.into()), + text => { + return Err( + UserError::InvalidCriterionName { name: text.to_string() }.into() + ) + } } - }, + } } } } @@ -82,13 +85,13 @@ impl fmt::Display for Criterion { use Criterion::*; match self { - Words => f.write_str("words"), - Typo => f.write_str("typo"), - Proximity => f.write_str("proximity"), - Attribute => f.write_str("attribute"), - Exactness => f.write_str("exactness"), - Asc(attr) => write!(f, "asc({})", attr), - Desc(attr) => write!(f, "desc({})", attr), + Words => f.write_str("words"), + Typo => f.write_str("typo"), + Proximity => f.write_str("proximity"), + Attribute => f.write_str("attribute"), + Exactness => f.write_str("exactness"), + Asc(attr) => write!(f, "asc({})", attr), + Desc(attr) 
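// A standalone, illustrative sketch (not part of this patch) of the `asc(..)`/`desc(..)`
// parsing done by the `FromStr` impl above, using the same regex but a simplified,
// made-up return type: `(bool, String)` standing for (ascending, field name).
use once_cell::sync::Lazy;
use regex::Regex;

static ASC_DESC: Lazy<Regex> = Lazy::new(|| Regex::new(r"(asc|desc)\(([\w_-]+)\)").unwrap());

fn parse_sort(text: &str) -> Option<(bool, String)> {
    let caps = ASC_DESC.captures(text)?;
    // Capture 1 is the order keyword, capture 2 the field name, as in the impl above.
    let ascending = caps.get(1)?.as_str() == "asc";
    let field = caps.get(2)?.as_str().to_string();
    Some((ascending, field))
}

// parse_sort("desc(released-timestamp)") == Some((false, "released-timestamp".to_string()))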
=> write!(f, "desc({})", attr), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 78a1b1c59..31012c690 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -2,7 +2,7 @@ use std::convert::Infallible; use std::error::Error as StdError; use std::{fmt, io, str}; -use heed::{MdbError, Error as HeedError}; +use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; @@ -80,14 +80,17 @@ impl From for Error { } } -impl From> for Error where Error: From { +impl From> for Error +where + Error: From, +{ fn from(error: grenad::Error) -> Error { match error { grenad::Error::Io(error) => Error::IoError(error), grenad::Error::Merge(error) => Error::from(error), grenad::Error::InvalidCompressionType => { Error::InternalError(InternalError::GrenadInvalidCompressionType) - }, + } } } } @@ -171,15 +174,15 @@ impl fmt::Display for InternalError { match self { Self::DatabaseMissingEntry { db_name, key } => { write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) - }, + } Self::FieldIdMapMissingEntry(error) => error.fmt(f), Self::Fst(error) => error.fmt(f), Self::GrenadInvalidCompressionType => { f.write_str("invalid compression type have been specified to grenad") - }, + } Self::IndexingMergingKeys { process } => { write!(f, "invalid merge while processing {}", process) - }, + } Self::Serialization(error) => error.fmt(f), Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), Self::RayonThreadPool(error) => error.fmt(f), @@ -204,12 +207,12 @@ impl fmt::Display for UserError { Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); write!(f, "document identifier is invalid {}", json) - }, + } Self::InvalidFilterAttribute(error) => error.fmt(f), Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); write!(f, "document doesn't have an identifier {}", json) - }, + } Self::MissingPrimaryKey => f.write_str("missing primary key"), Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), // TODO where can we find it instead of writing the text ourselves? 
@@ -217,14 +220,14 @@ impl fmt::Display for UserError { Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), Self::PrimaryKeyCannotBeChanged => { f.write_str("primary key cannot be changed if the database contains documents") - }, + } Self::PrimaryKeyCannotBeReset => { f.write_str("primary key cannot be reset if the database contains documents") - }, + } Self::SerdeJson(error) => error.fmt(f), Self::UnknownInternalDocumentId { document_id } => { write!(f, "an unknown internal document id have been used ({})", document_id) - }, + } } } } @@ -236,10 +239,10 @@ impl fmt::Display for FieldIdMapMissingEntry { match self { Self::FieldId { field_id, process } => { write!(f, "unknown field id {} coming from the {} process", field_id, process) - }, + } Self::FieldName { field_name, process } => { write!(f, "unknown field name {} coming from the {} process", field_name, process) - }, + } } } } @@ -251,11 +254,11 @@ impl fmt::Display for SerializationError { match self { Self::Decoding { db_name: Some(name) } => { write!(f, "decoding from the {} database failed", name) - }, + } Self::Decoding { db_name: None } => f.write_str("decoding failed"), Self::Encoding { db_name: Some(name) } => { write!(f, "encoding into the {} database failed", name) - }, + } Self::Encoding { db_name: None } => f.write_str("encoding failed"), Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"), } diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index ee2a6c7bb..3dec638da 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::convert::TryInto; -use fst::{Streamer, IntoStreamer}; + +use fst::{IntoStreamer, Streamer}; pub struct ExternalDocumentsIds<'a> { pub(crate) hard: fst::Map>, @@ -8,7 +9,10 @@ pub struct ExternalDocumentsIds<'a> { } impl<'a> ExternalDocumentsIds<'a> { - pub fn new(hard: fst::Map>, soft: fst::Map>) -> ExternalDocumentsIds<'a> { + pub fn new( + hard: fst::Map>, + soft: fst::Map>, + ) -> ExternalDocumentsIds<'a> { ExternalDocumentsIds { hard, soft } } @@ -29,7 +33,7 @@ impl<'a> ExternalDocumentsIds<'a> { match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { // u64 MAX means deleted in the soft fst map Some(id) if id != u64::MAX => Some(id.try_into().unwrap()), - _otherwise => None + _otherwise => None, } } diff --git a/milli/src/facet/facet_type.rs b/milli/src/facet/facet_type.rs index 09f29bc00..51dd448e2 100644 --- a/milli/src/facet/facet_type.rs +++ b/milli/src/facet/facet_type.rs @@ -2,10 +2,9 @@ use std::error::Error; use std::fmt; use std::str::FromStr; -use serde::{Serialize, Deserialize}; +use serde::{Deserialize, Serialize}; -#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] -#[derive(Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum FacetType { String, Number, @@ -43,4 +42,4 @@ impl fmt::Display for InvalidFacetType { } } -impl Error for InvalidFacetType { } +impl Error for InvalidFacetType {} diff --git a/milli/src/facet/facet_value.rs b/milli/src/facet/facet_value.rs index 99455fa27..eb7fb3c5e 100644 --- a/milli/src/facet/facet_value.rs +++ b/milli/src/facet/facet_value.rs @@ -50,7 +50,7 @@ impl Serialize for FacetValue { FacetValue::Number(number) => { let string = number.to_string(); serializer.serialize_str(&string) - }, + } } } } diff --git a/milli/src/facet/value_encoding.rs 
b/milli/src/facet/value_encoding.rs index 7259243e5..31c00bd2d 100644 --- a/milli/src/facet/value_encoding.rs +++ b/milli/src/facet/value_encoding.rs @@ -28,6 +28,7 @@ fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] { #[cfg(test)] mod tests { use std::cmp::Ordering::Less; + use super::*; fn is_sorted(x: &[T]) -> bool { @@ -39,8 +40,8 @@ mod tests { let a = -13_f64; let b = -10.0; let c = -0.0; - let d = 1.0; - let e = 43.0; + let d = 1.0; + let e = 43.0; let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect(); assert!(is_sorted(&vec), "{:?}", vec); diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 76ff2d281..b0a084c3c 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -1,5 +1,7 @@ use std::collections::BTreeMap; -use serde::{Serialize, Deserialize}; + +use serde::{Deserialize, Serialize}; + use crate::FieldId; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -11,11 +13,7 @@ pub struct FieldsIdsMap { impl FieldsIdsMap { pub fn new() -> FieldsIdsMap { - FieldsIdsMap { - names_ids: BTreeMap::new(), - ids_names: BTreeMap::new(), - next_id: Some(0), - } + FieldsIdsMap { names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0) } } /// Returns the number of fields ids in the map. @@ -62,17 +60,17 @@ impl FieldsIdsMap { } /// Iterate over the ids and names in the ids order. - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> impl Iterator { self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) } /// Iterate over the ids in the order of the ids. - pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + pub fn ids<'a>(&'a self) -> impl Iterator + 'a { self.ids_names.keys().copied() } /// Iterate over the names in the order of the ids. - pub fn names(&self) -> impl Iterator { + pub fn names(&self) -> impl Iterator { self.ids_names.values().map(AsRef::as_ref) } } diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs index a4642f961..b23dcb269 100644 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -71,7 +71,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { #[cfg(test)] mod tests { - use heed::{BytesEncode, BytesDecode}; + use heed::{BytesDecode, BytesEncode}; + use super::*; #[test] diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs index e9b5abeb8..b3c0fa381 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs @@ -1,8 +1,8 @@ use std::borrow::Cow; use std::convert::TryInto; -use crate::{FieldId, DocumentId}; use crate::facet::value_encoding::f64_into_bytes; +use crate::{DocumentId, FieldId}; pub struct FieldDocIdFacetF64Codec; diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index b002346e9..fd3f1143d 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -2,12 +2,17 @@ use std::borrow::Cow; use std::convert::TryInto; use std::str; -use crate::{FieldId, DocumentId}; +use crate::{DocumentId, FieldId}; pub struct FieldDocIdFacetStringCodec; impl FieldDocIdFacetStringCodec { - pub fn serialize_into(field_id: FieldId, document_id: DocumentId, value: &str, out: &mut Vec) { + pub fn 
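// A standalone, illustrative sketch (not part of this patch) of one common way to get
// the ordering property that the `value_encoding` test above checks (encoded bytes
// sort like the floats). This is the classic IEEE-754 trick, not necessarily the
// crate's exact implementation: flip the sign bit of non-negative numbers, flip every
// bit of negative ones, then emit big-endian bytes.
fn order_preserving_f64(x: f64) -> [u8; 8] {
    let bits = x.to_bits();
    let mapped = if x.is_sign_negative() {
        // Negatives: bigger magnitude must sort lower, so invert everything.
        !bits
    } else {
        // Non-negatives: set the sign bit so they sort above all negatives.
        bits ^ (1 << 63)
    };
    mapped.to_be_bytes()
}

// The test values above (-13.0, -10.0, -0.0, 1.0, 43.0) encode to byte arrays that
// sort in the same order.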
serialize_into( + field_id: FieldId, + document_id: DocumentId, + value: &str, + out: &mut Vec, + ) { out.reserve(1 + 4 + value.len()); out.push(field_id); out.extend_from_slice(&document_id.to_be_bytes()); diff --git a/milli/src/heed_codec/field_id_word_count_codec.rs b/milli/src/heed_codec/field_id_word_count_codec.rs index 5796e5020..64f0e1db6 100644 --- a/milli/src/heed_codec/field_id_word_count_codec.rs +++ b/milli/src/heed_codec/field_id_word_count_codec.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, convert::TryInto}; +use std::borrow::Cow; +use std::convert::TryInto; use crate::FieldId; diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 65a06573e..7bd7dff2d 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -1,16 +1,18 @@ mod beu32_str_codec; +pub mod facet; +mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; mod str_level_position_codec; mod str_str_u8_codec; -mod field_id_word_count_codec; -pub mod facet; pub use self::beu32_str_codec::BEU32StrCodec; +pub use self::field_id_word_count_codec::FieldIdWordCountCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; -pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; +pub use self::roaring_bitmap_length::{ + BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, +}; pub use self::str_level_position_codec::StrLevelPositionCodec; pub use self::str_str_u8_codec::StrStrU8Codec; -pub use self::field_id_word_count_codec::FieldIdWordCountCodec; diff --git a/milli/src/heed_codec/obkv_codec.rs b/milli/src/heed_codec/obkv_codec.rs index 94a230e05..b7414b693 100644 --- a/milli/src/heed_codec/obkv_codec.rs +++ b/milli/src/heed_codec/obkv_codec.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; + use obkv::{KvReader, KvWriter}; pub struct ObkvCodec; diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 325effa73..53f64d648 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -75,7 +75,9 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { #[cfg(test)] mod tests { use std::iter::FromIterator; - use heed::{BytesEncode, BytesDecode}; + + use heed::{BytesDecode, BytesEncode}; + use super::*; #[test] diff --git a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs index 755296704..8fae9b8fd 100644 --- a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; + use roaring::RoaringBitmap; pub struct RoaringBitmapCodec; diff --git a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs index 042b5cf6b..4d266e413 100644 --- a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs @@ -1,7 +1,7 @@ -use std::io::{self, Read, BufRead}; +use std::io::{self, BufRead, Read}; use std::mem; -use byteorder::{ReadBytesExt, LittleEndian}; +use byteorder::{LittleEndian, ReadBytesExt}; const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; const SERIAL_COOKIE: u16 = 12347; @@ -16,20 
+16,14 @@ impl RoaringBitmapLenCodec { if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { (bytes.read_u32::()? as usize, true) } else if (cookie as u16) == SERIAL_COOKIE { - return Err(io::Error::new( - io::ErrorKind::Other, - "run containers are unsupported", - )); + return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported")); } else { return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); } }; if size > u16::max_value() as usize + 1 { - return Err(io::Error::new( - io::ErrorKind::Other, - "size is greater than supported", - )); + return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported")); } let mut description_bytes = vec![0u8; size * 4]; @@ -67,12 +61,12 @@ impl heed::BytesDecode<'_> for RoaringBitmapLenCodec { #[cfg(test)] mod tests { - use super::*; - - use crate::heed_codec::RoaringBitmapCodec; use heed::BytesEncode; use roaring::RoaringBitmap; + use super::*; + use crate::heed_codec::RoaringBitmapCodec; + #[test] fn deserialize_roaring_bitmap_length() { let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect(); diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs index 810e91940..5be45bbeb 100644 --- a/milli/src/heed_codec/str_level_position_codec.rs +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -13,7 +13,9 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { fn bytes_decode(bytes: &'a [u8]) -> Option { let footer_len = size_of::() + size_of::() * 2; - if bytes.len() < footer_len { return None } + if bytes.len() < footer_len { + return None; + } let (word, bytes) = bytes.split_at(bytes.len() - footer_len); let word = str::from_utf8(word).ok()?; diff --git a/milli/src/index.rs b/milli/src/index.rs index bf4b3e023..a6c09f3d3 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -3,23 +3,22 @@ use std::collections::{HashMap, HashSet}; use std::path::Path; use chrono::{DateTime, Utc}; -use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use heed::types::*; +use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use crate::error::{UserError, FieldIdMapMissingEntry, InternalError}; -use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search}; -use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result}; -use crate::{ - BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, - ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, - FieldIdWordCountCodec, -}; -use crate::heed_codec::facet::{ - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, - FacetValueStringCodec, FacetLevelValueF64Codec, -}; +use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, +}; +use crate::{ + default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, + DocumentId, ExternalDocumentsIds, FacetDistribution, FieldId, FieldIdWordCountCodec, + FieldsDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, + StrLevelPositionCodec, StrStrU8Codec, BEU32, +}; pub mod main_key { pub const CRITERIA_KEY: &str = "criteria"; @@ -114,14 +113,17 @@ impl Index { let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; let docid_word_positions = 
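// A standalone, illustrative sketch (not part of this patch) of the header sniffing the
// length codec above performs: read little-endian integers with `byteorder` and reject
// unsupported cookies. The cookie constants are copied from the hunk above; the
// function name and error messages are made up.
use std::io;

use byteorder::{LittleEndian, ReadBytesExt};

const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u16 = 12347;

fn read_container_count(mut bytes: &[u8]) -> io::Result<usize> {
    // `&[u8]` implements `Read`, so `read_u32` consumes from the front of the slice.
    let cookie = bytes.read_u32::<LittleEndian>()?;
    if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
        Ok(bytes.read_u32::<LittleEndian>()? as usize)
    } else if (cookie as u16) == SERIAL_COOKIE {
        Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported"))
    } else {
        Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"))
    }
}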
env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; - let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let word_prefix_pair_proximity_docids = + env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; - let word_prefix_level_position_docids = env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; + let word_prefix_level_position_docids = + env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; - let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; + let field_id_docid_facet_strings = + env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; let documents = env.create_database(Some(DOCUMENTS))?; Index::initialize_creation_dates(&env, main)?; @@ -184,18 +186,26 @@ impl Index { /* documents ids */ /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. - pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { + pub(crate) fn put_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) } /// Returns the internal documents ids. pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)? + .unwrap_or_default()) } /// Returns the number of documents indexed in the database. pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result { - let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; + let count = + self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; Ok(count.unwrap_or_default()) } @@ -224,21 +234,30 @@ impl Index { &self, wtxn: &mut RwTxn, external_documents_ids: &ExternalDocumentsIds<'a>, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { let ExternalDocumentsIds { hard, soft } = external_documents_ids; let hard = hard.as_fst().as_bytes(); let soft = soft.as_fst().as_bytes(); - self.main.put::<_, Str, ByteSlice>(wtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; - self.main.put::<_, Str, ByteSlice>(wtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, + hard, + )?; + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, + soft, + )?; Ok(()) } /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). 
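// A standalone, illustrative sketch (not part of this patch) of the getter shape used
// repeatedly above: read an optional entry from the `main` database, propagate real
// errors with `?`, and fall back to the type's default when the key is missing. The
// `lookup` closure and its key are made-up stand-ins for the heed `get` call.
fn documents_count(lookup: impl Fn(&str) -> Result<Option<u64>, String>) -> Result<u64, String> {
    // A missing key becomes 0; a storage error is still surfaced to the caller.
    Ok(lookup("documents-ids")?.unwrap_or_default())
}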
pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { - let hard = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let soft = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let hard = + self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let soft = + self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; let hard = match hard { Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, None => fst::Map::default().map_data(Cow::Owned)?, @@ -254,42 +273,62 @@ impl Index { /// Writes the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. - pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { + pub(crate) fn put_fields_ids_map( + &self, + wtxn: &mut RwTxn, + map: &FieldsIdsMap, + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) } /// Returns the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, SerdeJson>( - rtxn, - main_key::FIELDS_IDS_MAP_KEY, - )?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, SerdeJson>(rtxn, main_key::FIELDS_IDS_MAP_KEY)? + .unwrap_or_default()) } /* fields distribution */ /// Writes the fields distribution which associates every field name with /// the number of times it occurs in the documents. - pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) + pub(crate) fn put_fields_distribution( + &self, + wtxn: &mut RwTxn, + distribution: &FieldsDistribution, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>( + wtxn, + main_key::FIELDS_DISTRIBUTION_KEY, + distribution, + ) } /// Returns the fields distribution which associates every field name with /// the number of times it occurs in the documents. pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, SerdeJson>( - rtxn, - main_key::FIELDS_DISTRIBUTION_KEY, - )?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, SerdeJson>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)? + .unwrap_or_default()) } /* displayed fields */ /// Writes the fields that must be displayed in the defined order. /// There must be not be any duplicate field id. 
- pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) + pub(crate) fn put_displayed_fields( + &self, + wtxn: &mut RwTxn, + fields: &[&str], + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>( + wtxn, + main_key::DISPLAYED_FIELDS_KEY, + &fields, + ) } /// Deletes the displayed fields ids, this will make the engine to display @@ -313,14 +352,17 @@ impl Index { for name in fields.into_iter() { match fields_ids_map.id(name) { Some(field_id) => fields_ids.push(field_id), - None => return Err(FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "Index::displayed_fields_ids", - }.into()), + None => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "Index::displayed_fields_ids", + } + .into()) + } } } Ok(Some(fields_ids)) - }, + } None => Ok(None), } } @@ -328,8 +370,16 @@ impl Index { /* searchable fields */ /// Writes the searchable fields, when this list is specified, only these are indexed. - pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) + pub(crate) fn put_searchable_fields( + &self, + wtxn: &mut RwTxn, + fields: &[&str], + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>( + wtxn, + main_key::SEARCHABLE_FIELDS_KEY, + &fields, + ) } /// Deletes the searchable fields, when no fields are specified, all fields are indexed. @@ -352,14 +402,17 @@ impl Index { for name in fields { match fields_ids_map.id(name) { Some(field_id) => fields_ids.push(field_id), - None => return Err(FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "Index::searchable_fields_ids", - }.into()), + None => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "Index::searchable_fields_ids", + } + .into()) + } } } Ok(Some(fields_ids)) - }, + } None => Ok(None), } } @@ -367,7 +420,11 @@ impl Index { /* filterable fields */ /// Writes the filterable fields names in the database. - pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { + pub(crate) fn put_filterable_fields( + &self, + wtxn: &mut RwTxn, + fields: &HashSet, + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) } @@ -378,10 +435,10 @@ impl Index { /// Returns the filterable fields names. pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result> { - Ok(self.main.get::<_, Str, SerdeJson<_>>( - rtxn, - main_key::FILTERABLE_FIELDS_KEY, - )?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, SerdeJson<_>>(rtxn, main_key::FILTERABLE_FIELDS_KEY)? + .unwrap_or_default()) } /// Identical to `filterable_fields`, but returns ids instead. 
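// A standalone, illustrative sketch (not part of this patch) of the lookup loop shared
// by the `*_fields_ids` accessors above: resolve every configured field name through
// the fields-ids map and fail on the first unknown name. The `BTreeMap` and the string
// error are stand-ins for `FieldsIdsMap` and `FieldIdMapMissingEntry`.
use std::collections::BTreeMap;

fn names_to_ids(map: &BTreeMap<String, u8>, names: &[&str]) -> Result<Vec<u8>, String> {
    let mut ids = Vec::with_capacity(names.len());
    for name in names {
        match map.get(*name) {
            Some(id) => ids.push(*id),
            None => return Err(format!("unknown field name {}", name)),
        }
    }
    Ok(ids)
}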
@@ -394,11 +451,14 @@ impl Index { match fields_ids_map.id(&name) { Some(field_id) => { fields_ids.insert(field_id); - }, - None => return Err(FieldIdMapMissingEntry::FieldName { - field_name: name, - process: "Index::filterable_fields_ids", - }.into()), + } + None => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: name, + process: "Index::filterable_fields_ids", + } + .into()) + } } } @@ -413,9 +473,8 @@ impl Index { pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result> { let filterable_fields = self.filterable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; - let asc_desc_fields = self.criteria(rtxn)? - .into_iter() - .filter_map(|criterion| match criterion { + let asc_desc_fields = + self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { Criterion::Asc(field) | Criterion::Desc(field) => Some(field), _otherwise => None, }); @@ -439,11 +498,14 @@ impl Index { match fields_ids_map.id(&name) { Some(field_id) => { fields_ids.insert(field_id); - }, - None => return Err(FieldIdMapMissingEntry::FieldName { - field_name: name, - process: "Index::faceted_fields_ids", - }.into()), + } + None => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: name, + process: "Index::faceted_fields_ids", + } + .into()) + } } } @@ -458,8 +520,7 @@ impl Index { wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); @@ -472,8 +533,7 @@ impl Index { &self, rtxn: &RoTxn, field_id: FieldId, - ) -> heed::Result - { + ) -> heed::Result { let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); @@ -490,8 +550,7 @@ impl Index { wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); @@ -504,8 +563,7 @@ impl Index { &self, rtxn: &RoTxn, field_id: FieldId, - ) -> heed::Result - { + ) -> heed::Result { let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); @@ -518,7 +576,11 @@ impl Index { /* distinct field */ - pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> { + pub(crate) fn put_distinct_field( + &self, + wtxn: &mut RwTxn, + distinct_field: &str, + ) -> heed::Result<()> { self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) } @@ -532,7 +594,11 @@ impl Index { /* criteria */ - pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { + pub(crate) fn put_criteria( + &self, + wtxn: &mut RwTxn, + criteria: &[Criterion], + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) } @@ -550,7 +616,11 @@ impl Index { /* words fst */ /// Writes the FST which is the words dictionary of the engine. 
- pub(crate) fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_words_fst>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) } @@ -564,7 +634,11 @@ impl Index { /* stop words */ - pub(crate) fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_stop_words>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) } @@ -585,8 +659,7 @@ impl Index { &self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) } @@ -595,15 +668,17 @@ impl Index { } pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result, Vec>>> { - Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)? + .unwrap_or_default()) } pub fn words_synonyms>( &self, rtxn: &RoTxn, words: &[S], - ) -> heed::Result>>> - { + ) -> heed::Result>>> { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms(rtxn)?.remove(&words)) } @@ -611,8 +686,16 @@ impl Index { /* words prefixes fst */ /// Writes the FST which is the words prefixes dictionnary of the engine. - pub(crate) fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) + pub(crate) fn put_words_prefixes_fst>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::WORDS_PREFIXES_FST_KEY, + fst.as_fst().as_bytes(), + ) } /// Returns the FST which is the words prefixes dictionnary of the engine. @@ -637,13 +720,14 @@ impl Index { pub fn documents<'t>( &self, rtxn: &'t RoTxn, - ids: impl IntoIterator, - ) -> Result)>> - { + ids: impl IntoIterator, + ) -> Result)>> { let mut documents = Vec::new(); for id in ids { - let kv = self.documents.get(rtxn, &BEU32::new(id))? + let kv = self + .documents + .get(rtxn, &BEU32::new(id))? .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?; documents.push((id, kv)); } @@ -673,7 +757,8 @@ impl Index { /// Returns the index creation time. pub fn created_at(&self, rtxn: &RoTxn) -> Result> { - Ok(self.main + Ok(self + .main .get::<_, Str, SerdeJson>>(rtxn, main_key::CREATED_AT_KEY)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, @@ -683,7 +768,8 @@ impl Index { /// Returns the index last updated time. pub fn updated_at(&self, rtxn: &RoTxn) -> Result> { - Ok(self.main + Ok(self + .main .get::<_, Str, SerdeJson>>(rtxn, main_key::UPDATED_AT_KEY)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, @@ -691,7 +777,11 @@ impl Index { })?) 
} - pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { + pub(crate) fn set_updated_at( + &self, + wtxn: &mut RwTxn, + time: &DateTime, + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>>(wtxn, main_key::UPDATED_AT_KEY, &time) } } @@ -704,8 +794,8 @@ pub(crate) mod tests { use maplit::hashmap; use tempfile::TempDir; - use crate::Index; use crate::update::{IndexDocuments, UpdateFormat}; + use crate::Index; pub(crate) struct TempIndex { inner: Index, @@ -728,10 +818,7 @@ pub(crate) mod tests { options.map_size(100 * 4096); let _tempdir = TempDir::new_in(".").unwrap(); let inner = Index::new(options, _tempdir.path()).unwrap(); - Self { - inner, - _tempdir - } + Self { inner, _tempdir } } } @@ -756,10 +843,13 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); let fields_distribution = index.fields_distribution(&rtxn).unwrap(); - assert_eq!(fields_distribution, hashmap! { - "id".to_string() => 2, - "name".to_string() => 2, - "age".to_string() => 1, - }); + assert_eq!( + fields_distribution, + hashmap! { + "id".to_string() => 2, + "name".to_string() => 2, + "age".to_string() => 1, + } + ); } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index f37244114..201035a8a 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,14 +1,15 @@ -#[macro_use] extern crate pest_derive; +#[macro_use] +extern crate pest_derive; mod criterion; mod error; mod external_documents_ids; -mod fields_ids_map; -mod search; pub mod facet; +mod fields_ids_map; pub mod heed_codec; pub mod index; pub mod proximity; +mod search; pub mod tree_level; pub mod update; @@ -20,15 +21,17 @@ use std::result::Result as StdResult; use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; -pub use self::criterion::{Criterion, default_criteria}; +pub use self::criterion::{default_criteria, Criterion}; pub use self::error::Error; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; -pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec}; -pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; -pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; +pub use self::heed_codec::{ + BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, + CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, + RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, +}; pub use self::index::Index; -pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords}; +pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult}; pub use self::tree_level::TreeLevel; pub type Result = std::result::Result; @@ -54,9 +57,9 @@ pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReader, -) -> Result> -{ - displayed_fields.iter() +) -> Result> { + displayed_fields + .iter() .copied() .flat_map(|id| obkv.get(id).map(|value| (id, value))) .map(|(id, value)| { @@ -72,7 +75,6 @@ pub fn obkv_to_json( /// Transform a JSON value into a string that can be indexed. 
pub fn json_to_string(value: &Value) -> Option { - fn inner(value: &Value, output: &mut String) -> bool { use std::fmt::Write; match value { @@ -90,7 +92,7 @@ pub fn json_to_string(value: &Value) -> Option { } // check that at least one value was written count != 0 - }, + } Value::Object(object) => { let mut buffer = String::new(); let mut count = 0; @@ -107,7 +109,7 @@ pub fn json_to_string(value: &Value) -> Option { } // check that at least one value was written count != 0 - }, + } } } @@ -121,9 +123,10 @@ pub fn json_to_string(value: &Value) -> Option { #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + #[test] fn json_to_string_object() { let value = json!({ diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs index 0186eb3d0..db98426a5 100644 --- a/milli/src/proximity.rs +++ b/milli/src/proximity.rs @@ -1,4 +1,5 @@ use std::cmp; + use crate::{Attribute, Position}; const ONE_ATTRIBUTE: u32 = 1000; @@ -15,8 +16,11 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { let (lhs_attr, lhs_index) = extract_position(lhs); let (rhs_attr, rhs_index) = extract_position(rhs); - if lhs_attr != rhs_attr { MAX_DISTANCE } - else { index_proximity(lhs_index, rhs_index) } + if lhs_attr != rhs_attr { + MAX_DISTANCE + } else { + index_proximity(lhs_index, rhs_index) + } } pub fn extract_position(position: Position) -> (Attribute, Position) { diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 95f77fd78..ccee2c393 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -5,12 +5,12 @@ use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; +use super::{Criterion, CriterionParameters, CriterionResult}; use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; -use super::{Criterion, CriterionParameters, CriterionResult}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. @@ -57,9 +57,8 @@ impl<'t> AscDesc<'t> { ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = fields_ids_map - .id(&field_name) - .ok_or_else(|| FieldIdMapMissingEntry::FieldName { + let field_id = + fields_ids_map.id(&field_name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { field_name: field_name.clone(), process: "AscDesc::new", })?; @@ -101,44 +100,47 @@ impl<'t> Criterion for AscDesc<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { - self.query_tree = query_tree; - let mut candidates = match (&self.query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - resolve_query_tree(&context, qt, params.wdcache)? - }, - (None, None) => self.index.documents_ids(self.rtxn)?, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + } + None => match self.parent.next(params)? 
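// A standalone, illustrative sketch (not part of this patch) of the position handling in
// `positions_proximity` above: a word position packs an attribute and an offset inside
// that attribute (ONE_ATTRIBUTE = 1000 comes from the hunk above), and two positions in
// different attributes get a fixed worst-case distance. MAX_DISTANCE and the in-attribute
// distance below are simplified stand-ins, not milli's exact `index_proximity`.
const ONE_ATTRIBUTE: u32 = 1000;
const MAX_DISTANCE: u32 = 8;

fn extract(position: u32) -> (u32, u32) {
    (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
}

fn proximity(lhs: u32, rhs: u32) -> u32 {
    let (lhs_attr, lhs_index) = extract(lhs);
    let (rhs_attr, rhs_index) = extract(rhs);
    if lhs_attr != rhs_attr {
        MAX_DISTANCE
    } else {
        // Simplified symmetric distance, capped at the sentinel value.
        (lhs_index.max(rhs_index) - lhs_index.min(rhs_index)).min(MAX_DISTANCE)
    }
}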
{ + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + self.query_tree = query_tree; + let mut candidates = match (&self.query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + resolve_query_tree(&context, qt, params.wdcache)? } + (None, None) => self.index.documents_ids(self.rtxn)?, + }; - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, - } + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } - if candidates.is_empty() { - continue; - } + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } - self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = facet_ordered( - self.index, - self.rtxn, - self.field_id, - self.ascending, - candidates & &self.faceted_candidates, - )?; - }, - None => return Ok(None), + if candidates.is_empty() { + continue; + } + + self.allowed_candidates = &candidates - params.excluded_candidates; + self.candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.ascending, + candidates & &self.faceted_candidates, + )?; } + None => return Ok(None), }, Some(mut candidates) => { candidates -= params.excluded_candidates; @@ -170,11 +172,8 @@ fn facet_ordered<'t>( let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) } else { - let facet_fn = if ascending { - FacetIter::new_reducing - } else { - FacetIter::new_reverse_reducing - }; + let facet_fn = + if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing }; let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } @@ -194,9 +193,7 @@ fn iterative_facet_ordered_iter<'t>( for docid in candidates.iter() { let left = (field_id, docid, f64::MIN); let right = (field_id, docid, f64::MAX); - let mut iter = index - .field_id_docid_facet_f64s - .range(rtxn, &(left..=right))?; + let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; let entry = if ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? 
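// A standalone, illustrative sketch (not part of this patch) of why
// `iterative_facet_ordered_iter` above collects `(docid, OrderedFloat(value))` pairs:
// plain `f64` is not `Ord`, while `OrderedFloat` is, so it can be used directly as a
// sort key. The input data and the `sort_by_facet` name are made up.
use ordered_float::OrderedFloat;

fn sort_by_facet(mut docids_values: Vec<(u32, f64)>, ascending: bool) -> Vec<u32> {
    docids_values.sort_by_key(|(_, value)| OrderedFloat(*value));
    if !ascending {
        docids_values.reverse();
    }
    docids_values.into_iter().map(|(docid, _)| docid).collect()
}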
{ docids_values.push((docid, OrderedFloat(value))); diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index f191defe1..6e0bb40d5 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,15 +1,16 @@ -use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; -use std::collections::{BTreeMap, HashMap, btree_map}; +use std::borrow::Cow; +use std::cmp::{self, Ordering}; use std::collections::binary_heap::PeekMut; +use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap}; use std::mem::take; use roaring::RoaringBitmap; -use crate::{TreeLevel, Result, search::build_dfa}; +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::{word_derivations, WordDerivationsCache}; -use super::{Criterion, CriterionParameters, CriterionResult, Context, resolve_query_tree}; +use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; +use crate::{Result, TreeLevel}; /// To be able to divide integers by the number of words in the query /// we want to find a multiplier that allow us to divide by any number between 1 and 10. @@ -63,15 +64,19 @@ impl<'t> Criterion for Attribute<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { - let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?; + let new_buckets = linear_compute_candidates( + self.ctx, + &flattened_query_tree, + &allowed_candidates, + )?; self.current_buckets.get_or_insert(new_buckets.into_iter()) - }, + } }; match current_buckets.next() { @@ -83,10 +88,15 @@ impl<'t> Criterion for Attribute<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } } } else { - match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? { + match set_compute_candidates( + self.ctx, + &flattened_query_tree, + &allowed_candidates, + params.wdcache, + )? { Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { @@ -95,13 +105,14 @@ impl<'t> Criterion for Attribute<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } } }; allowed_candidates -= &found_candidates; - self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); + self.state = + Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); return Ok(Some(CriterionResult { query_tree: Some(query_tree), @@ -109,39 +120,50 @@ impl<'t> Criterion for Attribute<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- params.excluded_candidates, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? + - params.excluded_candidates } + }; - let flattened_query_tree = flatten_query_tree(&query_tree); + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, - } + let flattened_query_tree = flatten_query_tree(&query_tree); - self.state = Some((query_tree, flattened_query_tree, candidates)); - self.current_buckets = None; - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + self.state = Some((query_tree, flattened_query_tree, candidates)); + self.current_buckets = None; } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } } @@ -152,7 +174,9 @@ impl<'t> Criterion for Attribute<'t> { /// it will begin at the first non-empty interval and will return every interval without /// jumping over empty intervals. struct WordLevelIterator<'t, 'q> { - inner: Box> + 't>, + inner: Box< + dyn Iterator> + 't, + >, level: TreeLevel, interval_size: u32, word: Cow<'q, str>, @@ -162,49 +186,80 @@ struct WordLevelIterator<'t, 'q> { } impl<'t, 'q> WordLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { + fn new( + ctx: &'t dyn Context<'t>, + word: Cow<'q, str>, + in_prefix_cache: bool, + ) -> heed::Result> { match ctx.word_position_last_level(&word, in_prefix_cache)? 
{ - Some(level) => { + Some(level) => { let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); - let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; - Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) - }, + let inner = + ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; + Ok(Some(Self { + inner, + level, + interval_size, + word, + in_prefix_cache, + inner_next: None, + current_interval: None, + })) + } None => Ok(None), } } - fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { + fn dig( + &self, + ctx: &'t dyn Context<'t>, + level: &TreeLevel, + left_interval: Option, + ) -> heed::Result { let level = *level.min(&self.level); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; - let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; + let inner = + ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; - Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) + Ok(Self { + inner, + level, + interval_size, + word, + in_prefix_cache, + inner_next: None, + current_interval: None, + }) } fn next(&mut self) -> heed::Result> { - fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left } + fn is_next_interval(last_right: u32, next_left: u32) -> bool { + last_right + 1 == next_left + } let inner_next = match self.inner_next.take() { Some(inner_next) => Some(inner_next), - None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)), + None => self + .inner + .next() + .transpose()? + .map(|((_, _, left, right), docids)| (left, right, docids)), }; match inner_next { - Some((left, right, docids)) => { - match self.current_interval { - Some((last_left, last_right)) if !is_next_interval(last_right, left) => { - let blank_left = last_left + self.interval_size; - let blank_right = last_right + self.interval_size; - self.current_interval = Some((blank_left, blank_right)); - self.inner_next = Some((left, right, docids)); - Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) - }, - _ => { - self.current_interval = Some((left, right)); - Ok(Some((left, right, docids))) - } + Some((left, right, docids)) => match self.current_interval { + Some((last_left, last_right)) if !is_next_interval(last_right, left) => { + let blank_left = last_left + self.interval_size; + let blank_right = last_right + self.interval_size; + self.current_interval = Some((blank_left, blank_right)); + self.inner_next = Some((left, right, docids)); + Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) + } + _ => { + self.current_interval = Some((left, right)); + Ok(Some((left, right, docids))) } }, None => Ok(None), @@ -228,30 +283,37 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { ctx: &'t dyn Context<'t>, queries: &'q [Query], wdcache: &mut WordDerivationsCache, - ) -> Result> - { + ) -> Result> { let mut inner = Vec::with_capacity(queries.len()); for query in queries { match &query.kind { QueryKind::Exact { word, .. } => { if !query.prefix || ctx.in_prefix_cache(&word) { let word = Cow::Borrowed(query.kind.word()); - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? 
{ + if let Some(word_level_iterator) = + WordLevelIterator::new(ctx, word, query.prefix)? + { inner.push(word_level_iterator); } } else { - for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { + for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? + { let word = Cow::Owned(word.to_owned()); - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + if let Some(word_level_iterator) = + WordLevelIterator::new(ctx, word, false)? + { inner.push(word_level_iterator); } } } - }, + } QueryKind::Tolerant { typo, word } => { - for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { + for (word, _) in + word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? + { let word = Cow::Owned(word.to_owned()); - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? + { inner.push(word_level_iterator); } } @@ -284,17 +346,28 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { Some(parent) => { let parent = parent.dig(ctx)?; (parent.level.min(self.level), Some(Box::new(parent))) - }, + } None => (self.level.saturating_sub(1), None), }; - let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten(); + let left_interval = self + .accumulator + .get(self.interval_to_skip) + .map(|opt| opt.as_ref().map(|(left, _, _)| *left)) + .flatten(); let mut inner = Vec::with_capacity(self.inner.len()); for word_level_iterator in self.inner.iter() { inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); } - Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0}) + Ok(Self { + parent, + inner, + level, + accumulator: vec![], + parent_accumulator: vec![], + interval_to_skip: 0, + }) } fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { @@ -305,12 +378,12 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let wli_u8_level = Into::::into(wli.level); let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { - if let Some((next_left, _, next_docids)) = wli.next()? { - accumulated = match accumulated.take(){ + if let Some((next_left, _, next_docids)) = wli.next()? { + accumulated = match accumulated.take() { Some((acc_left, acc_right, mut acc_docids)) => { acc_docids |= next_docids; Some((acc_left, acc_right, acc_docids)) - }, + } None => Some((next_left, next_left + interval_size, next_docids)), }; } @@ -322,7 +395,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { /// return the next meta-interval created from inner WordLevelIterators, /// and from eventual chainned QueryLevelIterator. 
- fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result> { + fn next( + &mut self, + allowed_candidates: &RoaringBitmap, + tree_level: TreeLevel, + ) -> heed::Result> { let parent_result = match self.parent.as_mut() { Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), None => None, @@ -335,22 +412,30 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { &self.parent_accumulator, &self.accumulator, self.interval_to_skip, - allowed_candidates + allowed_candidates, ); self.accumulator.push(inner_next); self.parent_accumulator.push(parent_next); let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; - for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) { + for current in self + .accumulator + .iter() + .rev() + .zip(self.parent_accumulator.iter()) + .skip(self.interval_to_skip) + { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { match merged_interval.as_mut() { Some((_, _, merged_docids)) => *merged_docids |= a & b, - None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)), + None => { + merged_interval = Some((left_a + left_b, right_a + right_b, a & b)) + } } } } Ok(merged_interval) - }, + } None => { let level = self.level; match self.inner_next(level)? { @@ -358,12 +443,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; candidates &= allowed_candidates; Ok(Some((left, right, candidates))) - - }, + } None => { self.accumulator = vec![None]; Ok(None) - }, + } } } } @@ -379,16 +463,18 @@ fn interval_to_skip( already_skiped: usize, allowed_candidates: &RoaringBitmap, ) -> usize { - parent_accumulator.iter() + parent_accumulator + .iter() .zip(current_accumulator.iter()) .skip(already_skiped) .take_while(|(parent, current)| { let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); - let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); + let skip_current = current + .as_ref() + .map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); skip_parent && skip_current }) .count() - } /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, @@ -410,7 +496,7 @@ impl<'t, 'q> Branch<'t, 'q> { self.last_result = last_result; self.tree_level = tree_level; Ok(true) - }, + } None => Ok(false), } } @@ -429,7 +515,7 @@ impl<'t, 'q> Branch<'t, 'q> { let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); let (left, right, _) = self.last_result; - self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); + self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); } /// return the score of the current inner interval. @@ -477,31 +563,31 @@ fn initialize_query_level_iterators<'t, 'q>( allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> Result>> { - let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { let mut branch_positions = Vec::with_capacity(branch.len()); - for queries in branch { + for queries in branch { match QueryLevelIterator::new(ctx, queries, wdcache)? { Some(qli) => branch_positions.push(qli), None => { // the branch seems to be invalid, so we skip it. branch_positions.clear(); break; - }, + } } } // QueryLevelIterator need to be sorted by level and folded in descending order. 
branch_positions.sort_unstable_by_key(|qli| qli.level); - let folded_query_level_iterators = branch_positions - .into_iter() - .fold(None, |fold: Option, mut qli| match fold { - Some(fold) => { - qli.parent(fold); - Some(qli) - }, - None => Some(qli), - }); + let folded_query_level_iterators = + branch_positions.into_iter().fold(None, |fold: Option, mut qli| { + match fold { + Some(fold) => { + qli.parent(fold); + Some(qli) + } + None => Some(qli), + } + }); if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { let tree_level = folded_query_level_iterators.level; @@ -526,9 +612,9 @@ fn set_compute_candidates<'t>( branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> Result> -{ - let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; +) -> Result> { + let mut branches_heap = + initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; let lowest_level = TreeLevel::min_value(); let mut final_candidates: Option<(u32, RoaringBitmap)> = None; let mut allowed_candidates = allowed_candidates.clone(); @@ -539,15 +625,18 @@ fn set_compute_candidates<'t>( // if current is worst than best we break to return // candidates that correspond to the best rank if let Some((best_rank, _)) = final_candidates { - if branch_rank > best_rank { break } + if branch_rank > best_rank { + break; + } } let _left = branch.last_result.0; let candidates = take(&mut branch.last_result.2); if candidates.is_empty() { // we don't have candidates, get next interval. - if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } - } - else if is_lowest_level { + if !branch.next(&allowed_candidates)? { + PeekMut::pop(branch); + } + } else if is_lowest_level { // we have candidates, but we can't dig deeper. allowed_candidates -= &candidates; final_candidates = match final_candidates.take() { @@ -556,19 +645,20 @@ fn set_compute_candidates<'t>( best_candidates |= candidates; branch.lazy_next(); Some((best_rank, best_candidates)) - }, + } // we take current candidates as best candidates None => { branch.lazy_next(); Some((branch_rank, candidates)) - }, + } }; } else { // we have candidates, lets dig deeper in levels. branch.dig(ctx)?; - if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } + if !branch.next(&allowed_candidates)? { + PeekMut::pop(branch); + } } - } Ok(final_candidates.map(|(_rank, candidates)| candidates)) @@ -578,9 +668,11 @@ fn linear_compute_candidates( ctx: &dyn Context, branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, -) -> Result> -{ - fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap) -> u64 { +) -> Result> { + fn compute_candidate_rank( + branches: &FlattenedQueryTree, + words_positions: HashMap, + ) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { let branch_len = branch.len(); @@ -593,17 +685,20 @@ fn linear_compute_candidates( QueryKind::Exact { word, .. 
} => { if *prefix { word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()).min() + .flat_map(|positions| positions.iter().next()) + .min() } else { - words_positions.get(word) + words_positions + .get(word) .map(|positions| positions.iter().next()) .flatten() } - }, + } QueryKind::Tolerant { typo, word } => { word_derivations(word, *prefix, *typo, &words_positions) - .flat_map(|positions| positions.iter().next()).min() - }, + .flat_map(|positions| positions.iter().next()) + .min() + } }; match (position, current_position) { @@ -627,9 +722,11 @@ fn linear_compute_candidates( branch_rank.sort_unstable(); // because several words in same query can't match all a the position 0, // we substract the word index to the position. - let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); + let branch_rank: u64 = + branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); // here we do the means of the words of the branch - min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); + min_rank = + min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); } } @@ -641,8 +738,7 @@ fn linear_compute_candidates( is_prefix: bool, max_typo: u8, words_positions: &'a HashMap, - ) -> impl Iterator - { + ) -> impl Iterator { let dfa = build_dfa(word, max_typo, is_prefix); words_positions.iter().filter_map(move |(document_word, positions)| { use levenshtein_automata::Distance; @@ -680,25 +776,26 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { } } out - }, + } None => recurse(head), } } fn recurse(op: &Operation) -> FlattenedQueryTree { match op { - And(ops) => { - ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) - }, - Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { - vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] - } else { - ops.iter().map(recurse).flatten().collect() - }, + And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)), + Or(_, ops) => { + if ops.iter().all(|op| op.query().is_some()) { + vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] + } else { + ops.iter().map(recurse).flatten().collect() + } + } Phrase(words) => { - let queries = words.iter().map(|word| { - vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}] - }).collect(); + let queries = words + .iter() + .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) + .collect(); vec![queries] } Operation::Query(query) => vec![vec![vec![query.clone()]]], @@ -712,28 +809,43 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { mod tests { use big_s::S; - use crate::search::criteria::QueryKind; use super::*; + use crate::search::criteria::QueryKind; #[test] fn simple_flatten_query_tree() { - let query_tree = Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }), - 
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), - ]), + let query_tree = Operation::Or( + false, + vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), ]), - ]), - ]); + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("thefish")), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("the")), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("fish")), + }), + ]), + ], + ), + ]), + ], + ); let expected = vec![ vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index eb44b7b8e..1e4d4e7a2 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -2,19 +2,15 @@ use std::convert::TryFrom; use std::mem::take; use std::ops::BitOr; +use itertools::Itertools; use log::debug; use roaring::RoaringBitmap; -use itertools::Itertools; -use crate::search::query_tree::{Operation, PrimitiveQueryPart}; use crate::search::criteria::{ - Context, - Criterion, - CriterionParameters, - CriterionResult, - resolve_query_tree, + resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; -use crate::{TreeLevel, Result}; +use crate::search::query_tree::{Operation, PrimitiveQueryPart}; +use crate::{Result, TreeLevel}; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -26,7 +22,11 @@ pub struct Exactness<'t> { } impl<'t> Exactness<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box, primitive_query: &[PrimitiveQueryPart]) -> heed::Result { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + primitive_query: &[PrimitiveQueryPart], + ) -> heed::Result { let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); for part in primitive_query { query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); @@ -59,7 +59,7 @@ impl<'t> Criterion for Exactness<'t> { // reset state self.state = None; self.query_tree = None; - }, + } Some(state) => { let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; self.state = state; @@ -70,40 +70,51 @@ impl<'t> Criterion for Exactness<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
+ - params.excluded_candidates } + }; - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, - } + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } - self.state = Some(State::new(candidates)); - self.query_tree = Some(query_tree); - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + self.state = Some(State::new(candidates)); + self.query_tree = Some(query_tree); } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } - } + } } } @@ -125,9 +136,9 @@ impl State { fn difference_with(&mut self, lhs: &RoaringBitmap) { match self { - Self::ExactAttribute(candidates) | - Self::AttributeStartsWith(candidates) | - Self::ExactWords(candidates) => *candidates -= lhs, + Self::ExactAttribute(candidates) + | Self::AttributeStartsWith(candidates) + | Self::ExactWords(candidates) => *candidates -= lhs, Self::Remainings(candidates_array) => { candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); candidates_array.retain(|candidates| !candidates.is_empty()); @@ -137,9 +148,9 @@ impl State { fn is_empty(&self) -> bool { match self { - Self::ExactAttribute(candidates) | - Self::AttributeStartsWith(candidates) | - Self::ExactWords(candidates) => candidates.is_empty(), + Self::ExactAttribute(candidates) + | Self::AttributeStartsWith(candidates) + | Self::ExactWords(candidates) => candidates.is_empty(), Self::Remainings(candidates_array) => { candidates_array.iter().all(RoaringBitmap::is_empty) } @@ -158,8 +169,7 @@ fn resolve_state( ctx: &dyn Context, state: State, query: &[ExactQueryPart], -) -> Result<(RoaringBitmap, Option)> -{ +) -> Result<(RoaringBitmap, Option)> { use State::*; match state { ExactAttribute(mut allowed_candidates) => { @@ -167,8 +177,11 @@ fn resolve_state( if let Ok(query_len) = u8::try_from(query.len()) { let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { - if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { - let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; + if let Some(attribute_allowed_docids) = + ctx.field_id_word_count_docids(id, query_len)? 
+ { + let mut attribute_candidates_array = + attribute_start_with_docids(ctx, id as u32, query)?; attribute_candidates_array.push(attribute_allowed_docids); candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -181,12 +194,13 @@ fn resolve_state( } Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) - }, + } AttributeStartsWith(mut allowed_candidates) => { let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { - let attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; + let attribute_candidates_array = + attribute_start_with_docids(ctx, id as u32, query)?; candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -195,7 +209,7 @@ fn resolve_state( // remove current candidates from allowed candidates allowed_candidates -= &candidates; Ok((candidates, Some(ExactWords(allowed_candidates)))) - }, + } ExactWords(mut allowed_candidates) => { let number_of_part = query.len(); let mut parts_candidates_array = Vec::with_capacity(number_of_part); @@ -210,7 +224,7 @@ fn resolve_state( candidates |= synonym_candidates; } } - }, + } // compute intersection on pair of words with a proximity of 0. Phrase(phrase) => { let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); @@ -220,8 +234,8 @@ fn resolve_state( Some(docids) => bitmaps.push(docids), None => { bitmaps.clear(); - break - }, + break; + } } } } @@ -247,7 +261,7 @@ fn resolve_state( // intersect each word candidates in combinations .map(intersection_of) // union combinations of `c_count` exact words - .fold(RoaringBitmap::new(), RoaringBitmap::bitor); + .fold(RoaringBitmap::new(), RoaringBitmap::bitor); // only keep allowed candidates combinations_candidates &= &allowed_candidates; // remove current candidates from allowed candidates @@ -261,7 +275,7 @@ fn resolve_state( candidates_array.reverse(); Ok((all_exact_candidates, Some(Remainings(candidates_array)))) - }, + } // pop remainings candidates until the emptiness Remainings(mut candidates_array) => { let candidates = candidates_array.pop().unwrap_or_default(); @@ -270,12 +284,15 @@ fn resolve_state( } else { Ok((candidates, None)) } - }, - + } } } -fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[ExactQueryPart]) -> heed::Result> { +fn attribute_start_with_docids( + ctx: &dyn Context, + attribute_id: u32, + query: &[ExactQueryPart], +) -> heed::Result> { let lowest_level = TreeLevel::min_value(); let mut attribute_candidates_array = Vec::new(); // start from attribute first position @@ -293,7 +310,7 @@ fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[Ex } attribute_candidates_array.push(synonyms_candidates); pos += 1; - }, + } Phrase(phrase) => { for word in phrase { let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; @@ -325,24 +342,30 @@ pub enum ExactQueryPart { } impl ExactQueryPart { - fn from_primitive_query_part(ctx: &dyn Context, part: &PrimitiveQueryPart) -> heed::Result { + fn from_primitive_query_part( + ctx: &dyn Context, + part: &PrimitiveQueryPart, + ) -> heed::Result { let part = match part { PrimitiveQueryPart::Word(word, _) => { match ctx.synonyms(word)? { Some(synonyms) => { - let mut synonyms: Vec<_> = synonyms.into_iter().filter_map(|mut array| { - // keep 1 word synonyms only. 
- match array.pop() { - Some(word) if array.is_empty() => Some(word), - _ => None, - } - }).collect(); + let mut synonyms: Vec<_> = synonyms + .into_iter() + .filter_map(|mut array| { + // keep 1 word synonyms only. + match array.pop() { + Some(word) if array.is_empty() => Some(word), + _ => None, + } + }) + .collect(); synonyms.push(word.clone()); ExactQueryPart::Synonyms(synonyms) - }, + } None => ExactQueryPart::Synonyms(vec![word.clone()]), } - }, + } PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), }; diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 645a3a5d7..bd3244143 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -1,10 +1,10 @@ use log::debug; use roaring::RoaringBitmap; -use crate::Result; +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context}; +use crate::Result; /// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] @@ -26,7 +26,12 @@ pub struct Final<'t> { impl<'t> Final<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { - Final { ctx, parent, wdcache: WordDerivationsCache::new(), returned_candidates: RoaringBitmap::new() } + Final { + ctx, + parent, + wdcache: WordDerivationsCache::new(), + returned_candidates: RoaringBitmap::new(), + } } #[logging_timer::time("Final::{}")] @@ -40,10 +45,17 @@ impl<'t> Final<'t> { }; match self.parent.next(&mut criterion_parameters)? { - Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + bucket_candidates, + }) => { let mut candidates = match (candidates, query_tree.as_ref()) { (Some(candidates), _) => candidates, - (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates, + (None, Some(qt)) => { + resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates + } (None, None) => self.ctx.documents_ids()? 
- excluded_candidates, }; @@ -56,7 +68,7 @@ impl<'t> Final<'t> { self.returned_candidates |= &candidates; Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) - }, + } None => Ok(None), } } diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index e6d0a17f7..514dbff96 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,15 +1,18 @@ use roaring::RoaringBitmap; -use crate::Result; +use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::query_tree::Operation; -use super::{Criterion, CriterionResult, CriterionParameters}; +use crate::Result; pub struct Initial { - answer: Option + answer: Option, } impl Initial { - pub fn new(query_tree: Option, filtered_candidates: Option) -> Initial { + pub fn new( + query_tree: Option, + filtered_candidates: Option, + ) -> Initial { let answer = CriterionResult { query_tree, candidates: None, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 48af0b8aa..228d48bd7 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,29 +1,28 @@ -use std::collections::HashMap; use std::borrow::Cow; +use std::collections::HashMap; use roaring::RoaringBitmap; -use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; -use crate::{Index, DocumentId, Result}; - -use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use self::asc_desc::AscDesc; use self::attribute::Attribute; use self::exactness::Exactness; -use self::r#final::Final; use self::initial::Initial; use self::proximity::Proximity; +use self::r#final::Final; use self::typo::Typo; use self::words::Words; +use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; +use crate::search::{word_derivations, WordDerivationsCache}; +use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; mod asc_desc; mod attribute; mod exactness; +pub mod r#final; mod initial; mod proximity; mod typo; mod words; -pub mod r#final; pub trait Criterion { fn next(&mut self, params: &mut CriterionParameters) -> Result>; @@ -55,7 +54,7 @@ pub struct CriterionParameters<'a> { #[derive(Debug)] enum Candidates { Allowed(RoaringBitmap), - Forbidden(RoaringBitmap) + Forbidden(RoaringBitmap), } impl Default for Candidates { @@ -68,17 +67,55 @@ pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; - fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; - fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; + fn word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; + fn word_prefix_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; - fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; - fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; - fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; + fn docid_words_positions( + &self, + docid: DocumentId, + ) -> heed::Result>; + fn word_position_iterator( + &self, + word: &str, + level: TreeLevel, + in_prefix_cache: bool, + 
left: Option, + right: Option, + ) -> heed::Result< + Box< + dyn Iterator> + 'c, + >, + >; + fn word_position_last_level( + &self, + word: &str, + in_prefix_cache: bool, + ) -> heed::Result>; fn synonyms(&self, word: &str) -> heed::Result>>>; - fn searchable_fields_ids(&self) -> Result>; - fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result>; - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result>; + fn searchable_fields_ids(&self) -> Result>; + fn field_id_word_count_docids( + &self, + field_id: FieldId, + word_count: u8, + ) -> heed::Result>; + fn word_level_position_docids( + &self, + word: &str, + level: TreeLevel, + left: u32, + right: u32, + ) -> heed::Result>; } pub struct CriteriaBuilder<'t> { @@ -101,12 +138,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.word_prefix_docids.get(self.rtxn, &word) } - fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + fn word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { let key = (left, right, proximity); self.index.word_pair_proximity_docids.get(self.rtxn, &key) } - fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + fn word_prefix_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { let key = (left, right, proximity); self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) } @@ -119,7 +166,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.words_prefixes_fst.contains(word) } - fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + fn docid_words_positions( + &self, + docid: DocumentId, + ) -> heed::Result> { let mut words_positions = HashMap::new(); for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { let ((_, word), positions) = result?; @@ -134,9 +184,12 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { level: TreeLevel, in_prefix_cache: bool, left: Option, - right: Option - ) -> heed::Result> + 'c>> - { + right: Option, + ) -> heed::Result< + Box< + dyn Iterator> + 'c, + >, + > { let range = { let left = left.unwrap_or(u32::min_value()); let right = right.unwrap_or(u32::max_value()); @@ -152,7 +205,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { Ok(Box::new(db.range(self.rtxn, &range)?)) } - fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result> { + fn word_position_last_level( + &self, + word: &str, + in_prefix_cache: bool, + ) -> heed::Result> { let range = { let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); @@ -164,7 +221,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { }; let last_level = db .remap_data_type::() - .range(self.rtxn, &range)?.last().transpose()? + .range(self.rtxn, &range)? + .last() + .transpose()? 
.map(|((_, level, _, _), _)| level); Ok(last_level) @@ -181,12 +240,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { } } - fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result> { + fn field_id_word_count_docids( + &self, + field_id: FieldId, + word_count: u8, + ) -> heed::Result> { let key = (field_id, word_count); self.index.field_id_word_count_docids.get(self.rtxn, &key) } - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result> { + fn word_level_position_docids( + &self, + word: &str, + level: TreeLevel, + left: u32, + right: u32, + ) -> heed::Result> { let key = (word, level, left, right); self.index.word_level_position_docids.get(self.rtxn, &key) } @@ -204,13 +273,13 @@ impl<'t> CriteriaBuilder<'t> { query_tree: Option, primitive_query: Option>, filtered_candidates: Option, - ) -> Result> - { + ) -> Result> { use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); - let mut criterion = Box::new(Initial::new(query_tree, filtered_candidates)) as Box; + let mut criterion = + Box::new(Initial::new(query_tree, filtered_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { Name::Typo => Box::new(Typo::new(self, criterion)), @@ -218,8 +287,12 @@ impl<'t> CriteriaBuilder<'t> { Name::Proximity => Box::new(Proximity::new(self, criterion)), Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), - Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), - Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), + Name::Asc(field) => { + Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) + } + Name::Desc(field) => { + Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) 
+ } }; } @@ -231,21 +304,20 @@ pub fn resolve_query_tree<'t>( ctx: &'t dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, - ) -> Result - { - use Operation::{And, Phrase, Or, Query}; + ) -> Result { + use Operation::{And, Or, Phrase, Query}; match query_tree { And(ops) => { - let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, wdcache) - }).collect::>>()?; + let mut ops = ops + .iter() + .map(|op| resolve_operation(ctx, op, wdcache)) + .collect::>>()?; ops.sort_unstable_by_key(|cds| cds.len()); @@ -260,7 +332,7 @@ pub fn resolve_query_tree<'t>( } } Ok(candidates) - }, + } Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; @@ -276,12 +348,12 @@ pub fn resolve_query_tree<'t>( } else { candidates &= pair_docids; } - }, - None => return Ok(RoaringBitmap::new()) + } + None => return Ok(RoaringBitmap::new()), } } Ok(candidates) - }, + } Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { @@ -289,7 +361,7 @@ pub fn resolve_query_tree<'t>( candidates.union_with(&docids); } Ok(candidates) - }, + } Query(q) => Ok(query_docids(ctx, q, wdcache)?), } } @@ -297,18 +369,18 @@ pub fn resolve_query_tree<'t>( resolve_operation(ctx, query_tree, wdcache) } - fn all_word_pair_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], right_words: &[(U, u8)], - proximity: u8 -) -> Result -{ + proximity: u8, +) -> Result { let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { - let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + let current_docids = ctx + .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? + .unwrap_or_default(); docids.union_with(¤t_docids); } } @@ -319,8 +391,7 @@ fn query_docids( ctx: &dyn Context, query: &Query, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { match &query.kind { QueryKind::Exact { word, .. } => { if query.prefix && ctx.in_prefix_cache(&word) { @@ -336,7 +407,7 @@ fn query_docids( } else { Ok(ctx.word_docids(&word)?.unwrap_or_default()) } - }, + } QueryKind::Tolerant { typo, word } => { let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); @@ -345,7 +416,7 @@ fn query_docids( docids.union_with(¤t_docids); } Ok(docids) - }, + } } } @@ -355,8 +426,7 @@ fn query_pair_proximity_docids( right: &Query, proximity: u8, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { if proximity >= 8 { let mut candidates = query_docids(ctx, left, wdcache)?; let right_candidates = query_docids(ctx, right, wdcache)?; @@ -368,20 +438,31 @@ fn query_pair_proximity_docids( match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix && ctx.in_prefix_cache(&right) { - Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + Ok(ctx + .word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? 
+ .unwrap_or_default()) } else if prefix { let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } else { - Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + Ok(ctx + .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? + .unwrap_or_default()) } - }, + } (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); + let l_words = + word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix && ctx.in_prefix_cache(&right) { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + let current_docids = ctx + .word_prefix_pair_proximity_docids( + left.as_ref(), + right.as_ref(), + proximity, + )? + .unwrap_or_default(); docids.union_with(¤t_docids); } Ok(docids) @@ -391,28 +472,36 @@ fn query_pair_proximity_docids( } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } - }, + } (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) - }, - (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { - let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); + } + ( + QueryKind::Tolerant { typo: l_typo, word: left }, + QueryKind::Tolerant { typo: r_typo, word: right }, + ) => { + let l_words = + word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) - }, + } } } #[cfg(test)] pub mod test { - use maplit::hashmap; - use rand::{Rng, SeedableRng, rngs::StdRng}; - - use super::*; use std::collections::HashMap; - fn s(s: &str) -> String { s.to_string() } + use maplit::hashmap; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + + use super::*; + + fn s(s: &str) -> String { + s.to_string() + } pub struct TestContext<'t> { words_fst: fst::Set>, word_docids: HashMap, @@ -435,12 +524,22 @@ pub mod test { Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) } - fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + fn word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { let key = (left.to_string(), right.to_string(), proximity.into()); Ok(self.word_pair_proximity_docids.get(&key).cloned()) } - fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + fn word_prefix_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { let key = (left.to_string(), right.to_string(), proximity.into()); Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) } @@ -453,24 +552,44 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + fn docid_words_positions( + &self, + docid: DocumentId, + ) -> heed::Result> { if let 
Some(docid_words) = self.docid_words.get(&docid) { Ok(docid_words .iter() .enumerate() - .map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) - .collect() - ) + .map(|(i, w)| { + (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))) + }) + .collect()) } else { Ok(HashMap::new()) } } - fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option, _right: Option) -> heed::Result> + 'c>> { + fn word_position_iterator( + &self, + _word: &str, + _level: TreeLevel, + _in_prefix_cache: bool, + _left: Option, + _right: Option, + ) -> heed::Result< + Box< + dyn Iterator> + + 'c, + >, + > { todo!() } - fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result> { + fn word_position_last_level( + &self, + _word: &str, + _in_prefix_cache: bool, + ) -> heed::Result> { todo!() } @@ -478,15 +597,25 @@ pub mod test { todo!() } - fn searchable_fields_ids(&self) -> Result> { + fn searchable_fields_ids(&self) -> Result> { todo!() } - fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> heed::Result> { + fn word_level_position_docids( + &self, + _word: &str, + _level: TreeLevel, + _left: u32, + _right: u32, + ) -> heed::Result> { todo!() } - fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result> { + fn field_id_word_count_docids( + &self, + _field_id: FieldId, + _word_count: u8, + ) -> heed::Result> { todo!() } } @@ -506,7 +635,7 @@ pub mod test { RoaringBitmap::from_sorted_iter(values.into_iter()) } - let word_docids = hashmap!{ + let word_docids = hashmap! { s("hello") => random_postings(rng, 1500), s("hi") => random_postings(rng, 4000), s("word") => random_postings(rng, 2500), @@ -530,7 +659,7 @@ pub mod test { } } - let word_prefix_docids = hashmap!{ + let word_prefix_docids = hashmap! 
{ s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], @@ -540,7 +669,9 @@ pub mod test { let mut word_prefix_pair_proximity_docids = HashMap::new(); for (lword, lcandidates) in &word_docids { for (rword, rcandidates) in &word_docids { - if lword == rword { continue } + if lword == rword { + continue; + } let candidates = lcandidates & rcandidates; for candidate in candidates { if let Some(docid_words) = docid_words.get(&candidate) { @@ -551,24 +682,31 @@ pub mod test { } else { (s(lword), s(rword), (lposition - rposition + 1) as i32) }; - let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + let docids = word_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); docids.push(candidate); } } } for (pword, pcandidates) in &word_prefix_docids { - if lword.starts_with(pword) { continue } + if lword.starts_with(pword) { + continue; + } let candidates = lcandidates & pcandidates; for candidate in candidates { if let Some(docid_words) = docid_words.get(&candidate) { let lposition = docid_words.iter().position(|w| w == lword).unwrap(); - let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); + let rposition = + docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); let key = if lposition < rposition { (s(lword), s(pword), (rposition - lposition) as i32) } else { (s(lword), s(pword), (lposition - rposition + 1) as i32) }; - let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + let docids = word_prefix_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); docids.push(candidate); } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index c3c8027cb..3e8196e93 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -2,22 +2,16 @@ use std::collections::btree_map::{self, BTreeMap}; use std::collections::hash_map::HashMap; use std::mem::take; -use roaring::RoaringBitmap; use log::debug; +use roaring::RoaringBitmap; -use crate::search::query_tree::{maximum_proximity, Operation, Query}; -use crate::search::{build_dfa, WordDerivationsCache}; -use crate::search::{query_tree::QueryKind}; -use crate::{DocumentId, Position, Result}; use super::{ - Context, - Criterion, - CriterionParameters, - CriterionResult, - query_docids, - query_pair_proximity_docids, - resolve_query_tree, + query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion, + CriterionParameters, CriterionResult, }; +use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; +use crate::search::{build_dfa, WordDerivationsCache}; +use crate::{DocumentId, Position, Result}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; @@ -63,28 +57,33 @@ impl<'t> Criterion for Proximity<'t> { } loop { - debug!("Proximity at iteration {} (max prox {:?}) ({:?})", + debug!( + "Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, self.state.as_ref().map(|(mp, _, _)| mp), self.state.as_ref().map(|(_, _, cd)| cd), ); match &mut self.state { - Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => { + Some((max_prox, _, allowed_candidates)) + if allowed_candidates.is_empty() || self.proximity > *max_prox => + { self.state = None; // reset state - }, + } Some((_, query_tree, 
allowed_candidates)) => { - let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { + let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD + && self.proximity > PROXIMITY_THRESHOLD + { if let Some(cache) = self.plane_sweep_cache.as_mut() { match cache.next() { Some((p, candidates)) => { self.proximity = p; candidates - }, + } None => { self.state = None; // reset state - continue - }, + continue; + } } } else { let cache = resolve_plane_sweep_candidates( @@ -95,9 +94,10 @@ impl<'t> Criterion for Proximity<'t> { )?; self.plane_sweep_cache = Some(cache.into_iter()); - continue + continue; } - } else { // use set theory based algorithm + } else { + // use set theory based algorithm resolve_candidates( self.ctx, &query_tree, @@ -117,39 +117,50 @@ impl<'t> Criterion for Proximity<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? + - params.excluded_candidates } + }; - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, - } + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } - let maximum_proximity = maximum_proximity(&query_tree); - self.state = Some((maximum_proximity as u8, query_tree, candidates)); - self.proximity = 0; - self.plane_sweep_cache = None; - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + let maximum_proximity = maximum_proximity(&query_tree); + self.state = Some((maximum_proximity as u8, query_tree, candidates)); + self.proximity = 0; + self.plane_sweep_cache = None; } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } } @@ -162,46 +173,48 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> Result> - { - use Operation::{And, Phrase, Or}; + ) -> Result> { + use Operation::{And, Or, Phrase}; let result = match query_tree { And(ops) => mdfs(ctx, ops, 
proximity, cache, wdcache)?, - Phrase(words) => if proximity == 0 { - let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let mut candidates = None; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); - match ctx.word_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => { - match candidates.as_mut() { + Phrase(words) => { + if proximity == 0 { + let most_left = words + .first() + .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); + let most_right = words + .last() + .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); + let mut candidates = None; + for slice in words.windows(2) { + let (left, right) = (&slice[0], &slice[1]); + match ctx.word_pair_proximity_docids(left, right, 1)? { + Some(pair_docids) => match candidates.as_mut() { Some(candidates) => *candidates &= pair_docids, None => candidates = Some(pair_docids), + }, + None => { + candidates = None; + break; } - }, - None => { - candidates = None; - break; } } + match (most_left, most_right, candidates) { + (Some(l), Some(r), Some(c)) => vec![(l, r, c)], + _otherwise => Default::default(), + } + } else { + Default::default() } - match (most_left, most_right, candidates) { - (Some(l), Some(r), Some(c)) => vec![(l, r, c)], - _otherwise => Default::default(), - } - } else { - Default::default() - }, + } Or(_, ops) => { let mut output = Vec::new(); for op in ops { @@ -209,13 +222,15 @@ fn resolve_candidates<'t>( output.extend(result); } output - }, - Operation::Query(q) => if proximity == 0 { - let candidates = query_docids(ctx, q, wdcache)?; - vec![(q.clone(), q.clone(), candidates)] - } else { - Default::default() - }, + } + Operation::Query(q) => { + if proximity == 0 { + let candidates = query_docids(ctx, q, wdcache)?; + vec![(q.clone(), q.clone(), candidates)] + } else { + Default::default() + } + } }; Ok(result) @@ -228,8 +243,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> Result> - { + ) -> Result> { fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { (0..=mana.min(left_max)).map(move |m| (m, mana - m)) } @@ -257,7 +271,8 @@ fn resolve_candidates<'t>( for (ll, lr, lcandidates) in lefts { for (rl, rr, rcandidates) in rights { - let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; + let mut candidates = + query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; if lcandidates.len() < rcandidates.len() { candidates.intersect_with(lcandidates); candidates.intersect_with(rcandidates); @@ -282,22 +297,26 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> Result> - { + ) -> Result> { // Extract the first two elements but gives the tail // that is just after the first element. 
- let next = branches.split_first().map(|(h1, t)| { - (h1, t.split_first().map(|(h2, _)| (h2, t))) - }); + let next = + branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t)))); match next { - Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache, wdcache), + Some((head1, Some((head2, [_])))) => { + mdfs_pair(ctx, head1, head2, proximity, cache, wdcache) + } Some((head1, Some((head2, tail)))) => { let mut output = Vec::new(); for p in 0..=proximity { - for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache, wdcache)? { + for (lhead, _, head_candidates) in + mdfs_pair(ctx, head1, head2, p, cache, wdcache)? + { if !head_candidates.is_empty() { - for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? { + for (_, rtail, mut candidates) in + mdfs(ctx, tail, proximity - p, cache, wdcache)? + { candidates.intersect_with(&head_candidates); if !candidates.is_empty() { output.push((lhead.clone(), rtail, candidates)); @@ -307,7 +326,7 @@ fn resolve_candidates<'t>( } } Ok(output) - }, + } Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache), None => Ok(Default::default()), } @@ -325,47 +344,48 @@ fn resolve_plane_sweep_candidates( query_tree: &Operation, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> Result> -{ +) -> Result> { /// FIXME may be buggy with query like "new new york" fn plane_sweep( groups_positions: Vec>, consecutive: bool, - ) -> Result> - { + ) -> Result> { fn compute_groups_proximity( groups: &[(usize, (Position, u8, Position))], consecutive: bool, - ) -> Option<(Position, u8, Position)> - { + ) -> Option<(Position, u8, Position)> { // take the inner proximity of the first group as initial let (_, (_, mut proximity, _)) = groups.first()?; let (_, (left_most_pos, _, _)) = groups.first()?; - let (_, (_, _, right_most_pos)) = groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; + let (_, (_, _, right_most_pos)) = + groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; for pair in groups.windows(2) { if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair { // if two positions are equal, meaning that they share at least a word, we return None if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 { - return None + return None; } let pair_proximity = { // if intervals are disjoint [..].(..) - if lpos2 > rpos1 { lpos2 - rpos1 } + if lpos2 > rpos1 { + lpos2 - rpos1 + } // if the second interval is a subset of the first [.(..).] - else if rpos2 < rpos1 { (lpos2 - lpos1).min(rpos1 - rpos2) } + else if rpos2 < rpos1 { + (lpos2 - lpos1).min(rpos1 - rpos2) + } // if intervals overlaps [.(..].) 
- else { (lpos2 - lpos1).min(rpos2 - rpos1) } + else { + (lpos2 - lpos1).min(rpos2 - rpos1) + } }; // if groups are in the good order (query order) we remove 1 to the proximity // the proximity is clamped to 7 - let pair_proximity = if i1 < i2 { - (pair_proximity - 1).min(7) - } else { - pair_proximity.min(7) - }; + let pair_proximity = + if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) }; proximity += pair_proximity as u8 + prox2; } @@ -381,7 +401,8 @@ fn resolve_plane_sweep_candidates( let groups_len = groups_positions.len(); - let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); + let mut groups_positions: Vec<_> = + groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); // Pop top elements of each list. let mut current = Vec::with_capacity(groups_len); @@ -452,9 +473,8 @@ fn resolve_plane_sweep_candidates( rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, words_positions: &HashMap, wdcache: &mut WordDerivationsCache, - ) -> Result> - { - use Operation::{And, Phrase, Or}; + ) -> Result> { + use Operation::{And, Or, Phrase}; if let Some(result) = rocache.get(query_tree) { return Ok(result.clone()); @@ -462,13 +482,20 @@ fn resolve_plane_sweep_candidates( let result = match query_tree { And(ops) => { - let mut groups_positions = Vec::with_capacity(ops.len()); + let mut groups_positions = Vec::with_capacity(ops.len()); for operation in ops { - let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; + let positions = resolve_operation( + ctx, + operation, + docid, + rocache, + words_positions, + wdcache, + )?; groups_positions.push(positions); } plane_sweep(groups_positions, false)? - }, + } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); for word in words { @@ -479,16 +506,23 @@ fn resolve_plane_sweep_candidates( groups_positions.push(positions); } plane_sweep(groups_positions, true)? - }, + } Or(_, ops) => { let mut result = Vec::new(); for op in ops { - result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?) + result.extend(resolve_operation( + ctx, + op, + docid, + rocache, + words_positions, + wdcache, + )?) 
} result.sort_unstable(); result - }, + } Operation::Query(Query { prefix, kind }) => { let mut result = Vec::new(); match kind { @@ -498,9 +532,9 @@ fn resolve_plane_sweep_candidates( .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } else if let Some(positions) = words_positions.get(word) { - result.extend(positions.iter().map(|p| (p, 0, p))); + result.extend(positions.iter().map(|p| (p, 0, p))); } - }, + } QueryKind::Tolerant { typo, word } => { let iter = word_derivations(word, *prefix, *typo, &words_positions) .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); @@ -522,8 +556,7 @@ fn resolve_plane_sweep_candidates( is_prefix: bool, max_typo: u8, words_positions: &'a HashMap, - ) -> impl Iterator - { + ) -> impl Iterator { let dfa = build_dfa(word, max_typo, is_prefix); words_positions.iter().filter_map(move |(document_word, positions)| { use levenshtein_automata::Distance; @@ -539,7 +572,7 @@ fn resolve_plane_sweep_candidates( for docid in allowed_candidates { let words_positions = ctx.docid_words_positions(docid)?; resolve_operation_cache.clear(); - let positions = resolve_operation( + let positions = resolve_operation( ctx, query_tree, docid, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 436f4affd..f4ae15f0a 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,20 +1,17 @@ -use std::{borrow::Cow, collections::HashMap, mem::take}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::mem::take; use log::debug; use roaring::RoaringBitmap; +use super::{ + query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, + CriterionResult, +}; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::Result; -use super::{ - Candidates, - Context, - Criterion, - CriterionParameters, - CriterionResult, - query_docids, - resolve_query_tree, -}; /// Maximum number of typo for a word of any length. const MAX_TYPOS_PER_WORD: u8 = 2; @@ -54,7 +51,8 @@ impl<'t> Criterion for Typo<'t> { } loop { - debug!("Typo at iteration {} (max typos {:?}) ({:?})", + debug!( + "Typo at iteration {} (max typos {:?}) ({:?})", self.typos, self.state.as_ref().map(|(mt, _, _)| mt), self.state.as_ref().map(|(_, _, cd)| cd), @@ -63,29 +61,42 @@ impl<'t> Criterion for Typo<'t> { match self.state.as_mut() { Some((max_typos, _, _)) if self.typos > *max_typos => { self.state = None; // reset state - }, + } Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { self.state = None; // reset state - }, + } Some((_, query_tree, candidates_authorization)) => { let fst = self.ctx.words_fst(); let new_query_tree = match self.typos { - typos if typos < MAX_TYPOS_PER_WORD => { - alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? 
- }, + typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree( + &fst, + query_tree.clone(), + self.typos, + params.wdcache, + )?, MAX_TYPOS_PER_WORD => { // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, // we keep the altered query tree - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; + *query_tree = alterate_query_tree( + &fst, + query_tree.clone(), + self.typos, + params.wdcache, + )?; // we compute the allowed candidates - let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; + let query_tree_allowed_candidates = + resolve_query_tree(self.ctx, query_tree, params.wdcache)?; // we assign the allowed candidates to the candidates authorization. *candidates_authorization = match take(candidates_authorization) { - Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), - Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), + Allowed(allowed_candidates) => { + Allowed(query_tree_allowed_candidates & allowed_candidates) + } + Forbidden(forbidden_candidates) => { + Allowed(query_tree_allowed_candidates - forbidden_candidates) + } }; query_tree.clone() - }, + } _otherwise => query_tree.clone(), }; @@ -101,11 +112,11 @@ impl<'t> Criterion for Typo<'t> { Allowed(allowed_candidates) => { candidates &= &*allowed_candidates; *allowed_candidates -= &candidates; - }, + } Forbidden(forbidden_candidates) => { candidates -= &*forbidden_candidates; *forbidden_candidates |= &candidates; - }, + } } let bucket_candidates = match self.bucket_candidates.as_mut() { @@ -121,35 +132,45 @@ impl<'t> Criterion for Typo<'t> { filtered_candidates: None, bucket_candidates: Some(bucket_candidates), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + } + None => match self.parent.next(params)? 
{ + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + self.bucket_candidates = + match (self.bucket_candidates.take(), bucket_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; - let candidates = match candidates.or(filtered_candidates) { - Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates), - None => Candidates::Forbidden(params.excluded_candidates.clone()), - }; + let candidates = match candidates.or(filtered_candidates) { + Some(candidates) => { + Candidates::Allowed(candidates - params.excluded_candidates) + } + None => Candidates::Forbidden(params.excluded_candidates.clone()), + }; - let maximum_typos = maximum_typo(&query_tree) as u8; - self.state = Some((maximum_typos, query_tree, candidates)); - self.typos = 0; - - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), + let maximum_typos = maximum_typo(&query_tree) as u8; + self.state = Some((maximum_typos, query_tree, candidates)); + self.typos = 0; } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } } @@ -164,21 +185,19 @@ fn alterate_query_tree( mut query_tree: Operation, number_typos: u8, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { fn recurse( words_fst: &fst::Set>, operation: &mut Operation, number_typos: u8, wdcache: &mut WordDerivationsCache, - ) -> Result<()> - { - use Operation::{And, Phrase, Or}; + ) -> Result<()> { + use Operation::{And, Or, Phrase}; match operation { And(ops) | Or(_, ops) => { ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) - }, + } // Because Phrases don't allow typos, no alteration can be done. 
Phrase(_words) => return Ok(()), Operation::Query(q) => { @@ -193,19 +212,25 @@ fn alterate_query_tree( } else { let typo = *typo.min(&number_typos); let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; - let queries = words.iter().map(|(word, typo)| { - Operation::Query(Query { - prefix: false, - kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() }, + let queries = words + .iter() + .map(|(word, typo)| { + Operation::Query(Query { + prefix: false, + kind: QueryKind::Exact { + original_typo: *typo, + word: word.to_string(), + }, + }) }) - }).collect(); + .collect(); *operation = Operation::or(false, queries); } } Ok(()) - }, + } } } @@ -219,22 +244,18 @@ fn resolve_candidates<'t>( number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, - ) -> Result - { - use Operation::{And, Phrase, Or, Query}; + ) -> Result { + use Operation::{And, Or, Phrase, Query}; match query_tree { - And(ops) => { - mdfs(ctx, ops, number_typos, cache, wdcache) - }, + And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; @@ -250,12 +271,12 @@ fn resolve_candidates<'t>( } else { candidates &= pair_docids; } - }, - None => return Ok(RoaringBitmap::new()) + } + None => return Ok(RoaringBitmap::new()), } } Ok(candidates) - }, + } Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { @@ -263,12 +284,14 @@ fn resolve_candidates<'t>( candidates.union_with(&docids); } Ok(candidates) - }, - Query(q) => if q.kind.typo() == number_typos { - Ok(query_docids(ctx, q, wdcache)?) - } else { - Ok(RoaringBitmap::new()) - }, + } + Query(q) => { + if q.kind.typo() == number_typos { + Ok(query_docids(ctx, q, wdcache)?) 
+ } else { + Ok(RoaringBitmap::new()) + } + } } } @@ -278,8 +301,7 @@ fn resolve_candidates<'t>( mana: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, - ) -> Result - { + ) -> Result { match branches.split_first() { Some((head, [])) => { let cache_key = (head.clone(), mana); @@ -290,7 +312,7 @@ fn resolve_candidates<'t>( cache.insert(cache_key, candidates.clone()); Ok(candidates) } - }, + } Some((head, tail)) => { let mut candidates = RoaringBitmap::new(); @@ -313,7 +335,7 @@ fn resolve_candidates<'t>( } Ok(candidates) - }, + } None => Ok(RoaringBitmap::new()), } } @@ -323,9 +345,9 @@ fn resolve_candidates<'t>( #[cfg(test)] mod test { - use super::*; use super::super::initial::Initial; use super::super::test::TestContext; + use super::*; #[test] fn initial_placeholder_no_facets() { @@ -348,13 +370,23 @@ mod test { #[test] fn initial_query_tree_no_facets() { let context = TestContext::default(); - let query_tree = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), - ]) - ]); + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); let facet_candidates = None; @@ -369,13 +401,23 @@ mod test { & context.word_docids("this").unwrap().unwrap() & context.word_docids("world").unwrap().unwrap(); let expected_1 = CriterionResult { - query_tree: Some(Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), - ]), - ])), + query_tree: Some(Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("world".to_string()), + }), + ])], + )), candidates: Some(candidates_1.clone()), bucket_candidates: Some(candidates_1), filtered_candidates: None, @@ -383,22 +425,37 @@ mod test { assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); - let candidates_2 = ( - context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("word").unwrap().unwrap() - ) - context.word_docids("world").unwrap().unwrap(); + let candidates_2 = (context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap()) + - context.word_docids("world").unwrap().unwrap(); let expected_2 = CriterionResult { - query_tree: Some(Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) 
}), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), - ]), - ]), - ])), + query_tree: Some(Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact_with_typo(1, "word".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("world".to_string()), + }), + ], + ), + ])], + )), candidates: Some(candidates_2.clone()), bucket_candidates: Some(candidates_2), filtered_candidates: None, @@ -437,17 +494,26 @@ mod test { #[test] fn initial_query_tree_with_facets() { let context = TestContext::default(); - let query_tree = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), - ]) - ]); + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), @@ -459,13 +525,23 @@ mod test { & context.word_docids("this").unwrap().unwrap() & context.word_docids("world").unwrap().unwrap(); let expected_1 = CriterionResult { - query_tree: Some(Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), - ]), - ])), + query_tree: Some(Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("world".to_string()), + }), + ])], + )), candidates: Some(&candidates_1 & &facet_candidates), bucket_candidates: Some(&candidates_1 & &facet_candidates), filtered_candidates: None, @@ -473,22 +549,37 @@ mod test { assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); - let candidates_2 = ( - context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("word").unwrap().unwrap() - ) - context.word_docids("world").unwrap().unwrap(); + let candidates_2 = (context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap()) + - context.word_docids("world").unwrap().unwrap(); let expected_2 = 
CriterionResult { - query_tree: Some(Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), - ]), - ]), - ])), + query_tree: Some(Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact_with_typo(1, "word".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("world".to_string()), + }), + ], + ), + ])], + )), candidates: Some(&candidates_2 & &facet_candidates), bucket_candidates: Some(&candidates_2 & &facet_candidates), filtered_candidates: None, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index add90d80d..ccc6c0617 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -3,9 +3,9 @@ use std::mem::take; use log::debug; use roaring::RoaringBitmap; +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; use crate::search::query_tree::Operation; use crate::Result; -use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree}; pub struct Words<'t> { ctx: &'t dyn Context<'t>, @@ -44,11 +44,12 @@ impl<'t> Criterion for Words<'t> { Some(query_tree) => { let candidates = match self.candidates.as_mut() { Some(allowed_candidates) => { - let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; + let mut candidates = + resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; candidates &= &*allowed_candidates; *allowed_candidates -= &candidates; Some(candidates) - }, + } None => None, }; @@ -63,29 +64,38 @@ impl<'t> Criterion for Words<'t> { filtered_candidates: self.filtered_candidates.clone(), bucket_candidates, })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - self.query_trees = explode_query_tree(query_tree); - self.candidates = candidates; - self.filtered_candidates = filtered_candidates; + } + None => match self.parent.next(params)? 
{ + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + self.query_trees = explode_query_tree(query_tree); + self.candidates = candidates; + self.filtered_candidates = filtered_candidates; - self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + self.bucket_candidates = + match (self.bucket_candidates.take(), bucket_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index b9ffd9d90..290a7602f 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -3,11 +3,11 @@ use std::mem::size_of; use heed::types::ByteSlice; use roaring::RoaringBitmap; +use super::{Distinct, DocIter}; use crate::error::InternalError; use crate::heed_codec::facet::*; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; -use super::{Distinct, DocIter}; const FID_SIZE: usize = size_of::(); const DOCID_SIZE: usize = size_of::(); @@ -28,11 +28,7 @@ pub struct FacetDistinct<'a> { impl<'a> FacetDistinct<'a> { pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { - Self { - distinct, - index, - txn, - } + Self { distinct, index, txn } } } @@ -47,16 +43,12 @@ pub struct FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { - self.index - .facet_id_string_docids - .get(self.txn, &(self.distinct, key)) + self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { // get facet docids on level 0 - self.index - .facet_id_f64_docids - .get(self.txn, &(self.distinct, 0, key, key)) + self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key)) } fn distinct_string(&mut self, id: DocumentId) -> Result<()> { @@ -64,9 +56,8 @@ impl<'a> FacetDistinctIter<'a> { for item in iter { let ((_, _, value), _) = item?; - let facet_docids = self - .facet_string_docids(value)? - .ok_or(InternalError::DatabaseMissingEntry { + let facet_docids = + self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::FACET_ID_STRING_DOCIDS, key: None, })?; @@ -83,9 +74,8 @@ impl<'a> FacetDistinctIter<'a> { for item in iter { let ((_, _, value), _) = item?; - let facet_docids = self - .facet_number_docids(value)? 
- .ok_or(InternalError::DatabaseMissingEntry { + let facet_docids = + self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::FACET_ID_F64_DOCIDS, key: None, })?; diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 99bc74be0..ae3fdb91e 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -1,11 +1,11 @@ mod facet_distinct; mod noop_distinct; +pub use facet_distinct::FacetDistinct; +pub use noop_distinct::NoopDistinct; use roaring::RoaringBitmap; use crate::{DocumentId, Result}; -pub use facet_distinct::FacetDistinct; -pub use noop_distinct::NoopDistinct; /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. /// It provides a way to get back the ownership to the excluded set. @@ -29,13 +29,15 @@ mod test { use std::collections::HashSet; use once_cell::sync::Lazy; - use rand::{seq::SliceRandom, Rng}; + use rand::seq::SliceRandom; + use rand::Rng; use roaring::RoaringBitmap; use serde_json::{json, Value}; - use crate::index::{Index, tests::TempIndex}; + use crate::index::tests::TempIndex; + use crate::index::Index; use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; - use crate::{BEU32, FieldId, DocumentId}; + use crate::{DocumentId, FieldId, BEU32}; static JSON: Lazy = Lazy::new(generate_json); @@ -89,9 +91,7 @@ mod test { addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); addition.update_format(UpdateFormat::Json); - addition - .execute(JSON.to_string().as_bytes(), |_, _| ()) - .unwrap(); + addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); @@ -103,13 +103,12 @@ mod test { (index, fid, map) } - /// Checks that all the candidates are distinct, and returns the candidates number. pub(crate) fn validate_distinct_candidates( candidates: impl Iterator>, distinct: FieldId, index: &Index, - ) -> usize { + ) -> usize { fn test(seen: &mut HashSet, value: &Value) { match value { Value::Null | Value::Object(_) | Value::Bool(_) => (), @@ -117,7 +116,7 @@ mod test { let s = value.to_string(); assert!(seen.insert(s)); } - Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))} + Value::Array(values) => values.into_iter().for_each(|value| test(seen, value)), } } diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 812701794..96a1f7d5d 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -1,7 +1,8 @@ -use roaring::{RoaringBitmap, bitmap::IntoIter}; +use roaring::bitmap::IntoIter; +use roaring::RoaringBitmap; +use super::{Distinct, DocIter}; use crate::{DocumentId, Result}; -use super::{DocIter, Distinct}; /// A distinct implementer that does not perform any distinct, /// and simply returns an iterator to the candidates. 
@@ -30,10 +31,7 @@ impl Distinct for NoopDistinct { type Iter = NoopDistinctIter; fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { - NoopDistinctIter { - candidates: candidates.into_iter(), - excluded, - } + NoopDistinctIter { candidates: candidates.into_iter(), excluded } } } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 265a8ffeb..0a2036494 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,16 +1,16 @@ -use std::collections::{HashSet, BTreeMap}; +use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; use std::{cmp, fmt}; -use heed::{Database, BytesDecode}; use heed::types::{ByteSlice, Unit}; +use heed::{BytesDecode, Database}; use roaring::RoaringBitmap; use crate::error::FieldIdMapMissingEntry; use crate::facet::FacetType; use crate::heed_codec::facet::FacetValueStringCodec; use crate::search::facet::{FacetIter, FacetRange}; -use crate::{Index, FieldId, DocumentId, Result}; +use crate::{DocumentId, FieldId, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -43,7 +43,7 @@ impl<'a> FacetDistribution<'a> { } } - pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { + pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect()); self } @@ -66,8 +66,7 @@ impl<'a> FacetDistribution<'a> { facet_type: FacetType, candidates: &RoaringBitmap, distribution: &mut BTreeMap, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { fn fetch_facet_values<'t, KC, K: 't>( rtxn: &'t heed::RoTxn, db: Database, @@ -102,7 +101,7 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let db = self.index.field_id_docid_facet_f64s; fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) - }, + } FacetType::String => { let db = self.index.field_id_docid_facet_strings; fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) @@ -117,11 +116,9 @@ impl<'a> FacetDistribution<'a> { field_id: FieldId, candidates: &RoaringBitmap, distribution: &mut BTreeMap, - ) -> heed::Result<()> - { - let iter = FacetIter::new_non_reducing( - self.rtxn, self.index, field_id, candidates.clone(), - )?; + ) -> heed::Result<()> { + let iter = + FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; for result in iter { let (value, mut docids) = result?; @@ -142,8 +139,7 @@ impl<'a> FacetDistribution<'a> { fn facet_values_from_raw_facet_database( &self, field_id: FieldId, - ) -> heed::Result> - { + ) -> heed::Result> { let mut distribution = BTreeMap::new(); let db = self.index.facet_id_f64_docids; @@ -157,7 +153,8 @@ impl<'a> FacetDistribution<'a> { } } - let iter = self.index + let iter = self + .index .facet_id_string_docids .remap_key_type::() .prefix_iter(self.rtxn, &[field_id])? @@ -182,11 +179,30 @@ impl<'a> FacetDistribution<'a> { // to those candidates. We also enter here for facet strings for performance reasons. 
let mut distribution = BTreeMap::new(); if candidates.len() <= CANDIDATES_THRESHOLD { - self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?; - self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; + self.facet_distribution_from_documents( + field_id, + Number, + candidates, + &mut distribution, + )?; + self.facet_distribution_from_documents( + field_id, + String, + candidates, + &mut distribution, + )?; } else { - self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?; - self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; + self.facet_numbers_distribution_from_facet_levels( + field_id, + candidates, + &mut distribution, + )?; + self.facet_distribution_from_documents( + field_id, + String, + candidates, + &mut distribution, + )?; } Ok(distribution) @@ -201,10 +217,11 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); for name in filterable_fields { - let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { - field_name: name.clone(), - process: "FacetDistribution::execute", - })?; + let fid = + fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: name.clone(), + process: "FacetDistribution::execute", + })?; let values = self.facet_values(fid)?; distribution.insert(name, values); } @@ -215,13 +232,7 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { - facets, - candidates, - max_values_by_facet, - rtxn: _, - index: _, - } = self; + let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; f.debug_struct("FacetDistribution") .field("facets", facets) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 424118f77..31fc6018c 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; use std::fmt::Debug; -use std::ops::Bound::{self, Included, Excluded}; +use std::ops::Bound::{self, Excluded, Included}; use std::result::Result as StdResult; use std::str::FromStr; @@ -12,16 +12,13 @@ use pest::iterators::{Pair, Pairs}; use pest::Parser; use roaring::RoaringBitmap; -use crate::error::UserError; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; -use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec, Result}; - -use super::FacetRange; -use super::parser::Rule; -use super::parser::{PREC_CLIMBER, FilterParser}; - use self::FilterCondition::*; use self::Operator::*; +use super::parser::{FilterParser, Rule, PREC_CLIMBER}; +use super::FacetRange; +use crate::error::UserError; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec}; +use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] pub enum Operator { @@ -39,13 +36,13 @@ impl Operator { /// an OR operation for the between case (i.e. `TO`). 
fn negate(self) -> (Self, Option) { match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThan(n) => (LowerThanOrEqual(n), None), GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(n, s) => (NotEqual(n, s), None), - NotEqual(n, s) => (Equal(n, s), None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + Equal(n, s) => (NotEqual(n, s), None), + NotEqual(n, s) => (Equal(n, s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), } } } @@ -63,10 +60,11 @@ impl FilterCondition { index: &Index, array: I, ) -> Result> - where I: IntoIterator>, - J: IntoIterator, - A: AsRef, - B: AsRef, + where + I: IntoIterator>, + J: IntoIterator, + A: AsRef, + B: AsRef, { let mut ands = None; @@ -88,7 +86,7 @@ impl FilterCondition { None => Some(rule), }; } - }, + } Either::Right(rule) => { let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; ands = match ands.take() { @@ -106,11 +104,11 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, expression: &str, - ) -> Result - { + ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields_ids(rtxn)?; - let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; + let lexed = + FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } @@ -118,8 +116,7 @@ impl FilterCondition { fim: &FieldsIdsMap, ff: &HashSet, expression: Pairs, - ) -> Result - { + ) -> Result { PREC_CLIMBER.climb( expression, |pair: Pair| match pair.as_rule() { @@ -135,12 +132,10 @@ impl FilterCondition { Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), _ => unreachable!(), }, - |lhs: Result, op: Pair, rhs: Result| { - match op.as_rule() { - Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), - Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), - _ => unreachable!(), - } + |lhs: Result, op: Pair, rhs: Result| match op.as_rule() { + Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), + Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), + _ => unreachable!(), }, ) } @@ -160,8 +155,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -179,8 +173,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -196,8 +189,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -213,8 +205,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -230,8 +221,7 @@ 
impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -247,8 +237,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -272,13 +261,14 @@ impl FilterCondition { left: Bound, right: Bound, output: &mut RoaringBitmap, - ) -> Result<()> - { + ) -> Result<()> { match (left, right) { // If the request is an exact value we must go directly to the deepest level. (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_number_levels(rtxn, db, field_id, 0, left, right, output); - }, + return Self::explore_facet_number_levels( + rtxn, db, field_id, 0, left, right, output, + ); + } // lower TO upper when lower > upper must return no result (Included(l), Included(r)) if l > r => return Ok(()), (Included(l), Excluded(r)) if l >= r => return Ok(()), @@ -301,7 +291,9 @@ impl FilterCondition { debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); output.union_with(&docids); // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { left_found = Some(l); } + if i == 0 { + left_found = Some(l); + } right_found = Some(r); } @@ -318,20 +310,50 @@ impl FilterCondition { // If the bound is satisfied we avoid calling this function again. if !matches!(left, Included(l) if l == left_found) { let sub_right = Excluded(left_found); - debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); - Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, sub_right, output)?; + debug!( + "calling left with {:?} to {:?} (level {})", + left, sub_right, deeper_level + ); + Self::explore_facet_number_levels( + rtxn, + db, + field_id, + deeper_level, + left, + sub_right, + output, + )?; } if !matches!(right, Included(r) if r == right_found) { let sub_left = Excluded(right_found); - debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); - Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + debug!( + "calling right with {:?} to {:?} (level {})", + sub_left, right, deeper_level + ); + Self::explore_facet_number_levels( + rtxn, + db, + field_id, + deeper_level, + sub_left, + right, + output, + )?; } - }, + } None => { // If we found nothing at this level it means that we must find // the same bounds but at a deeper, more precise level. - Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, right, output)?; - }, + Self::explore_facet_number_levels( + rtxn, + db, + field_id, + deeper_level, + left, + right, + output, + )?; + } } Ok(()) @@ -344,27 +366,34 @@ impl FilterCondition { strings_db: heed::Database, field_id: FieldId, operator: &Operator, - ) -> Result - { + ) -> Result { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. 
let (left, right) = match operator { - GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), + GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), - Equal(number, string) => { + Equal(number, string) => { let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { Some(n) => { let n = Included(*n); let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + 0, + n, + n, + &mut output, + )?; output - }, + } None => RoaringBitmap::new(), }; return Ok(string_docids | number_docids); - }, + } NotEqual(number, string) => { let all_numbers_ids = if number.is_some() { index.number_faceted_documents_ids(rtxn, field_id)? @@ -373,12 +402,14 @@ impl FilterCondition { }; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; let operator = Equal(*number, string.clone()); - let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?; + let docids = Self::evaluate_operator( + rtxn, index, numbers_db, strings_db, field_id, &operator, + )?; return Ok((all_numbers_ids | all_strings_ids) - docids); - }, - LowerThan(val) => (Included(f64::MIN), Excluded(*val)), + } + LowerThan(val) => (Included(f64::MIN), Excluded(*val)), LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), - Between(left, right) => (Included(*left), Included(*right)), + Between(left, right) => (Included(*left), Included(*right)), }; // Ask for the biggest value that can exist for this specific field, if it exists @@ -391,36 +422,39 @@ impl FilterCondition { match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + level, + left, + right, + &mut output, + )?; Ok(output) - }, + } None => Ok(RoaringBitmap::new()), } } - pub fn evaluate( - &self, - rtxn: &heed::RoTxn, - index: &Index, - ) -> Result - { + pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { let numbers_db = index.facet_id_f64_docids; let strings_db = index.facet_id_string_docids; match self { Operator(fid, op) => { Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op) - }, + } Or(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; let rhs = rhs.evaluate(rtxn, index)?; Ok(lhs | rhs) - }, + } And(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; let rhs = rhs.evaluate(rtxn, index)?; Ok(lhs & rhs) - }, + } } } } @@ -434,23 +468,24 @@ fn field_id( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, items: &mut Pairs, -) -> StdResult> -{ +) -> StdResult> { // lexing ensures that we at least have a key let key = items.next().unwrap(); let field_id = match fields_ids_map.id(key.as_str()) { Some(field_id) => field_id, - None => return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` not found, available attributes are: {}", - key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", "), - ), - }, - key.as_span(), - )), + None => { + return Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` not found, available attributes are: {}", + key.as_str(), + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", 
"), + ), + }, + key.as_span(), + )) + } }; if !filterable_fields.contains(&field_id) { @@ -459,9 +494,11 @@ fn field_id( message: format!( "attribute `{}` is not filterable, available filterable attributes are: {}", key.as_str(), - filterable_fields.iter().flat_map(|id| { - fields_ids_map.name(*id) - }).collect::>().join(", "), + filterable_fields + .iter() + .flat_map(|id| { fields_ids_map.name(*id) }) + .collect::>() + .join(", "), ), }, key.as_span(), @@ -476,8 +513,9 @@ fn field_id( /// /// Returns the parsing error associated with the span if the conversion fails. fn pest_parse(pair: Pair) -> (StdResult>, String) -where T: FromStr, - T::Err: ToString, +where + T: FromStr, + T::Err: ToString, { let result = match pair.as_str().parse::() { Ok(value) => Ok(value), @@ -492,11 +530,12 @@ where T: FromStr, #[cfg(test)] mod tests { - use super::*; - use crate::update::Settings; + use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; - use big_s::S; + + use super::*; + use crate::update::Settings; #[test] fn string() { @@ -508,7 +547,7 @@ mod tests { // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset!{ S("channel") }); + builder.set_filterable_fields(hashset! { S("channel") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -537,7 +576,7 @@ mod tests { // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset!{ "timestamp".into() }); + builder.set_filterable_fields(hashset! { "timestamp".into() }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -548,10 +587,8 @@ mod tests { assert_eq!(condition, expected); let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = Or( - Box::new(Operator(0, LowerThan(22.0))), - Box::new(Operator(0, GreaterThan(44.0))), - ); + let expected = + Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0)))); assert_eq!(condition, expected); } @@ -566,29 +603,33 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") }); + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. 
let rtxn = index.read_txn().unwrap(); let condition = FilterCondition::from_str( - &rtxn, &index, + &rtxn, + &index, "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ).unwrap(); + ) + .unwrap(); let expected = Or( Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(And( Box::new(Operator(1, Between(22.0, 44.0))), Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), - )) + )), ); assert_eq!(condition, expected); let condition = FilterCondition::from_str( - &rtxn, &index, + &rtxn, + &index, "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ).unwrap(); + ) + .unwrap(); let expected = Or( Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(Or( @@ -613,20 +654,28 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") }); + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); let condition = FilterCondition::from_array( - &rtxn, &index, - vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])], - ).unwrap().unwrap(); + &rtxn, + &index, + vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ], + ) + .unwrap() + .unwrap(); let expected = FilterCondition::from_str( - &rtxn, &index, + &rtxn, + &index, "channel = gotaga AND (timestamp = 44 OR channel != ponce)", - ).unwrap(); + ) + .unwrap(); assert_eq!(condition, expected); } } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index a1a03dba3..240d99ccc 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,20 +1,19 @@ -use std::ops::Bound::{self, Included, Excluded, Unbounded}; +use std::ops::Bound::{self, Excluded, Included, Unbounded}; use either::Either::{self, Left, Right}; -use heed::types::{DecodeIgnore, ByteSlice}; -use heed::{Database, RoRange, RoRevRange, LazyDecode}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{Database, LazyDecode, RoRange, RoRevRange}; use roaring::RoaringBitmap; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::{Index, FieldId}; - pub use self::facet_distribution::FacetDistribution; pub use self::filter_condition::{FilterCondition, Operator}; pub(crate) use self::parser::Rule as ParserRule; +use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{FieldId, Index}; -mod filter_condition; mod facet_distribution; +mod filter_condition; mod parser; pub struct FacetRange<'t> { @@ -30,8 +29,7 @@ impl<'t> FacetRange<'t> { level: u8, left: Bound, right: Bound, - ) -> heed::Result> - { + ) -> heed::Result> { let left_bound = match left { Included(left) => Included((field_id, level, left, f64::MIN)), Excluded(left) => Excluded((field_id, level, left, f64::MIN)), @@ -62,7 +60,7 @@ impl<'t> Iterator for FacetRange<'t> { } else { None } - }, + } Some(Err(e)) => Some(Err(e)), None => None, } @@ -82,8 +80,7 @@ impl<'t> FacetRevRange<'t> { level: u8, left: Bound, right: Bound, - ) -> heed::Result> - { + ) -> heed::Result> { let left_bound = match left { Included(left) => 
Included((field_id, level, left, f64::MIN)), Excluded(left) => Excluded((field_id, level, left, f64::MIN)), @@ -114,7 +111,7 @@ impl<'t> Iterator for FacetRevRange<'t> { } } continue; - }, + } Some(Err(e)) => return Some(Err(e)), None => return None, } @@ -139,11 +136,11 @@ impl<'t> FacetIter<'t> { index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> - { + ) -> heed::Result> { let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let highest_iter = + FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) } @@ -156,11 +153,11 @@ impl<'t> FacetIter<'t> { index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> - { + ) -> heed::Result> { let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let highest_iter = + FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Right(highest_iter))]; Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) } @@ -174,11 +171,11 @@ impl<'t> FacetIter<'t> { index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> - { + ) -> heed::Result> { let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let highest_iter = + FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) } @@ -187,12 +184,13 @@ impl<'t> FacetIter<'t> { rtxn: &'t heed::RoTxn, db: Database, fid: FieldId, - ) -> heed::Result> - { - let level = db.remap_types::() + ) -> heed::Result> { + let level = db + .remap_types::() .prefix_iter(rtxn, &[fid][..])? .remap_key_type::() - .last().transpose()? + .last() + .transpose()? 
.map(|((_, level, _, _), _)| level); Ok(level) } @@ -215,7 +213,6 @@ impl<'t> Iterator for FacetIter<'t> { match result { Ok(((_fid, level, left, right), mut docids)) => { - docids.intersect_with(&documents_ids); if !docids.is_empty() { if self.must_reduce { @@ -242,11 +239,11 @@ impl<'t> Iterator for FacetIter<'t> { Ok(iter) => { self.level_iters.push((docids, iter)); continue 'outer; - }, + } Err(e) => return Some(Err(e)), } } - }, + } Err(e) => return Some(Err(e)), } } diff --git a/milli/src/search/facet/parser.rs b/milli/src/search/facet/parser.rs index 0e8bd23ac..1bff27cfb 100644 --- a/milli/src/search/facet/parser.rs +++ b/milli/src/search/facet/parser.rs @@ -1,5 +1,5 @@ use once_cell::sync::Lazy; -use pest::prec_climber::{Operator, Assoc, PrecClimber}; +use pest::prec_climber::{Assoc, Operator, PrecClimber}; pub static PREC_CLIMBER: Lazy> = Lazy::new(|| { use Assoc::*; diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index c56db4e96..cd8e404b8 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -1,13 +1,11 @@ -use std::collections::HashSet; use std::cmp::{min, Reverse}; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashSet}; use std::ops::{Index, IndexMut}; -use levenshtein_automata::{DFA, Distance}; - -use crate::search::query_tree::{Operation, Query}; +use levenshtein_automata::{Distance, DFA}; use super::build_dfa; +use crate::search::query_tree::{Operation, Query}; type IsPrefix = bool; @@ -28,7 +26,9 @@ impl MatchingWords { .collect(); // Sort word by len in DESC order prioritizing the longuest word, // in order to highlight the longuest part of the matched word. - dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len())); + dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| { + Reverse(query_word.len()) + }); Self { dfas } } @@ -37,12 +37,13 @@ impl MatchingWords { self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { Distance::Exact(t) if t <= *typo => { if *is_prefix { - let (_dist, len) = prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); + let (_dist, len) = + prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); Some(len) } else { Some(word.len()) } - }, + } _otherwise => None, }) } @@ -54,11 +55,11 @@ fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { match tree { Operation::Or(_, ops) | Operation::And(ops) => { ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); - }, + } Operation::Query(Query { prefix, kind }) => { let typo = if kind.is_exact() { 0 } else { kind.typo() }; out.insert((kind.word(), typo, *prefix)); - }, + } Operation::Phrase(words) => { for word in words { out.insert((word, 0, false)); @@ -80,10 +81,7 @@ struct N2Array { impl N2Array { fn new(x: usize, y: usize, value: T) -> N2Array { - N2Array { - y_size: y, - buf: vec![value; x * y], - } + N2Array { y_size: y, buf: vec![value; x * y] } } } @@ -178,9 +176,8 @@ fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) { #[cfg(test)] mod tests { use super::*; - - use crate::MatchingWords; use crate::search::query_tree::{Operation, Query, QueryKind}; + use crate::MatchingWords; #[test] fn matched_length() { @@ -194,13 +191,23 @@ mod tests { #[test] fn matching_words() { - let query_tree = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: true, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { 
prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) }), - ]), - ]); + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: true, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); let matching_words = MatchingWords::from_query_tree(&query_tree); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 3c85796bc..f692df173 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -6,6 +6,7 @@ use std::result::Result as StdResult; use std::str::Utf8Error; use std::time::Instant; +use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use log::debug; @@ -13,16 +14,13 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; +pub(crate) use self::facet::ParserRule; +pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator}; +pub use self::matching_words::MatchingWords; +use self::query_tree::QueryTreeBuilder; use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::r#final::{Final, FinalResult}; -use crate::{Index, DocumentId, Result}; - -pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator}; -pub use self::matching_words::MatchingWords; -pub(crate) use self::facet::ParserRule; -use self::query_tree::QueryTreeBuilder; - -use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; +use crate::{DocumentId, Index, Result}; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -32,8 +30,8 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); mod criteria; mod distinct; mod facet; -mod query_tree; mod matching_words; +mod query_tree; pub struct Search<'a> { query: Option, @@ -117,7 +115,7 @@ impl<'a> Search<'a> { let result = analyzer.analyze(query); let tokens = result.tokens(); builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) - }, + } None => (None, None), }; @@ -144,10 +142,11 @@ impl<'a> Search<'a> { None => self.perform_sort(NoopDistinct, matching_words, criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; - let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "distinct attribute", - })?; + let id = + field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "distinct attribute", + })?; let distinct = FacetDistinct::new(id, self.index, self.rtxn); self.perform_sort(distinct, matching_words, criteria) } @@ -159,14 +158,15 @@ impl<'a> Search<'a> { mut distinct: D, matching_words: MatchingWords, mut criteria: Final, - ) -> Result - { + ) -> Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_candidates = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_candidates)? { + while let Some(FinalResult { candidates, bucket_candidates, .. 
}) = + criteria.next(&excluded_candidates)? + { debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_candidates); @@ -183,7 +183,9 @@ impl<'a> Search<'a> { for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { documents_ids.push(candidate?); } - if documents_ids.len() == self.limit { break } + if documents_ids.len() == self.limit { + break; + } excluded_candidates = candidates.into_excluded(); } @@ -247,7 +249,7 @@ pub fn word_derivations<'c>( } Ok(entry.insert(derived_words)) - }, + } } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index c371b07d4..8fa24b9d3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,4 +1,4 @@ -use std::{fmt, cmp, mem}; +use std::{cmp, fmt, mem}; use fst::Set; use meilisearch_tokenizer::token::SeparatorKind; @@ -28,18 +28,18 @@ impl fmt::Debug for Operation { Operation::And(children) => { writeln!(f, "{:1$}AND", "", depth * 2)?; children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - }, + } Operation::Phrase(children) => { writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2) - }, + } Operation::Or(true, children) => { writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - }, + } Operation::Or(false, children) => { writeln!(f, "{:1$}OR", "", depth * 2)?; children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - }, + } Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2), } } @@ -136,10 +136,12 @@ impl fmt::Debug for Query { match kind { QueryKind::Exact { word, .. } => { f.debug_struct(&(prefix + "Exact")).field("word", &word).finish() - }, - QueryKind::Tolerant { typo, word } => { - f.debug_struct(&(prefix + "Tolerant")).field("word", &word).field("max typo", &typo).finish() - }, + } + QueryKind::Tolerant { typo, word } => f + .debug_struct(&(prefix + "Tolerant")) + .field("word", &word) + .field("max typo", &typo) + .finish(), } } } @@ -223,7 +225,12 @@ impl<'a> QueryTreeBuilder<'a> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { - let qt = create_query_tree(self, self.optional_words, self.authorize_typos, &primitive_query)?; + let qt = create_query_tree( + self, + self.optional_words, + self.authorize_typos, + &primitive_query, + )?; Ok(Some((qt, primitive_query))) } else { Ok(None) @@ -248,12 +255,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result QueryKind { match word.len() { 0..=4 => QueryKind::exact(word), 5..=8 => QueryKind::tolerant(1, word), - _ => QueryKind::tolerant(2, word), + _ => QueryKind::tolerant(2, word), } } else { QueryKind::exact(word) @@ -276,12 +278,18 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result Result -{ +) -> Result { /// Matches on the `PrimitiveQueryPart` and create an operation from it. fn resolve_primitive_part( ctx: &impl Context, authorize_typos: bool, part: PrimitiveQueryPart, - ) -> Result - { + ) -> Result { match part { // 1. try to split word in 2 // 2. try to fetch synonyms @@ -310,13 +316,12 @@ fn create_query_tree( if let Some(child) = split_best_frequency(ctx, &word)? 
{ children.push(child); } - children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); + children + .push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); Ok(Operation::or(false, children)) - }, + } // create a CONSECUTIVE operation wrapping all word in the phrase - PrimitiveQueryPart::Phrase(words) => { - Ok(Operation::phrase(words)) - }, + PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)), } } @@ -325,8 +330,7 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - ) -> Result - { + ) -> Result { const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); @@ -341,21 +345,26 @@ fn create_query_tree( match group { [part] => { - let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; + let operation = + resolve_primitive_part(ctx, authorize_typos, part.clone())?; and_op_children.push(operation); - }, + } words => { let is_prefix = words.last().map_or(false, |part| part.is_prefix()); - let words: Vec<_> = words.iter().filter_map(|part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }).collect(); + let words: Vec<_> = words + .iter() + .filter_map(|part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }) + .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); - let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; + let query = + Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; operations.push(Operation::Query(query)); and_op_children.push(Operation::or(false, operations)); } @@ -379,26 +388,27 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: PrimitiveQuery, - ) -> Result - { + ) -> Result { let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); let mut operation_children = Vec::new(); let start = number_phrases + (number_phrases == 0) as usize; for len in start..=query.len() { let mut word_count = len - number_phrases; - let query: Vec<_> = query.iter().filter(|p| { - if p.is_phrase() { - true - } else if word_count != 0 { - word_count -= 1; - true - } else { - false - } - }) - .cloned() - .collect(); + let query: Vec<_> = query + .iter() + .filter(|p| { + if p.is_phrase() { + true + } else if word_count != 0 { + word_count -= 1; + true + } else { + false + } + }) + .cloned() + .collect(); let ngrams = ngrams(ctx, authorize_typos, &query)?; operation_children.push(ngrams); @@ -434,7 +444,11 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. -fn create_primitive_query(query: TokenStream, stop_words: Option>, words_limit: Option) -> PrimitiveQuery { +fn create_primitive_query( + query: TokenStream, + stop_words: Option>, + words_limit: Option, +) -> PrimitiveQuery { let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); let mut quoted = false; @@ -444,23 +458,29 @@ fn create_primitive_query(query: TokenStream, stop_words: Option>, wo let mut peekable = query.peekable(); while let Some(token) = peekable.next() { // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { return primitive_query } + if primitive_query.len() >= parts_limit { + return primitive_query; + } match token.kind { - TokenKind::Word | TokenKind::StopWord => { + TokenKind::Word | TokenKind::StopWord => { // 1. 
if the word is quoted we push it in a phrase-buffer waiting for the ending quote, // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { phrase.push(token.word.to_string()); } else if peekable.peek().is_some() { - if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.word.as_ref())) { - primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); - } + if !stop_words + .as_ref() + .map_or(false, |swords| swords.contains(token.word.as_ref())) + { + primitive_query + .push(PrimitiveQueryPart::Word(token.word.to_string(), false)); + } } else { primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); } - }, + } TokenKind::Separator(separator_kind) => { let quote_count = token.word.chars().filter(|&s| s == '"').count(); // swap quoted state if we encounter a double quote @@ -468,10 +488,11 @@ fn create_primitive_query(query: TokenStream, stop_words: Option>, wo quoted = !quoted; } // if there is a quote or a hard separator we close the phrase. - if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) { + if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) + { primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); } - }, + } _ => (), } } @@ -486,7 +507,7 @@ fn create_primitive_query(query: TokenStream, stop_words: Option>, wo /// Returns the maximum number of typos that this Operation allows. pub fn maximum_typo(operation: &Operation) -> usize { - use Operation::{Or, And, Query, Phrase}; + use Operation::{And, Or, Phrase, Query}; match operation { Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), And(ops) => ops.iter().map(maximum_typo).sum::(), @@ -498,13 +519,12 @@ pub fn maximum_typo(operation: &Operation) -> usize { /// Returns the maximum proximity that this Operation allows. pub fn maximum_proximity(operation: &Operation) -> usize { - use Operation::{Or, And, Query, Phrase}; + use Operation::{And, Or, Phrase, Query}; match operation { Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), And(ops) => { - ops.iter().map(maximum_proximity).sum::() - + ops.len().saturating_sub(1) * 7 - }, + ops.iter().map(maximum_proximity).sum::() + ops.len().saturating_sub(1) * 7 + } Query(_) | Phrase(_) => 0, } } @@ -515,7 +535,8 @@ mod test { use maplit::hashmap; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; - use rand::{Rng, SeedableRng, rngs::StdRng}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use super::*; @@ -532,11 +553,11 @@ mod test { authorize_typos: bool, words_limit: Option, query: TokenStream, - ) -> Result> - { + ) -> Result> { let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { - let qt = create_query_tree(self, optional_words, authorize_typos, &primitive_query)?; + let qt = + create_query_tree(self, optional_words, authorize_typos, &primitive_query)?; Ok(Some((qt, primitive_query))) } else { Ok(None) @@ -571,7 +592,7 @@ mod test { } TestContext { - synonyms: hashmap!{ + synonyms: hashmap! { vec![String::from("hello")] => vec![ vec![String::from("hi")], vec![String::from("good"), String::from("morning")], @@ -594,7 +615,7 @@ mod test { vec![String::from("new"), String::from("york")], ], }, - postings: hashmap!{ + postings: hashmap! 
{ String::from("hello") => random_postings(rng, 1500), String::from("hi") => random_postings(rng, 4000), String::from("word") => random_postings(rng, 2500), @@ -620,15 +641,28 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "friends".to_string()) }), - ]), - Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), - ]); + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "friends".to_string()), + }), + ]), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(2, "heyfriends".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -640,15 +674,28 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friends".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), - ]); + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "friends".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "heyfriends".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -660,26 +707,60 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hi".to_string()) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("morning".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "hello".to_string()) }), + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hi".to_string()), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("good".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("morning".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "hello".to_string()), + }), + ], + ), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: 
false, + kind: QueryKind::exact("earth".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("nature".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ], + ), ]), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("earth".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("nature".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), - ]), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }), - ]); + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "helloworld".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -691,40 +772,95 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "yorkcity".to_string()) }), - ]), - ]), - Operation::And(vec![ - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "newyork".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), - ]), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }), + let expected = Operation::Or( + false, + vec![ Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("new".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("york".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("city".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "yorkcity".to_string()), + }), + ], + ), ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "newyorkcity".to_string()) }), - ]), - ]); + Operation::And(vec![ + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("nyc".to_string()), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("new".to_string()), + }), + Operation::Query(Query { + prefix: false, + 
kind: QueryKind::exact("york".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("city".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "newyork".to_string()), + }), + ], + ), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("city".to_string()), + }), + ]), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("nyc".to_string()), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("new".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("york".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "newyorkcity".to_string()), + }), + ], + ), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -736,15 +872,28 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("n".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "grams".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }), - ]); + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("n".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "grams".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "ngrams".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -756,21 +905,34 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Or(false, vec![ - Operation::Phrase(vec![ - "word".to_string(), - "split".to_string(), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }), + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Or( + false, + vec![ + Operation::Phrase(vec!["word".to_string(), "split".to_string()]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "wordsplit".to_string()), + }), + ], + ), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("fish".to_string()), + }), ]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("fish".to_string()) }) - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }), - ]); + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "wordsplitfish".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -783,14 +945,12 @@ mod test { let tokens = result.tokens(); 
let expected = Operation::And(vec![ - Operation::Phrase(vec![ - "hey".to_string(), - "friends".to_string(), - ]), + Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), ]); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -803,17 +963,12 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Phrase(vec![ - "hey".to_string(), - "friends".to_string(), - ]), - Operation::Phrase(vec![ - "wooop".to_string(), - "wooop".to_string(), - ]), + Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), + Operation::Phrase(vec!["wooop".to_string(), "wooop".to_string()]), ]); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -825,34 +980,80 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(true, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), - ]), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Or(false, vec![ + let expected = Operation::Or( + true, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Or( + false, + vec![ Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("my".to_string()), + }), ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "myfriend".to_string()) }) - ]) - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }), - ]), - ]); - let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "heymy".to_string()), + }), + ], + ), + Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("my".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "friend".to_string()), + }), + ]), + Operation::Query(Query { + 
prefix: false, + kind: QueryKind::tolerant(1, "myfriend".to_string()), + }), + ], + ), + ]), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "heymy".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "friend".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "heymyfriend".to_string()), + }), + ], + ), + ], + ); + let (query_tree, _) = + TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -864,11 +1065,9 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Phrase(vec![ - "hey".to_string(), - "my".to_string(), - ]); - let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]); + let (query_tree, _) = + TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -880,29 +1079,66 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(true, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "mygood".to_string()) }), + let expected = Operation::Or( + true, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("friend".to_string()), + }), ]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), - ]), - ]); - let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("my".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("friend".to_string()), + }), + ]), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("my".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("good".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "mygood".to_string()), + }), + ], + ), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("friend".to_string()), + }), 
+ ]), + ], + ); + let (query_tree, _) = + TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -914,14 +1150,27 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }), - ]); - let (query_tree, _) = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("friends".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("heyfriends".to_string()), + }), + ], + ); + let (query_tree, _) = + TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -934,14 +1183,12 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Phrase(vec![ - "hey".to_string(), - "my".to_string(), - ]), + Operation::Phrase(vec!["hey".to_string(), "my".to_string()]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), ]); - let (query_tree, _) = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 34ff743f0..9e3fce75d 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -1,6 +1,7 @@ use std::iter::{Chain, FromIterator}; use std::ops::RangeInclusive; -use roaring::bitmap::{RoaringBitmap, IntoIter}; + +use roaring::bitmap::{IntoIter, RoaringBitmap}; pub struct AvailableDocumentsIds { iter: Chain>, @@ -18,16 +19,12 @@ impl AvailableDocumentsIds { None => 1..=0, // empty range iterator }; - AvailableDocumentsIds { - iter: available.into_iter().chain(iter), - } - }, + AvailableDocumentsIds { iter: available.into_iter().chain(iter) } + } None => { let empty = RoaringBitmap::new().into_iter(); - AvailableDocumentsIds { - iter: empty.chain(0..=u32::max_value()), - } - }, + AvailableDocumentsIds { iter: empty.chain(0..=u32::max_value()) } + } } } } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6e26bf027..42dd55443 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,7 @@ use chrono::Utc; use roaring::RoaringBitmap; -use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result}; +use crate::{ExternalDocumentsIds, FieldsDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -13,9 +13,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - update_id: u64 + update_id: u64, ) -> ClearDocuments<'t, 'u, 'i> { - ClearDocuments { wtxn, index, _update_id: update_id } } @@ -80,8 +79,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { mod tests { use 
heed::EnvOpenOptions; - use crate::update::{IndexDocuments, UpdateFormat}; use super::*; + use crate::update::{IndexDocuments, UpdateFormat}; #[test] fn clear_documents() { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 7fc7e5d77..dfb48dc58 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,5 +1,5 @@ -use std::collections::HashMap; use std::collections::hash_map::Entry; +use std::collections::HashMap; use chrono::Utc; use fst::IntoStreamer; @@ -7,11 +7,11 @@ use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; use serde_json::Value; -use crate::error::{InternalError, FieldIdMapMissingEntry, UserError}; +use super::ClearDocuments; +use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; -use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result}; -use super::ClearDocuments; +use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -26,11 +26,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> Result> - { - let external_documents_ids = index - .external_documents_ids(wtxn)? - .into_static(); + ) -> Result> { + let external_documents_ids = index.external_documents_ids(wtxn)?.into_static(); Ok(DeleteDocuments { wtxn, @@ -84,12 +81,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { key: Some(main_key::PRIMARY_KEY_KEY), } })?; - let id_field = fields_ids_map.id(primary_key).ok_or_else(|| { - FieldIdMapMissingEntry::FieldName { + let id_field = + fields_ids_map.id(primary_key).ok_or_else(|| FieldIdMapMissingEntry::FieldName { field_name: primary_key.to_string(), process: "DeleteDocuments::execute", - } - })?; + })?; let Index { env: _env, @@ -130,7 +126,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let external_id = match serde_json::from_slice(content).unwrap() { Value::String(string) => SmallString32::from(string.as_str()), Value::Number(number) => SmallString32::from(number.to_string()), - document_id => return Err(UserError::InvalidDocumentId { document_id }.into()), + document_id => { + return Err(UserError::InvalidDocumentId { document_id }.into()) + } }; external_ids.push(external_id); } @@ -160,7 +158,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { match entry.get().checked_sub(count_diff) { Some(0) | None => entry.remove(), - Some(count) => entry.insert(count) + Some(count) => entry.insert(count), }; } } @@ -206,9 +204,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We construct an FST set that contains the words to delete from the words FST. 
- let words_to_delete = words.iter().filter_map(|(word, must_remove)| { - if *must_remove { Some(word.as_ref()) } else { None } - }); + let words_to_delete = + words.iter().filter_map( + |(word, must_remove)| { + if *must_remove { + Some(word.as_ref()) + } else { + None + } + }, + ); let words_to_delete = fst::Set::from_iter(words_to_delete)?; let new_words_fst = { @@ -285,7 +290,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We delete the documents ids that are under the pairs of words, // it is faster and use no memory to iterate over all the words pairs than // to compute the cartesian product of every words of the deleted documents. - let mut iter = word_pair_proximity_docids.remap_key_type::().iter_mut(self.wtxn)?; + let mut iter = + word_pair_proximity_docids.remap_key_type::().iter_mut(self.wtxn)?; while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); @@ -300,7 +306,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the word level position docids. - let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + let mut iter = + word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); @@ -315,7 +322,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the word prefix level position docids. - let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + let mut iter = + word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); @@ -397,12 +405,11 @@ fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( convert: F, ) -> heed::Result<()> where - C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>, + C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, F: Fn(K) -> DocumentId, { - let mut iter = db.remap_key_type::() - .prefix_iter_mut(wtxn, &[field_id])? 
- .remap_key_type::(); + let mut iter = + db.remap_key_type::().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::(); while let Some(result) = iter.next() { let (key, ()) = result?; @@ -441,8 +448,8 @@ where mod tests { use heed::EnvOpenOptions; - use crate::update::{IndexDocuments, UpdateFormat}; use super::*; + use crate::update::{IndexDocuments, UpdateFormat}; #[test] fn delete_documents_with_numbers_as_primary_key() { diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 757cbe810..09f962bbc 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -3,17 +3,18 @@ use std::fs::File; use std::num::NonZeroUsize; use chrono::Utc; -use grenad::{CompressionType, Reader, Writer, FileFuse}; +use grenad::{CompressionType, FileFuse, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use crate::error::InternalError; -use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::update::index_documents::{ + create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, +}; use crate::{Index, Result}; pub struct Facets<'t, 'u, 'i> { @@ -32,8 +33,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> Facets<'t, 'u, 'i> - { + ) -> Facets<'t, 'u, 'i> { Facets { wtxn, index, @@ -72,11 +72,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; // Clear the facet number levels. - clear_field_number_levels( - self.wtxn, - self.index.facet_id_f64_docids, - field_id, - )?; + clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; // Compute and store the faceted numbers documents ids. let number_documents_ids = compute_faceted_documents_ids( @@ -96,8 +92,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?; - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?; + self.index.put_string_faceted_documents_ids( + self.wtxn, + field_id, + &string_documents_ids, + )?; + self.index.put_number_faceted_documents_ids( + self.wtxn, + field_id, + &number_documents_ids, + )?; write_into_lmdb_database( self.wtxn, @@ -112,12 +116,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { } } -fn clear_field_number_levels<'t, >( +fn clear_field_number_levels<'t>( wtxn: &'t mut heed::RwTxn, db: heed::Database, field_id: u8, -) -> heed::Result<()> -{ +) -> heed::Result<()> { let left = (field_id, 1, f64::MIN, f64::MIN); let right = (field_id, u8::MAX, f64::MAX, f64::MAX); let range = left..=right; @@ -133,8 +136,7 @@ fn compute_facet_number_levels<'t>( level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, field_id: u8, -) -> Result> -{ +) -> Result> { let first_level_size = db .remap_key_type::() .prefix_iter(rtxn, &[field_id])? @@ -143,9 +145,8 @@ fn compute_facet_number_levels<'t>( // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transfering them. 
- let mut writer = tempfile::tempfile().and_then(|file| { - create_writer(compression_type, compression_level, file) - })?; + let mut writer = tempfile::tempfile() + .and_then(|file| create_writer(compression_type, compression_level, file))?; let level_0_range = { let left = (field_id, 0, f64::MIN, f64::MIN); @@ -196,8 +197,7 @@ fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: u8, -) -> Result -{ +) -> Result { let mut documents_ids = RoaringBitmap::new(); for result in db.prefix_iter(rtxn, &[field_id])? { @@ -215,8 +215,7 @@ fn write_number_entry( left: f64, right: f64, ids: &RoaringBitmap, -) -> Result<()> -{ +) -> Result<()> { let key = (field_id, level, left, right); let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 51c8b948a..05242f540 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; -use std::io::{self, Seek, SeekFrom, BufReader, BufRead}; +use std::io::{self, BufRead, BufReader, Seek, SeekFrom}; use std::num::{NonZeroU32, NonZeroUsize}; use std::result::Result as StdResult; use std::str; @@ -10,28 +10,26 @@ use std::time::Instant; use bstr::ByteSlice as _; use chrono::Utc; -use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; +use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer}; use heed::types::ByteSlice; -use log::{debug, info, error}; +use log::{debug, error, info}; use memmap::Mmap; use rayon::prelude::*; use rayon::ThreadPool; -use serde::{Serialize, Deserialize}; +use serde::{Deserialize, Serialize}; -use crate::error::{Error, InternalError}; -use crate::{Index, Result}; -use crate::update::{ - Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, - WordPrefixPairProximityDocids, -}; -use self::store::{Store, Readers}; pub use self::merge_function::{ - fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first + cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, }; +use self::store::{Readers, Store}; pub use self::transform::{Transform, TransformOutput}; - -use crate::MergeFn; use super::UpdateBuilder; +use crate::error::{Error, InternalError}; +use crate::update::{ + Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, + WordsLevelPositions, WordsPrefixesFst, +}; +use crate::{Index, MergeFn, Result}; mod merge_function; mod store; @@ -48,7 +46,11 @@ pub enum WriteMethod { GetMergePut, } -pub fn create_writer(typ: CompressionType, level: Option, file: File) -> io::Result> { +pub fn create_writer( + typ: CompressionType, + level: Option, + file: File, +) -> io::Result> { let mut builder = Writer::builder(); builder.compression_type(typ); if let Some(level) = level { @@ -64,8 +66,7 @@ pub fn create_sorter( chunk_fusing_shrink_size: Option, max_nb_chunks: Option, max_memory: Option, -) -> Sorter> -{ +) -> Sorter> { let mut builder = Sorter::builder(merge); if let Some(shrink_size) = chunk_fusing_shrink_size { builder.file_fusing_shrink_size(shrink_size); @@ -83,7 +84,10 @@ pub fn create_sorter( builder.build() } -pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> Result> { +pub fn writer_into_reader( + writer: Writer, + shrink_size: Option, +) 
-> Result> { let mut file = writer.into_inner()?; file.seek(SeekFrom::Start(0))?; let file = if let Some(shrink_size) = shrink_size { @@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> Res pub fn merge_readers( sources: Vec>, merge: MergeFn, -) -> Merger> -{ +) -> Merger> { let mut builder = Merger::builder(merge); builder.extend(sources); builder.build() @@ -118,13 +121,7 @@ where let before = Instant::now(); let merger = merge_readers(sources, merge); - merger_iter_into_lmdb_database( - wtxn, - database, - merger.into_merge_iter()?, - merge, - method, - )?; + merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?; debug!("MTBL stores merged in {:.02?}!", before.elapsed()); Ok(()) @@ -149,7 +146,7 @@ where while let Some((k, v)) = reader.next()? { out_iter.append(k, v)?; } - }, + } WriteMethod::GetMergePut => { while let Some((k, v)) = reader.next()? { let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; @@ -158,11 +155,11 @@ where let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; let val = merge(k, &vals)?; iter.put_current(k, &val)?; - }, + } _ => { drop(iter); database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - }, + } } } } @@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database( ) -> Result<()> where Error: From, - Error: From> + Error: From>, { debug!("Writing MTBL sorter..."); let before = Instant::now(); - merger_iter_into_lmdb_database( - wtxn, - database, - sorter.into_iter()?, - merge, - method, - )?; + merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?; debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) @@ -214,7 +205,7 @@ where while let Some((k, v)) = sorter.next()? { out_iter.append(k, v)?; } - }, + } WriteMethod::GetMergePut => { while let Some((k, v)) = sorter.next()? { let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; @@ -226,14 +217,14 @@ where InternalError::IndexingMergingKeys { process: "get-put-merge" } })?; iter.put_current(k, &val)?; - }, + } _ => { drop(iter); database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - }, + } } } - }, + } } Ok(()) @@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // Early return when there is no document to add if reader.buffer().is_empty() { - return Ok(DocumentAdditionResult { - nb_documents: 0, - }) + return Ok(DocumentAdditionResult { nb_documents: 0 }); } self.index.set_updated_at(self.wtxn, &Utc::now())?; @@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let output = match self.update_format { UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?, UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?, - UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?, + UpdateFormat::JsonStream => { + transform.output_from_json_stream(reader, &progress_callback)? + } }; let nb_documents = output.documents_count; @@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result<()> where - F: Fn(UpdateIndexingStep) + Sync + F: Fn(UpdateIndexingStep) + Sync, { let before_indexing = Instant::now(); @@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // settings if none have already been set. 
backup_pool = rayon::ThreadPoolBuilder::new().build()?; &backup_pool - }, + } }; let readers = pool.install(|| { @@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut documents_ids = self.index.documents_ids(self.wtxn)?; let contains_documents = !documents_ids.is_empty(); - let write_method = if contains_documents { - WriteMethod::GetMergePut - } else { - WriteMethod::Append - }; + let write_method = + if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append }; debug!("Writing using the write method: {:?}", write_method); @@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { *self.index.docid_word_positions.as_polymorph(), docid_word_positions_readers, keep_first, - write_method + write_method, )?; database_count += 1; @@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { *self.index.documents.as_polymorph(), documents_readers, keep_first, - write_method + write_method, )?; database_count += 1; @@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { fst_merge, WriteMethod::GetMergePut, )?; - }, + } DatabaseType::WordDocids => { debug!("Writing the words docids into LMDB on disk..."); let db = *self.index.word_docids.as_polymorph(); @@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { roaring_bitmap_merge, write_method, )?; - }, + } DatabaseType::FacetLevel0NumbersDocids => { debug!("Writing the facet numbers docids into LMDB on disk..."); let db = *self.index.facet_id_f64_docids.as_polymorph(); @@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { cbo_roaring_bitmap_merge, write_method, )?; - }, + } DatabaseType::FieldIdWordCountDocids => { debug!("Writing the field id word count docids into LMDB on disk..."); let db = *self.index.field_id_word_count_docids.as_polymorph(); @@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { cbo_roaring_bitmap_merge, write_method, )?; - }, + } DatabaseType::WordLevel0PositionDocids => { debug!("Writing the word level 0 positions docids into LMDB on disk..."); let db = *self.index.word_level_position_docids.as_polymorph(); @@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { #[cfg(test)] mod tests { - use super::*; use heed::EnvOpenOptions; + use super::*; + #[test] fn simple_document_replacement() { let path = tempfile::tempdir().unwrap(); @@ -1053,9 +1042,8 @@ mod tests { assert_eq!(count, 3); let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); - let (kevin_id, _) = docs.iter().find(|(_, d)| { - d.get(0).unwrap() == br#""updated kevin""# - }).unwrap(); + let (kevin_id, _) = + docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); let (id, doc) = docs[*kevin_id as usize]; assert_eq!(id, *kevin_id); diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 94ae12108..7318c5bd0 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -8,25 +8,29 @@ use std::{cmp, iter}; use bstr::ByteSlice as _; use fst::Set; -use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; +use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; use log::{debug, info}; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; +use meilisearch_tokenizer::token::SeparatorKind; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; use 
ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; use tempfile::tempfile; +use super::merge_function::{ + cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, +}; +use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; use crate::error::{Error, InternalError, SerializationError}; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, +}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result}; - -use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; -use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge}; +use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32}; const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_KILOBYTE: usize = 1024 * 1024; @@ -56,7 +60,8 @@ pub struct Store<'s, A> { word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, - words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, + words_pairs_proximities_docids: + LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, @@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { chunk_compression_level: Option, chunk_fusing_shrink_size: Option, stop_words: Option<&'s Set>, - ) -> Result - { + ) -> Result { // We divide the max memory by the number of sorter the Store have. let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); @@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Some(1024 * 1024 * 1024), // 1MB ); - let documents_writer = tempfile().and_then(|f| { - create_writer(chunk_compression_type, chunk_compression_level, f) - })?; - let docid_word_positions_writer = tempfile().and_then(|f| { - create_writer(chunk_compression_type, chunk_compression_level, f) - })?; + let documents_writer = tempfile() + .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; + let docid_word_positions_writer = tempfile() + .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; let mut config = AnalyzerConfig::default(); if let Some(stop_words) = stop_words { @@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.word_docids.get_refresh(word.as_bytes()) { - Some(old) => { old.insert(id); }, + Some(old) => { + old.insert(id); + } None => { let word_vec = SmallVec32::from(word.as_bytes()); // A newly inserted element is append at the end of the linked hash map. 
@@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { field_id: FieldId, value: OrderedFloat, id: DocumentId, - ) -> Result<()> - { + ) -> Result<()> { let sorter = &mut self.field_id_docid_facet_numbers_sorter; Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; let key = (field_id, value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_number_docids.get_refresh(&key) { - Some(old) => { old.insert(id); }, + Some(old) => { + old.insert(id); + } None => { // A newly inserted element is append at the end of the linked hash map. self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); @@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { field_id: FieldId, value: String, id: DocumentId, - ) -> Result<()> - { + ) -> Result<()> { let sorter = &mut self.field_id_docid_facet_strings_sorter; Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; let key = (field_id, value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_string_docids.get_refresh(&key) { - Some(old) => { old.insert(id); }, + Some(old) => { + old.insert(id); + } None => { // A newly inserted element is append at the end of the linked hash map. self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); @@ -309,10 +315,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // Save the documents ids under the words pairs proximities that it contains. fn insert_words_pairs_proximities_docids<'a>( &mut self, - words_pairs_proximities: impl IntoIterator, + words_pairs_proximities: impl IntoIterator, id: DocumentId, - ) -> Result<()> - { + ) -> Result<()> { for ((w1, w2), prox) in words_pairs_proximities { let w1 = SmallVec32::from(w1.as_bytes()); let w2 = SmallVec32::from(w2.as_bytes()); @@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // if get_refresh finds the element it is assured // to be at the end of the linked hash map. match self.words_pairs_proximities_docids.get_refresh(&key) { - Some(old) => { old.insert(id); }, + Some(old) => { + old.insert(id); + } None => { // A newly inserted element is append at the end of the linked hash map. let ids = RoaringBitmap::from_iter(Some(id)); @@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // Removing front elements is equivalent to removing the LRUs. let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front()); iter.take(overflow).for_each(|x| lrus.push(x)); - Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?; + Self::write_words_pairs_proximities( + &mut self.words_pairs_proximities_docids_sorter, + lrus, + )?; } Ok(()) @@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { facet_numbers_values: &mut HashMap>, facet_strings_values: &mut HashMap>, record: &[u8], - ) -> Result<()> - { + ) -> Result<()> { // We compute the list of words pairs proximities (self-join) and write it directly to disk. 
let words_pair_proximities = compute_words_pair_proximities(&words_positions); self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; @@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } self.documents_writer.insert(document_id.to_be_bytes(), record)?; - Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; - Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; + Self::write_docid_word_positions( + &mut self.docid_word_positions_writer, + document_id, + words_positions, + )?; + Self::write_word_position_docids( + &mut self.word_level_position_docids_sorter, + document_id, + words_positions, + )?; words_positions.clear(); @@ -387,7 +404,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn write_words_pairs_proximities( sorter: &mut Sorter>, - iter: impl IntoIterator, SmallVec32, u8), RoaringBitmap)>, + iter: impl IntoIterator, SmallVec32, u8), RoaringBitmap)>, ) -> Result<()> where Error: From, @@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { writer: &mut Writer, id: DocumentId, words_positions: &HashMap>, - ) -> Result<()> - { + ) -> Result<()> { // We prefix the words by the document id. let mut key = id.to_be_bytes().to_vec(); let mut buffer = Vec::new(); @@ -484,12 +500,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_string_docids( - sorter: &mut Sorter>, - iter: I, - ) -> Result<()> + fn write_facet_field_string_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator, + I: IntoIterator, Error: From, { let mut key_buffer = Vec::new(); @@ -510,12 +523,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_number_docids( - sorter: &mut Sorter>, - iter: I, - ) -> Result<()> + fn write_facet_field_number_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator), RoaringBitmap)>, + I: IntoIterator), RoaringBitmap)>, Error: From, { let mut data_buffer = Vec::new(); @@ -579,7 +589,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn write_word_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator, RoaringBitmap)>, + I: IntoIterator, RoaringBitmap)>, Error: From, { let mut key = Vec::new(); @@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { log_every_n: Option, mut progress_callback: F, ) -> Result - where F: FnMut(UpdateIndexingStep), + where + F: FnMut(UpdateIndexingStep), { debug!("{:?}: Indexing in a Store...", thread_index); @@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { if count % num_threads == thread_index { // This is a log routine that we do every `log_every_n` documents. 
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { - info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); + info!( + "We have seen {} documents so far ({:.02?}).", + format_count(count), + before.elapsed() + ); progress_callback(UpdateIndexingStep::IndexDocuments { documents_seen: count, total_documents: documents_count, @@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } for (attr, content) in document.iter() { - if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { - let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; + if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) + { + let value = + serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; let (facet_numbers, facet_strings) = extract_facet_values(&value); - facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); - facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings); + facet_numbers_values + .entry(attr) + .or_insert_with(Vec::new) + .extend(facet_numbers); + facet_strings_values + .entry(attr) + .or_insert_with(Vec::new) + .extend(facet_strings); if self.searchable_fields.contains(&attr) { let content = match json_to_string(&value) { @@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { last_pos = Some(pos); let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position); + words_positions + .entry(token.text().to_string()) + .or_insert_with(SmallVec32::new) + .push(position); } if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { let key = (attr, last_pos as u8 + 1); - self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id); + self.field_id_word_count_docids + .entry(key) + .or_insert_with(RoaringBitmap::new) + .insert(document_id); } } } @@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { self.facet_field_string_docids, )?; - let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut word_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut builder = fst::SetBuilder::memory(); let mut iter = self.word_docids_sorter.into_iter()?; @@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.main_sorter.write_into(&mut main_wtr)?; - let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; + let mut words_pairs_proximities_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.words_pairs_proximities_docids_sorter + .write_into(&mut words_pairs_proximities_docids_wtr)?; - let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut word_level_position_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; - let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut field_id_word_count_docids_wtr = + 
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; - let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut facet_field_numbers_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; - let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut facet_field_strings_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; - let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?; + let mut field_id_docid_facet_numbers_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_numbers_sorter + .write_into(&mut field_id_docid_facet_numbers_wtr)?; - let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?; + let mut field_id_docid_facet_strings_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_strings_sorter + .write_into(&mut field_id_docid_facet_strings_wtr)?; let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; - let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; - let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; - let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; - let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; - let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; - let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; - let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; - let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; + let words_pairs_proximities_docids = + writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; + let word_level_position_docids = + writer_into_reader(word_level_position_docids_wtr, shrink_size)?; + let field_id_word_count_docids = + writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; + let facet_field_numbers_docids = + writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; + let facet_field_strings_docids = + writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; + let field_id_docid_facet_numbers = + writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; + let field_id_docid_facet_strings = + writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; + let docid_word_positions = + writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?; Ok(Readers { @@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { /// close to 
each other. fn compute_words_pair_proximities( word_positions: &HashMap>, -) -> HashMap<(&str, &str), u8> -{ +) -> HashMap<(&str, &str), u8> { use itertools::Itertools; let mut words_pair_proximities = HashMap::new(); @@ -828,31 +875,34 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool { /// take an iterator on tokens and compute their relative position depending on separator kinds /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, /// else we keep the standart proximity of 1 between words. -fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator)> { +fn process_tokens<'a>( + tokens: impl Iterator>, +) -> impl Iterator)> { tokens .skip_while(|token| token.is_separator().is_some()) .scan((0, None), |(offset, prev_kind), token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { - *offset += match *prev_kind { - Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, - Some(_) => 1, - None => 0, - }; - *prev_kind = Some(token.kind) - } - TokenKind::Separator(SeparatorKind::Hard) => { - *prev_kind = Some(token.kind); - } - TokenKind::Separator(SeparatorKind::Soft) - if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => { - *prev_kind = Some(token.kind); - } - _ => (), + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => + { + *prev_kind = Some(token.kind); + } + _ => (), + } Some((*offset, token)) }) - .filter(|(_, t)| t.is_word()) + .filter(|(_, t)| t.is_word()) } fn extract_facet_values(value: &Value) -> (Vec, Vec) { @@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec, Vec) { match value { Value::Null => (), Value::Bool(b) => output_strings.push(b.to_string()), - Value::Number(number) => if let Some(float) = number.as_f64() { - output_numbers.push(float); - }, + Value::Number(number) => { + if let Some(float) = number.as_f64() { + output_numbers.push(float); + } + } Value::String(string) => { let string = string.trim().to_lowercase(); output_strings.push(string); - }, - Value::Array(values) => if can_recurse { - for value in values { - inner_extract_facet_values(value, false, output_numbers, output_strings); + } + Value::Array(values) => { + if can_recurse { + for value in values { + inner_extract_facet_values(value, false, output_numbers, output_strings); + } } - }, + } Value::Object(_) => (), } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 9e88559d0..756ff492e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -10,14 +10,15 @@ use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use crate::error::{Error, UserError, InternalError}; -use crate::index::db_name; -use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv}; -use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; -use crate::{Index, Result}; use super::merge_function::merge_two_obkvs; -use super::{create_writer, create_sorter, IndexDocumentsMethod}; +use 
super::{create_sorter, create_writer, IndexDocumentsMethod}; +use crate::error::{Error, InternalError, UserError}; +use crate::index::db_name; +use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs}; +use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::{ + ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32, +}; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -64,7 +65,11 @@ impl Transform<'_, '_> { self.output_from_generic_json(reader, false, progress_callback) } - pub fn output_from_json_stream(self, reader: R, progress_callback: F) -> Result + pub fn output_from_json_stream( + self, + reader: R, + progress_callback: F, + ) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -86,14 +91,16 @@ impl Transform<'_, '_> { let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); // Deserialize the whole batch of documents in memory. - let mut documents: Peekable>>>> = if is_stream { + let mut documents: Peekable< + Box>>>, + > = if is_stream { let iter = serde_json::Deserializer::from_reader(reader).into_iter(); - let iter = Box::new(iter) as Box>; + let iter = Box::new(iter) as Box>; iter.peekable() } else { let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?; let iter = vec.into_iter().map(Ok); - let iter = Box::new(iter) as Box>; + let iter = Box::new(iter) as Box>; iter.peekable() }; @@ -104,15 +111,16 @@ impl Transform<'_, '_> { Err(_) => { let error = documents.next().unwrap().unwrap_err(); return Err(UserError::SerdeJson(error).into()); - }, + } }; - let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); + let alternative_name = + first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); let (primary_key_id, primary_key) = compute_primary_key_pair( self.index.primary_key(self.rtxn)?, &mut fields_ids_map, alternative_name, - self.autogenerate_docids + self.autogenerate_docids, )?; if documents.peek().is_none() { @@ -173,9 +181,11 @@ impl Transform<'_, '_> { Some(value) => match value { Value::String(string) => Cow::Borrowed(string.as_str()), Value::Number(number) => Cow::Owned(number.to_string()), - content => return Err(UserError::InvalidDocumentId { - document_id: content.clone(), - }.into()), + content => { + return Err( + UserError::InvalidDocumentId { document_id: content.clone() }.into() + ) + } }, None => { if !self.autogenerate_docids { @@ -183,7 +193,7 @@ impl Transform<'_, '_> { } let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); Cow::Borrowed(uuid) - }, + } }; // We iterate in the fields ids ordered. @@ -194,7 +204,8 @@ impl Transform<'_, '_> { // and this should be the document id we return the one we generated. if let Some(value) = document.get(name) { // We serialize the attribute values. - serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?; + serde_json::to_writer(&mut json_buffer, value) + .map_err(InternalError::SerdeJson)?; writer.insert(field_id, &json_buffer)?; } @@ -202,7 +213,8 @@ impl Transform<'_, '_> { if field_id == primary_key_id && validate_document_id(&external_id).is_none() { return Err(UserError::InvalidDocumentId { document_id: Value::from(external_id), - }.into()); + } + .into()); } } @@ -248,9 +260,9 @@ impl Transform<'_, '_> { // Extract the position of the primary key in the current headers, None if not found. let primary_key_pos = match self.index.primary_key(self.rtxn)? 
{ Some(primary_key) => { - // The primary key is known so we must find the position in the CSV headers. - headers.iter().position(|h| h == primary_key) - }, + // The primary key is known so we must find the position in the CSV headers. + headers.iter().position(|h| h == primary_key) + } None => headers.iter().position(is_primary_key), }; @@ -261,7 +273,7 @@ impl Transform<'_, '_> { self.index.primary_key(self.rtxn)?, &mut fields_ids_map, alternative_name, - self.autogenerate_docids + self.autogenerate_docids, )?; // The primary key field is not present in the header, so we need to create it. @@ -308,27 +320,30 @@ impl Transform<'_, '_> { // We validate the document id [a-zA-Z0-9\-_]. match validate_document_id(&external_id) { Some(valid) => valid, - None => return Err(UserError::InvalidDocumentId { - document_id: Value::from(external_id), - }.into()), + None => { + return Err(UserError::InvalidDocumentId { + document_id: Value::from(external_id), + } + .into()) + } } - }, + } None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), }; // When the primary_key_field_id is found in the fields ids list // we return the generated document id instead of the record field. - let iter = fields_ids.iter() - .map(|(fi, i)| { - let field = if *fi == primary_key_id { external_id } else { &record[*i] }; - (fi, field) - }); + let iter = fields_ids.iter().map(|(fi, i)| { + let field = if *fi == primary_key_id { external_id } else { &record[*i] }; + (fi, field) + }); // We retrieve the field id based on the fields ids map fields ids order. for (field_id, field) in iter { // We serialize the attribute values as JSON strings. json_buffer.clear(); - serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?; + serde_json::to_writer(&mut json_buffer, &field) + .map_err(InternalError::SerdeJson)?; writer.insert(*field_id, &json_buffer)?; } @@ -410,26 +425,27 @@ impl Transform<'_, '_> { IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), IndexDocumentsMethod::UpdateDocuments => { let key = BEU32::new(docid); - let base_obkv = self.index.documents.get(&self.rtxn, &key)? - .ok_or(InternalError::DatabaseMissingEntry { + let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None, - })?; + }, + )?; let update_obkv = obkv::KvReader::new(update_obkv); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); (docid, obkv_buffer.as_slice()) } } - }, + } None => { // If this user id is new we add it to the external documents ids map // for new ids and into the list of new documents. - let new_docid = available_documents_ids.next() - .ok_or(UserError::DocumentLimitReached)?; + let new_docid = + available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?; new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; new_documents_ids.insert(new_docid); (new_docid, update_obkv) - }, + } }; // We insert the document under the documents ids map into the final file. @@ -450,7 +466,8 @@ impl Transform<'_, '_> { // We create a final writer to write the new documents in order from the sorter. 
let file = tempfile::tempfile()?; - let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; + let mut writer = + create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; // Once we have written all the documents into the final sorter, we write the documents // into this writer, extract the file and reset the seek to be able to read it again. @@ -485,8 +502,7 @@ impl Transform<'_, '_> { primary_key: String, old_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, - ) -> Result - { + ) -> Result { let fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?; @@ -494,7 +510,8 @@ impl Transform<'_, '_> { // We create a final writer to write the new documents in order from the sorter. let file = tempfile::tempfile()?; - let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; + let mut writer = + create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; let mut obkv_buffer = Vec::new(); for result in self.index.documents.iter(self.rtxn)? { @@ -561,20 +578,19 @@ fn compute_primary_key_pair( return Err(UserError::MissingPrimaryKey.into()); } DEFAULT_PRIMARY_KEY_NAME.to_string() - }, + } }; let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; Ok((id, name)) - }, + } } } fn validate_document_id(document_id: &str) -> Option<&str> { let document_id = document_id.trim(); Some(document_id).filter(|id| { - !id.is_empty() && id.chars().all(|c| { - matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_') - }) + !id.is_empty() + && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) }) } @@ -583,8 +599,7 @@ mod test { use super::*; mod compute_primary_key { - use super::compute_primary_key_pair; - use super::FieldsIdsMap; + use super::{compute_primary_key_pair, FieldsIdsMap}; #[test] fn should_return_primary_key_if_is_some() { @@ -594,7 +609,8 @@ mod test { Some("toto"), &mut fields_map, Some("tata".to_string()), - false); + false, + ); assert_eq!(result.unwrap(), (0u8, "toto".to_string())); assert_eq!(fields_map.len(), 1); } @@ -602,11 +618,8 @@ mod test { #[test] fn should_return_alternative_if_primary_is_none() { let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair( - None, - &mut fields_map, - Some("tata".to_string()), - false); + let result = + compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); assert_eq!(result.unwrap(), (0u8, "tata".to_string())); assert_eq!(fields_map.len(), 1); } @@ -614,23 +627,15 @@ mod test { #[test] fn should_return_default_if_both_are_none() { let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair( - None, - &mut fields_map, - None, - true); + let result = compute_primary_key_pair(None, &mut fields_map, None, true); assert_eq!(result.unwrap(), (0u8, "id".to_string())); assert_eq!(fields_map.len(), 1); } #[test] - fn should_return_err_if_both_are_none_and_recompute_is_false(){ + fn should_return_err_if_both_are_none_and_recompute_is_false() { let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair( - None, - &mut fields_map, - None, - false); + let result = compute_primary_key_pair(None, &mut fields_map, None, false); assert!(result.is_err()); assert_eq!(fields_map.len(), 0); } diff --git a/milli/src/update/mod.rs 
b/milli/src/update/mod.rs index 203937e2f..36ed7d8fa 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,7 +2,9 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; pub use self::facets::Facets; -pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat}; +pub use self::index_documents::{ + DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat, +}; pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 8f4fe48c9..c6540b33a 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -34,17 +34,24 @@ impl Setting { } impl Serialize for Setting { - fn serialize(&self, serializer: S) -> StdResult where S: Serializer { + fn serialize(&self, serializer: S) -> StdResult + where + S: Serializer, + { match self { Self::Set(value) => Some(value), // Usually not_set isn't serialized by setting skip_serializing_if field attribute Self::NotSet | Self::Reset => None, - }.serialize(serializer) + } + .serialize(serializer) } } impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting { - fn deserialize(deserializer: D) -> StdResult where D: Deserializer<'de> { + fn deserialize(deserializer: D) -> StdResult + where + D: Deserializer<'de>, + { Deserialize::deserialize(deserializer).map(|x| match x { Some(x) => Self::Set(x), None => Self::Reset, // Reset is forced by sending null value @@ -141,11 +148,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } pub fn set_stop_words(&mut self, stop_words: BTreeSet) { - self.stop_words = if stop_words.is_empty() { - Setting::Reset - } else { - Setting::Set(stop_words) - } + self.stop_words = + if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) } } pub fn reset_distinct_field(&mut self) { @@ -161,11 +165,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } pub fn set_synonyms(&mut self, synonyms: HashMap>) { - self.synonyms = if synonyms.is_empty() { - Setting::Reset - } else { - Setting::Set(synonyms) - } + self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) } } pub fn reset_primary_key(&mut self) { @@ -178,7 +178,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where - F: Fn(UpdateIndexingStep, u64) + Sync + F: Fn(UpdateIndexingStep, u64) + Sync, { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let update_id = self.update_id; @@ -203,7 +203,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }; // There already has been a document addition, the primary key should be set by now. - let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; + let primary_key = + self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; // We remap the documents fields based on the new `FieldsIdsMap`. 
let output = transform.remap_index_documents( @@ -236,21 +237,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account - let names: Vec<_> = fields - .iter() - .unique() - .map(String::as_str) - .collect(); + let names: Vec<_> = fields.iter().unique().map(String::as_str).collect(); for name in names.iter() { - fields_ids_map - .insert(name) - .ok_or(UserError::AttributeLimitReached)?; + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_displayed_fields(self.wtxn, &names)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; } + Setting::Reset => { + self.index.delete_displayed_fields(self.wtxn)?; + } Setting::NotSet => return Ok(false), } Ok(true) @@ -260,14 +257,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match self.distinct_field { Setting::Set(ref attr) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - fields_ids_map - .insert(attr) - .ok_or(UserError::AttributeLimitReached)?; + fields_ids_map.insert(attr).ok_or(UserError::AttributeLimitReached)?; self.index.put_distinct_field(self.wtxn, &attr)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; }, + Setting::Reset => { + self.index.delete_distinct_field(self.wtxn)?; + } Setting::NotSet => return Ok(false), } Ok(true) @@ -285,30 +282,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let mut new_fields_ids_map = FieldsIdsMap::new(); // fields are deduplicated, only the first occurrence is taken into account - let names = fields - .iter() - .unique() - .map(String::as_str) - .collect::>(); + let names = fields.iter().unique().map(String::as_str).collect::>(); // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { - new_fields_ids_map - .insert(&name) - .ok_or(UserError::AttributeLimitReached)?; + new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; } for (_, name) in old_fields_ids_map.iter() { - new_fields_ids_map - .insert(&name) - .ok_or(UserError::AttributeLimitReached)?; + new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_searchable_fields(self.wtxn, &names)?; self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; } - Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; } + Setting::Reset => { + self.index.delete_searchable_fields(self.wtxn)?; + } Setting::NotSet => return Ok(false), } Ok(true) @@ -323,7 +314,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let fst = fst::Set::from_iter(stop_words)?; // Does the new FST differ from the previous one? - if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) { + if current + .map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) + { // we want to re-create our FST. 
self.index.put_stop_words(self.wtxn, &fst)?; Ok(true) @@ -343,9 +336,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { analyzer .analyze(text) .tokens() - .filter_map(|token| - if token.is_word() { Some(token.text().to_string()) } else { None } - ) + .filter_map(|token| { + if token.is_word() { + Some(token.text().to_string()) + } else { + None + } + }) .collect::>() } @@ -360,25 +357,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { for (word, synonyms) in synonyms { // Normalize both the word and associated synonyms. let normalized_word = normalize(&analyzer, word); - let normalized_synonyms = synonyms - .iter() - .map(|synonym| normalize(&analyzer, synonym)); + let normalized_synonyms = + synonyms.iter().map(|synonym| normalize(&analyzer, synonym)); // Store the normalized synonyms under the normalized word, // merging the possible duplicate words. - let entry = new_synonyms - .entry(normalized_word) - .or_insert_with(Vec::new); + let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new); entry.extend(normalized_synonyms); } // Make sure that we don't have duplicate synonyms. - new_synonyms - .iter_mut() - .for_each(|(_, synonyms)| { - synonyms.sort_unstable(); - synonyms.dedup(); - }); + new_synonyms.iter_mut().for_each(|(_, synonyms)| { + synonyms.sort_unstable(); + synonyms.dedup(); + }); let old_synonyms = self.index.synonyms(self.wtxn)?; @@ -406,7 +398,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_filterable_fields(self.wtxn, &new_facets)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; } + Setting::Reset => { + self.index.delete_filterable_fields(self.wtxn)?; + } Setting::NotSet => (), } Ok(()) @@ -427,7 +421,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_criteria(self.wtxn, &new_criteria)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_criteria(self.wtxn)?; } + Setting::Reset => { + self.index.delete_criteria(self.wtxn)?; + } Setting::NotSet => (), } Ok(()) @@ -445,7 +441,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } else { Err(UserError::PrimaryKeyCannotBeChanged.into()) } - }, + } Setting::Reset => { if self.index.number_of_documents(&self.wtxn)? == 0 { self.index.delete_primary_key(self.wtxn)?; @@ -453,14 +449,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } else { Err(UserError::PrimaryKeyCannotBeReset.into()) } - }, + } Setting::NotSet => Ok(()), } } pub fn execute(mut self, progress_callback: F) -> Result<()> - where - F: Fn(UpdateIndexingStep, u64) + Sync + where + F: Fn(UpdateIndexingStep, u64) + Sync, { self.index.set_updated_at(self.wtxn, &Utc::now())?; @@ -493,17 +489,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { #[cfg(test)] mod tests { - use heed::EnvOpenOptions; - use heed::types::ByteSlice; - use maplit::{btreeset, hashmap, hashset}; use big_s::S; + use heed::types::ByteSlice; + use heed::EnvOpenOptions; + use maplit::{btreeset, hashmap, hashset}; + use super::*; use crate::error::Error; use crate::update::{IndexDocuments, UpdateFormat}; use crate::{Criterion, FilterCondition, SearchResult}; - use super::*; - #[test] fn set_and_reset_searchable_fields() { let path = tempfile::tempdir().unwrap(); @@ -674,7 +669,7 @@ mod tests { // Set the filterable fields to be the age. 
let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset!{ S("age") }); + builder.set_filterable_fields(hashset! { S("age") }); builder.execute(|_, _| ()).unwrap(); // Then index some documents. @@ -692,12 +687,15 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset!{ S("age") }); + assert_eq!(fields_ids, hashset! { S("age") }); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. - let count = index.facet_id_f64_docids + let count = index + .facet_id_f64_docids .remap_key_type::() - .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + .prefix_iter(&rtxn, &[0, 0]) + .unwrap() + .count(); assert_eq!(count, 3); drop(rtxn); @@ -718,9 +716,12 @@ mod tests { let rtxn = index.read_txn().unwrap(); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. - let count = index.facet_id_f64_docids + let count = index + .facet_id_f64_docids .remap_key_type::() - .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + .prefix_iter(&rtxn, &[0, 0]) + .unwrap() + .count(); assert_eq!(count, 4); } @@ -969,7 +970,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); - builder.set_filterable_fields(hashset!{ S("age"), S("toto") }); + builder.set_filterable_fields(hashset! { S("age"), S("toto") }); builder.set_criteria(vec!["asc(toto)".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 1d0e776b1..2816ebca0 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -1,8 +1,8 @@ use grenad::CompressionType; use rayon::ThreadPool; +use super::{ClearDocuments, DeleteDocuments, Facets, IndexDocuments, Settings}; use crate::{Index, Result}; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -67,8 +67,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> ClearDocuments<'t, 'u, 'i> - { + ) -> ClearDocuments<'t, 'u, 'i> { ClearDocuments::new(wtxn, index, self.update_id) } @@ -76,8 +75,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> Result> - { + ) -> Result> { DeleteDocuments::new(wtxn, index, self.update_id) } @@ -85,8 +83,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> IndexDocuments<'t, 'u, 'i, 'a> - { + ) -> IndexDocuments<'t, 'u, 'i, 'a> { let mut builder = IndexDocuments::new(wtxn, index, self.update_id); builder.log_every_n = self.log_every_n; @@ -105,8 +102,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> Settings<'a, 't, 'u, 'i> - { + ) -> Settings<'a, 't, 'u, 'i> { let mut builder = Settings::new(wtxn, index, self.update_id); builder.log_every_n = self.log_every_n; @@ -125,8 +121,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> Facets<'t, 'u, 'i> - { + ) -> Facets<'t, 'u, 'i> { let mut builder = Facets::new(wtxn, index, self.update_id); builder.chunk_compression_type 
= self.chunk_compression_type; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index a2197b28c..ffc359719 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -1,15 +1,13 @@ use std::str; -use crate::Index; use fst::Streamer; use grenad::CompressionType; use heed::types::ByteSlice; -use crate::Result; -use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{ - create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, + create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod, }; +use crate::{Index, Result}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -22,7 +20,10 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { } impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, index, diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 9019b26e5..9b876321e 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,18 +1,17 @@ use std::str; use fst::automaton::{Automaton, Str}; -use fst::{Streamer, IntoStreamer}; +use fst::{IntoStreamer, Streamer}; use grenad::CompressionType; -use heed::BytesEncode; use heed::types::ByteSlice; +use heed::BytesEncode; use log::debug; -use crate::{Index, Result}; use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ - WriteMethod, create_sorter, sorter_into_lmdb_database, - cbo_roaring_bitmap_merge, + cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod, }; +use crate::{Index, Result}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -28,8 +27,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> - { + ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { WordPrefixPairProximityDocids { wtxn, index, diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index e2e3f7b4c..d43cd19b8 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,25 +1,23 @@ -use std::{cmp, str}; use std::convert::TryFrom; use std::fs::File; use std::num::NonZeroU32; +use std::{cmp, str}; use fst::automaton::{self, Automaton}; -use fst::{Streamer, IntoStreamer}; -use grenad::{CompressionType, Reader, Writer, FileFuse}; +use fst::{IntoStreamer, Streamer}; +use grenad::{CompressionType, FileFuse, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use crate::error::InternalError; -use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; -use crate::Result; -use crate::update::index_documents::WriteMethod; +use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; use crate::update::index_documents::{ - create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, - cbo_roaring_bitmap_merge, sorter_into_lmdb_database + cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, + 
write_into_lmdb_database, writer_into_reader, WriteMethod, }; -use crate::{Index, TreeLevel}; +use crate::{Index, Result, TreeLevel}; pub struct WordsLevelPositions<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -34,7 +32,10 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { } impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordsLevelPositions<'t, 'u, 'i> { WordsLevelPositions { wtxn, index, @@ -144,7 +145,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_level_position_docids.as_polymorph(), entries, - |_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }), + |_, _| { + Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }) + }, WriteMethod::Append, )?; @@ -176,13 +179,11 @@ fn compute_positions_levels( shrink_size: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, -) -> Result> -{ +) -> Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transferring them. - let mut writer = tempfile::tempfile().and_then(|file| { - create_writer(compression_type, compression_level, file) - })?; + let mut writer = tempfile::tempfile() + .and_then(|file| create_writer(compression_type, compression_level, file))?; for result in words_db.iter(rtxn)? { let (word, ()) = result?; @@ -193,7 +194,8 @@ fn compute_positions_levels( left..=right }; - let first_level_size = words_positions_db.remap_data_type::() + let first_level_size = words_positions_db + .remap_data_type::() .range(rtxn, &level_0_range)?
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; @@ -253,8 +255,7 @@ fn write_level_entry( left: u32, right: u32, ids: &RoaringBitmap, -) -> Result<()> -{ +) -> Result<()> { let key = (word, level, left, right); let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index d1aa267b8..f35dea10d 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -2,7 +2,8 @@ use std::iter::FromIterator; use std::str; use fst::Streamer; -use crate::{Index, SmallString32, Result}; + +use crate::{Index, Result, SmallString32}; pub struct WordsPrefixesFst<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -17,8 +18,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> WordsPrefixesFst<'t, 'u, 'i> - { + ) -> WordsPrefixesFst<'t, 'u, 'i> { WordsPrefixesFst { wtxn, index, @@ -55,7 +55,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); for n in 1..=self.max_prefix_length { - let mut current_prefix = SmallString32::new(); let mut current_prefix_count = 0; let mut builder = fst::SetBuilder::memory(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 8c63e5e08..7842b6c13 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,9 +1,8 @@ -use milli::{Criterion, Index, DocumentId}; -use milli::update::{IndexDocuments, UpdateFormat, Settings}; - use big_s::S; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; +use milli::update::{IndexDocuments, Settings, UpdateFormat}; +use milli::{Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -11,7 +10,8 @@ mod query_criteria; pub const TEST_QUERY: &'static str = "hello world america"; -pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; +pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = + &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; pub const CONTENT: &str = include_str!("../assets/test_set.ndjson"); @@ -27,16 +27,16 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let criteria = criteria.iter().map(|c| c.to_string()).collect(); builder.set_criteria(criteria); - builder.set_filterable_fields(hashset!{ + builder.set_filterable_fields(hashset! { S("tag"), S("asc_desc_rank"), }); - builder.set_synonyms(hashmap!{ + builder.set_synonyms(hashmap! 
{ S("hello") => vec![S("good morning")], S("world") => vec![S("earth")], S("america") => vec![S("the united states")], }); - builder.set_searchable_fields(vec![S("title"),S("description")]); + builder.set_searchable_fields(vec![S("title"), S("description")]); builder.execute(|_, _| ()).unwrap(); // index documents @@ -53,12 +53,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec { let mut rtxn = index.read_txn().unwrap(); let docid_map = index.external_documents_ids(&mut rtxn).unwrap(); - let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); + let docid_map: std::collections::HashMap<_, _> = + EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() } -pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_words: bool) -> Vec { - let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); +pub fn expected_order( + criteria: &[Criterion], + authorize_typo: bool, + optional_words: bool, +) -> Vec { + let dataset = + serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); let mut groups: Vec> = vec![dataset]; for criterion in criteria { @@ -67,32 +73,36 @@ pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_wor match criterion { Criterion::Attribute => { group.sort_by_key(|d| d.attribute_rank); - new_groups.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); - }, + new_groups + .extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); + } Criterion::Exactness => { group.sort_by_key(|d| d.exact_rank); new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from)); - }, + } Criterion::Proximity => { group.sort_by_key(|d| d.proximity_rank); - new_groups.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); - }, + new_groups + .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); + } Criterion::Typo => { group.sort_by_key(|d| d.typo_rank); new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); - }, + } Criterion::Words => { group.sort_by_key(|d| d.word_rank); new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from)); - }, + } Criterion::Asc(field_name) if field_name == "asc_desc_rank" => { group.sort_by_key(|d| d.asc_desc_rank); - new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); - }, - Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { + new_groups + .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); + } + Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); - new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); - }, + new_groups + .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); + } Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), } } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 2b9c5ae5e..19173bc72 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,9 +1,9 @@ use big_s::S; use milli::update::Settings; -use milli::{Search, SearchResult, Criterion}; 
+use milli::{Criterion, Search, SearchResult}; +use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; -use Criterion::*; const ALLOW_TYPOS: bool = true; const DISALLOW_TYPOS: bool = false; @@ -35,29 +35,54 @@ macro_rules! test_criterion { } } +#[rustfmt::skip] test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS); +#[rustfmt::skip] test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS); +#[rustfmt::skip] test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words); +#[rustfmt::skip] test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute); +#[rustfmt::skip] test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute); +#[rustfmt::skip] test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness); +#[rustfmt::skip] test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness); +#[rustfmt::skip] test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity); +#[rustfmt::skip] test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity); +#[rustfmt::skip] test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank"))); +#[rustfmt::skip] test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank"))); +#[rustfmt::skip] test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank"))); +#[rustfmt::skip] test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank"))); +#[rustfmt::skip] test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field"))); +#[rustfmt::skip] test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field"))); +#[rustfmt::skip] test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field"))); +#[rustfmt::skip] test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field"))); #[test] fn criteria_mixup() { use Criterion::*; - let index = search::setup_search_index_with_criteria(&vec![Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo]); + let index = search::setup_search_index_with_criteria(&vec![ + Words, + Attribute, + Desc(S("asc_desc_rank")), + Exactness, + Proximity, + Typo, + ]); + #[rustfmt::skip] let criteria_mix = { // Criterion doesn't implement Copy, we create a new Criterion using a closure let desc = || Desc(S("asc_desc_rank")); @@ -205,10 +230,11 @@ fn criteria_mixup() { let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) - .into_iter() - .map(|d| d.id) - .collect(); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) + .into_iter() + .map(|d| d.id) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); diff --git a/script/pre-commit b/script/pre-commit new file mode 100755 index 000000000..4819a3b52 --- /dev/null +++ b/script/pre-commit @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +cargo check --workspace --all-targets &>/dev/null +result=$? 
+ +if [[ ${result} -ne 0 ]] ; then + cat <<\EOF +The project does not compile. You might want to fix your error before committing. + +If you still want to commit you can do it by appending +--no-verify +at the end of your previous command. + +If you are running a variant of bash you can directly paste this command in your terminal: +!! --no-verify +EOF + exit 1 +fi + +cargo fmt --all -- --check &>/dev/null +result=$? + +if [[ ${result} -ne 0 ]] ; then + cat <<\EOF +The project is badly formatted. Please run: +cargo fmt --all + +If you want to create your commit without proper formatting you can add +--no-verify +at the end of your commit. + +If you are running a variant of bash you can directly paste this command in your terminal: +!! --no-verify +EOF + exit 1 +fi diff --git a/qc_loop.sh b/script/qc_loop.sh similarity index 100% rename from qc_loop.sh rename to script/qc_loop.sh diff --git a/search/src/main.rs b/search/src/main.rs index f7f95b730..fba714dab 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -6,10 +6,9 @@ use std::time::Instant; use byte_unit::Byte; use heed::EnvOpenOptions; use log::debug; +use milli::{obkv_to_json, Index}; use structopt::StructOpt; -use milli::{Index, obkv_to_json}; - #[cfg(target_os = "linux")] #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; @@ -86,7 +85,8 @@ fn main() -> anyhow::Result<()> { } if opt.print_facet_distribution { - let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; + let facets = + index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; serde_json::to_writer(&mut stdout, &facets)?; let _ = writeln!(&mut stdout); }