From 2341b993795df0c8dc208a4009715cae1f06170e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Nov 2020 13:21:22 +0100 Subject: [PATCH 01/30] Support a basic facet based query system --- http-ui/src/main.rs | 11 ++- src/lib.rs | 2 +- src/search.rs | 165 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 166 insertions(+), 12 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b730344f2..d05b69f2c 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -28,7 +28,7 @@ use warp::{Filter, http::Response}; use milli::tokenizer::{simple_tokenizer, TokenType}; use milli::update::UpdateIndexingStep::*; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; -use milli::{obkv_to_json, Index, UpdateStore, SearchResult}; +use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -550,9 +550,12 @@ async fn main() -> anyhow::Result<()> { .body(include_str!("../public/logo-black.svg")) ); - #[derive(Deserialize)] + #[derive(Debug, Deserialize)] + #[serde(deny_unknown_fields)] + #[serde(rename_all = "camelCase")] struct QueryBody { query: Option, + facet_condition: Option, } let disable_highlighting = opt.disable_highlighting; @@ -569,6 +572,10 @@ async fn main() -> anyhow::Result<()> { if let Some(query) = query.query { search.query(query); } + if let Some(condition) = query.facet_condition { + let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap(); + search.facet_condition(condition); + } let SearchResult { found_words, documents_ids } = search.execute().unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 12a24a59c..ff578dd4b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,7 +24,7 @@ pub use self::criterion::{Criterion, default_criteria}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::index::Index; -pub use 
self::search::{Search, SearchResult}; +pub use self::search::{Search, FacetCondition, SearchResult}; pub use self::heed_codec::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, diff --git a/src/search.rs b/src/search.rs index ae2b5d127..17f25edfc 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,6 +1,8 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; +use std::fmt; +use anyhow::{bail, ensure, Context}; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; @@ -8,8 +10,10 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::query_tokens::{QueryTokens, QueryToken}; +use crate::facet::FacetType; +use crate::heed_codec::{CboRoaringBitmapCodec, facet::FacetValueI64Codec}; use crate::mdfs::Mdfs; +use crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, DocumentId}; // Building these factories is not free. @@ -17,8 +21,91 @@ static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); +// TODO support also floats +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum FacetOperator { + GreaterThan(i64), + GreaterThanOrEqual(i64), + LowerThan(i64), + LowerThanOrEqual(i64), + Equal(i64), + Between(i64, i64), +} + +// TODO also support ANDs, ORs, NOTs. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum FacetCondition { + Operator(u8, FacetOperator), +} + +impl FacetCondition { + pub fn from_str( + rtxn: &heed::RoTxn, + index: &Index, + string: &str, + ) -> anyhow::Result> + { + use FacetCondition::*; + use FacetOperator::*; + + let fields_ids_map = index.fields_ids_map(rtxn)?; + let faceted_fields = index.faceted_fields(rtxn)?; + + // TODO use a better parsing technic + let mut iter = string.split_whitespace(); + + let field_name = match iter.next() { + Some(field_name) => field_name, + None => return Ok(None), + }; + + let field_id = fields_ids_map.id(&field_name).with_context(|| format!("field {} not found", field_name))?; + let field_type = faceted_fields.get(&field_id).with_context(|| format!("field {} is not faceted", field_name))?; + + ensure!(*field_type == FacetType::Integer, "Only conditions on integer facets"); + + match iter.next() { + Some(">") => { + let param = iter.next().context("missing parameter")?; + let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; + Ok(Some(Operator(field_id, GreaterThan(value)))) + }, + Some(">=") => { + let param = iter.next().context("missing parameter")?; + let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; + Ok(Some(Operator(field_id, GreaterThanOrEqual(value)))) + }, + Some("<") => { + let param = iter.next().context("missing parameter")?; + let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; + Ok(Some(Operator(field_id, LowerThan(value)))) + }, + Some("<=") => { + let param = iter.next().context("missing parameter")?; + let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; + Ok(Some(Operator(field_id, LowerThanOrEqual(value)))) + }, + Some("=") => { + let param = iter.next().context("missing parameter")?; + let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; + 
Ok(Some(Operator(field_id, Equal(value)))) + }, + Some(otherwise) => { + // BETWEEN or X TO Y (both inclusive) + let left_param = otherwise.parse().with_context(|| format!("invalid first TO parameter ({:?})", otherwise))?; + ensure!(iter.next().map_or(false, |s| s.eq_ignore_ascii_case("to")), "TO keyword missing or invalid"); + let next = iter.next().context("missing second TO parameter")?; + let right_param = next.parse().with_context(|| format!("invalid second TO parameter ({:?})", next))?; + Ok(Some(Operator(field_id, Between(left_param, right_param)))) + }, + None => bail!("missing facet filter first parameter"), + } + } +} + pub struct Search<'a> { query: Option, + facet_condition: Option, offset: usize, limit: usize, rtxn: &'a heed::RoTxn<'a>, @@ -27,7 +114,7 @@ pub struct Search<'a> { impl<'a> Search<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { - Search { query: None, offset: 0, limit: 20, rtxn, index } + Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index } } pub fn query(&mut self, query: impl Into) -> &mut Search<'a> { @@ -45,6 +132,11 @@ impl<'a> Search<'a> { self } + pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { + self.facet_condition = Some(condition); + self + } + /// Extracts the query words from the query string and returns the DFAs accordingly. /// TODO introduce settings for the number of typos regarding the words lengths. fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { @@ -135,22 +227,66 @@ impl<'a> Search<'a> { pub fn execute(&self) -> anyhow::Result { let limit = self.limit; - let fst = self.index.words_fst(self.rtxn)?; // Construct the DFAs related to the query words. 
- let dfas = match self.query.as_deref().map(Self::generate_query_dfas) { - Some(dfas) if !dfas.is_empty() => dfas, - _ => { + let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { + Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), + _otherwise => None, + }; + + // We create the original candidates with the facet conditions results. + let facet_candidates = match self.facet_condition { + Some(FacetCondition::Operator(fid, operator)) => { + use std::ops::Bound::{Included, Excluded}; + use FacetOperator::*; + // Make sure we always bound the ranges with the field id, as the facets + // values are all in the same database and prefixed by the field id. + let range = match operator { + GreaterThan(val) => (Excluded((fid, val)), Included((fid, i64::MAX))), + GreaterThanOrEqual(val) => (Included((fid, val)), Included((fid, i64::MAX))), + LowerThan(val) => (Included((fid, i64::MIN)), Excluded((fid, val))), + LowerThanOrEqual(val) => (Included((fid, i64::MIN)), Included((fid, val))), + Equal(val) => (Included((fid, val)), Included((fid, val))), + Between(left, right) => (Included((fid, left)), Included((fid, right))), + }; + + let mut candidates = RoaringBitmap::new(); + + let db = self.index.facet_field_id_value_docids; + let db = db.remap_types::(); + for result in db.range(self.rtxn, &range)? 
{ + let ((_fid, _value), docids) = result?; + candidates.union_with(&docids); + } + + Some(candidates) + }, + None => None, + }; + + let (candidates, derived_words) = match (facet_candidates, derived_words) { + (Some(mut facet_candidates), Some(derived_words)) => { + let words_candidates = Self::compute_candidates(&derived_words); + facet_candidates.intersect_with(&words_candidates); + (facet_candidates, derived_words) + }, + (None, Some(derived_words)) => { + (Self::compute_candidates(&derived_words), derived_words) + }, + (Some(facet_candidates), None) => { + // If the query is not set or results in no DFAs but + // there is some facet conditions we return a placeholder. + let documents_ids = facet_candidates.iter().take(limit).collect(); + return Ok(SearchResult { documents_ids, ..Default::default() }) + }, + (None, None) => { // If the query is not set or results in no DFAs we return a placeholder. let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect(); return Ok(SearchResult { documents_ids, ..Default::default() }) }, }; - let derived_words = self.fetch_words_docids(&fst, dfas)?; - let candidates = Self::compute_candidates(&derived_words); - debug!("candidates: {:?}", candidates); // The mana depth first search is a revised DFS that explore @@ -175,6 +311,17 @@ impl<'a> Search<'a> { } } +impl fmt::Debug for Search<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Search") + .field("query", &self.query) + .field("facet_condition", &self.facet_condition) + .field("offset", &self.offset) + .field("limit", &self.limit) + .finish() + } +} + #[derive(Default)] pub struct SearchResult { pub found_words: HashSet, From 218eb972416f06a187b0ca42bbbd18d866056c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Nov 2020 14:01:41 +0100 Subject: [PATCH 02/30] Introduce an input field for the facet filters on the http-ui --- http-ui/public/script.js | 9 +++++---- http-ui/src/main.rs | 
5 +++-- http-ui/templates/index.html | 3 ++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/http-ui/public/script.js b/http-ui/public/script.js index 70b9e4da1..fb7a95cc9 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -1,8 +1,9 @@ var request = null; var timeoutID = null; -$('#search').on('input', function () { - var query = $(this).val(); +$('#query, #facet').on('input', function () { + var query = $('#query').val(); + var facet = $('#facet').val(); var timeoutMs = 100; if (timeoutID !== null) { @@ -14,7 +15,7 @@ $('#search').on('input', function () { type: "POST", url: "query", contentType: 'application/json', - data: JSON.stringify({ 'query': query }), + data: JSON.stringify({ 'query': query, 'facetCondition': facet }), contentType: 'application/json', success: function (data, textStatus, request) { results.innerHTML = ''; @@ -77,5 +78,5 @@ $('#db-size').text(function(index, text) { // We trigger the input when we load the script, this way // we execute a placeholder search when the input is empty. $(window).on('load', function () { - $('#search').trigger('input'); + $('#query').trigger('input'); }); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index d05b69f2c..9671576f6 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -573,8 +573,9 @@ async fn main() -> anyhow::Result<()> { search.query(query); } if let Some(condition) = query.facet_condition { - let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap(); - search.facet_condition(condition); + if let Some(condition) = FacetCondition::from_str(&rtxn, &index, &condition).unwrap() { + search.facet_condition(condition); + } } let SearchResult { found_words, documents_ids } = search.execute().unwrap(); diff --git a/http-ui/templates/index.html b/http-ui/templates/index.html index f2161457d..0ef239622 100644 --- a/http-ui/templates/index.html +++ b/http-ui/templates/index.html @@ -55,7 +55,8 @@
- + +
From b255be93fa356fb2ae75a3a3a121a808a86366a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Nov 2020 16:16:15 +0100 Subject: [PATCH 03/30] Bump heed to 0.10.3 --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- http-ui/Cargo.lock | 4 ++-- http-ui/Cargo.toml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ddb2e9ec5..f32d2f133 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,9 +378,9 @@ dependencies = [ [[package]] name = "heed" -version = "0.10.1" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797" +checksum = "8d2740ccbbfb2a6e6ff0c43e0fc14981ed668fb45be5a4e7b2bc03fc8cca3d3e" dependencies = [ "byteorder", "heed-traits", diff --git a/Cargo.toml b/Cargo.toml index fd453a5f2..bee4ebddc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ flate2 = "1.0.17" fst = "0.4.4" fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } -heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { version = "0.10.3", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index 23fed5bbe..162ca96b2 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -654,9 +654,9 @@ dependencies = [ [[package]] name = "heed" -version = "0.10.1" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797" +checksum = "8d2740ccbbfb2a6e6ff0c43e0fc14981ed668fb45be5a4e7b2bc03fc8cca3d3e" dependencies = [ "byteorder", "heed-traits", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 7e28e1211..e8d30a5ce 100644 --- a/http-ui/Cargo.toml +++ 
b/http-ui/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" [dependencies] anyhow = "1.0.28" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } -heed = "0.10.1" +heed = "0.10.3" memmap = "0.7.0" milli = { path = ".." } once_cell = "1.4.1" From 9b7e516a56d97debcde8b350f7b78107b4d8aaee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Nov 2020 19:43:50 +0100 Subject: [PATCH 04/30] Fix the indexing process going back in time --- src/update/index_documents/store.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 9c75f10fe..110ab7f25 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -399,7 +399,7 @@ impl Store { // We skip documents that must not be indexed by this thread. if count % num_threads == thread_index { // This is a log routine that we do every `log_every_n` documents. - if log_every_n.map_or(false, |len| count % len == 0) { + if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); progress_callback(UpdateIndexingStep::IndexDocuments { documents_seen: count, From fd8360deb12e0cd3ff258bf68c6afb142c49a8a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 15 Nov 2020 11:06:51 +0100 Subject: [PATCH 05/30] Update the facet indexing facet test --- src/update/settings.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/update/settings.rs b/src/update/settings.rs index 03f184ef6..141abcf00 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -412,6 +412,21 @@ mod tests { let rtxn = index.read_txn().unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap(); assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer }); + let count = index.facet_field_id_value_docids.len(&rtxn).unwrap(); + 
assert_eq!(count, 3); + drop(rtxn); + + // Index a little more documents with new and current facets values. + let mut wtxn = index.write_txn().unwrap(); + let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index); + builder.update_format(UpdateFormat::Csv); + builder.execute(content, |_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let count = index.facet_field_id_value_docids.len(&rtxn).unwrap(); + assert_eq!(count, 4); drop(rtxn); } } From 57d253aeda2000c2166f1ed70f54f34f47c0b0df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 15 Nov 2020 11:58:19 +0100 Subject: [PATCH 06/30] Improve the infos biggest-value subcommand to support facets --- src/subcommand/infos.rs | 71 +++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 0c7fa36c6..6a69d1bad 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -229,42 +229,93 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho use std::cmp::Reverse; use std::collections::BinaryHeap; use heed::types::{Str, ByteSlice}; - use crate::heed_codec::BEU32StrCodec; + use crate::facet::FacetType; + use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; + + let Index { + env: _env, + main, + word_docids, + docid_word_positions, + word_pair_proximity_docids, + facet_field_id_value_docids, + documents, + } = index; let main_name = "main"; let word_docids_name = "word_docids"; let docid_word_positions_name = "docid_word_positions"; + let word_pair_proximity_docids_name = "word_pair_proximity_docids"; + let facet_field_id_value_docids_name = "facet_field_id_value_docids"; + let documents_name = "documents"; let mut heap = BinaryHeap::with_capacity(limit + 1); if limit > 0 { let words_fst = index.words_fst(rtxn)?; - 
heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name))); if heap.len() > limit { heap.pop(); } - if let Some(documents) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents")? { - heap.push(Reverse((documents.len(), format!("documents"), main_name))); - if heap.len() > limit { heap.pop(); } - } - - if let Some(documents_ids) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { + if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); if heap.len() > limit { heap.pop(); } } - for result in index.word_docids.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? { + for result in word_docids.remap_data_type::().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); if heap.len() > limit { heap.pop(); } } - for result in index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, ByteSlice>(rtxn)? { + for result in docid_word_positions.remap_data_type::().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); heap.push(Reverse((value.len(), key, docid_word_positions_name))); if heap.len() > limit { heap.pop(); } } + + for result in word_pair_proximity_docids.remap_data_type::().iter(rtxn)? 
{ + let ((word1, word2, prox), value) = result?; + let key = format!("{} {} {}", word1, word2, prox); + heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); + if heap.len() > limit { heap.pop(); } + } + + let faceted_fields = index.faceted_fields(rtxn)?; + let fields_ids_map = index.fields_ids_map(rtxn)?; + for (field_id, field_type) in faceted_fields { + let facet_name = fields_ids_map.name(field_id).unwrap(); + let iter = facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; + let iter = match field_type { + FacetType::String => { + let iter = iter.remap_types::() + .map(|r| r.map(|((_, k), v)| (k.to_string(), v))); + Box::new(iter) as Box> + }, + FacetType::Float => { + let iter = iter.remap_types::() + .map(|r| r.map(|((_, k), v)| (k.to_string(), v))); + Box::new(iter) + }, + FacetType::Integer => { + let iter = iter.remap_types::() + .map(|r| r.map(|((_, k), v)| (k.to_string(), v))); + Box::new(iter) + }, + }; + for result in iter { + let (fvalue, value) = result?; + let key = format!("{} {}", facet_name, fvalue); + heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name))); + if heap.len() > limit { heap.pop(); } + } + } + + for result in documents.remap_data_type::().iter(rtxn)? 
{ + let (id, value) = result?; + heap.push(Reverse((value.len(), id.to_string(), documents_name))); + if heap.len() > limit { heap.pop(); } + } } let stdout = io::stdout(); From 9ec95679e17723d9cffb774a5bf16215723a78ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Nov 2020 16:29:07 +0100 Subject: [PATCH 07/30] Introduce a function to retrieve the facet level range docids --- .../facet/facet_level_value_f64_codec.rs | 62 +++++++++ .../facet/facet_level_value_i64_codec.rs | 43 ++++++ src/heed_codec/facet/facet_value_f64_codec.rs | 50 ------- src/heed_codec/facet/facet_value_i64_codec.rs | 28 ---- src/heed_codec/facet/mod.rs | 8 +- src/search.rs | 115 +++++++++++++--- src/subcommand/infos.rs | 103 +++++++-------- src/update/index_documents/facet_level.rs | 125 ++++++++++++++++++ src/update/index_documents/mod.rs | 41 +++++- src/update/index_documents/store.rs | 6 +- src/update/settings.rs | 6 +- 11 files changed, 423 insertions(+), 164 deletions(-) create mode 100644 src/heed_codec/facet/facet_level_value_f64_codec.rs create mode 100644 src/heed_codec/facet/facet_level_value_i64_codec.rs delete mode 100644 src/heed_codec/facet/facet_value_f64_codec.rs delete mode 100644 src/heed_codec/facet/facet_value_i64_codec.rs create mode 100644 src/update/index_documents/facet_level.rs diff --git a/src/heed_codec/facet/facet_level_value_f64_codec.rs b/src/heed_codec/facet/facet_level_value_f64_codec.rs new file mode 100644 index 000000000..55fd4b132 --- /dev/null +++ b/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -0,0 +1,62 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::facet::value_encoding::f64_into_bytes; + +// TODO do not de/serialize right bound when level = 0 +pub struct FacetLevelValueF64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { + type DItem = (u8, u8, f64, f64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (level, bytes) 
= bytes.split_first()?; + + let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; + let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; + + Some((*field_id, *level, left, right)) + } +} + +impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { + type EItem = (u8, u8, f64, f64); + + fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { + let mut buffer = [0u8; 32]; + + // Write the globally ordered floats. + let bytes = f64_into_bytes(*left)?; + buffer[..8].copy_from_slice(&bytes[..]); + + let bytes = f64_into_bytes(*right)?; + buffer[8..16].copy_from_slice(&bytes[..]); + + // Then the f64 values just to be able to read them back. + let bytes = left.to_be_bytes(); + buffer[16..24].copy_from_slice(&bytes[..]); + + let bytes = right.to_be_bytes(); + buffer[24..].copy_from_slice(&bytes[..]); + + let mut bytes = Vec::with_capacity(buffer.len() + 2); + bytes.push(*field_id); + bytes.push(*level); + bytes.extend_from_slice(&buffer[..]); + Some(Cow::Owned(bytes)) + } +} + +#[cfg(test)] +mod tests { + use heed::{BytesEncode, BytesDecode}; + use super::*; + + #[test] + fn globally_ordered_f64() { + let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, -32.0, 32.0)).unwrap(); + let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); + assert_eq!((name, level, left, right), (3, 0, -32.0, 32.0)); + } +} diff --git a/src/heed_codec/facet/facet_level_value_i64_codec.rs b/src/heed_codec/facet/facet_level_value_i64_codec.rs new file mode 100644 index 000000000..7cf9a714b --- /dev/null +++ b/src/heed_codec/facet/facet_level_value_i64_codec.rs @@ -0,0 +1,43 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; + +pub struct FacetLevelValueI64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec { + type DItem = (u8, u8, i64, i64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = 
bytes.split_first()?; + let (level, bytes) = bytes.split_first()?; + + let left = bytes[..8].try_into().map(i64_from_bytes).ok()?; + let right = if *level != 0 { + bytes[8..].try_into().map(i64_from_bytes).ok()? + } else { + left + }; + + Some((*field_id, *level, left, right)) + } +} + +impl heed::BytesEncode<'_> for FacetLevelValueI64Codec { + type EItem = (u8, u8, i64, i64); + + fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { + let left = i64_into_bytes(*left); + let right = i64_into_bytes(*right); + + let mut bytes = Vec::with_capacity(2 + left.len() + right.len()); + bytes.push(*field_id); + bytes.push(*level); + bytes.extend_from_slice(&left[..]); + if *level != 0 { + bytes.extend_from_slice(&right[..]); + } + + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/facet_value_f64_codec.rs b/src/heed_codec/facet/facet_value_f64_codec.rs deleted file mode 100644 index 228514de5..000000000 --- a/src/heed_codec/facet/facet_value_f64_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; - -pub struct FacetValueF64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec { - type DItem = (u8, f64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, buffer) = bytes.split_first()?; - let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?; - Some((*field_id, value)) - } -} - -impl heed::BytesEncode<'_> for FacetValueF64Codec { - type EItem = (u8, f64); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut buffer = [0u8; 16]; - - // Write the globally ordered float. - let bytes = f64_into_bytes(*value)?; - buffer[..8].copy_from_slice(&bytes[..]); - - // Then the f64 value just to be able to read it back. 
- let bytes = value.to_be_bytes(); - buffer[8..].copy_from_slice(&bytes[..]); - - let mut bytes = Vec::with_capacity(buffer.len() + 1); - bytes.push(*field_id); - bytes.extend_from_slice(&buffer[..]); - Some(Cow::Owned(bytes)) - } -} - -#[cfg(test)] -mod tests { - use heed::{BytesEncode, BytesDecode}; - use super::*; - - #[test] - fn globally_ordered_f64() { - let bytes = FacetValueF64Codec::bytes_encode(&(3, -32.0)).unwrap(); - let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, value), (3, -32.0)); - } -} diff --git a/src/heed_codec/facet/facet_value_i64_codec.rs b/src/heed_codec/facet/facet_value_i64_codec.rs deleted file mode 100644 index f99b8a3ea..000000000 --- a/src/heed_codec/facet/facet_value_i64_codec.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; - -pub struct FacetValueI64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec { - type DItem = (u8, i64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, buffer) = bytes.split_first()?; - let value = buffer.try_into().map(i64_from_bytes).ok()?; - Some((*field_id, value)) - } -} - -impl heed::BytesEncode<'_> for FacetValueI64Codec { - type EItem = (u8, i64); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let value = i64_into_bytes(*value); - let mut bytes = Vec::with_capacity(value.len() + 1); - bytes.push(*field_id); - bytes.extend_from_slice(&value[..]); - Some(Cow::Owned(bytes)) - } -} diff --git a/src/heed_codec/facet/mod.rs b/src/heed_codec/facet/mod.rs index abe2c1d8a..ef97e6add 100644 --- a/src/heed_codec/facet/mod.rs +++ b/src/heed_codec/facet/mod.rs @@ -1,7 +1,7 @@ -mod facet_value_f64_codec; -mod facet_value_i64_codec; +mod facet_level_value_f64_codec; +mod facet_level_value_i64_codec; mod facet_value_string_codec; -pub use self::facet_value_f64_codec::FacetValueF64Codec; -pub use 
self::facet_value_i64_codec::FacetValueI64Codec; +pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; +pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec; pub use self::facet_value_string_codec::FacetValueStringCodec; diff --git a/src/search.rs b/src/search.rs index 17f25edfc..3338d5222 100644 --- a/src/search.rs +++ b/src/search.rs @@ -4,6 +4,7 @@ use std::fmt; use anyhow::{bail, ensure, Context}; use fst::{IntoStreamer, Streamer}; +use heed::types::DecodeIgnore; use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; @@ -11,7 +12,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::{CboRoaringBitmapCodec, facet::FacetValueI64Codec}; +use crate::heed_codec::{facet::FacetLevelValueI64Codec, CboRoaringBitmapCodec}; use crate::mdfs::Mdfs; use crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, DocumentId}; @@ -238,29 +239,105 @@ impl<'a> Search<'a> { // We create the original candidates with the facet conditions results. let facet_candidates = match self.facet_condition { Some(FacetCondition::Operator(fid, operator)) => { - use std::ops::Bound::{Included, Excluded}; + use std::ops::Bound::{self, Included, Excluded}; use FacetOperator::*; - // Make sure we always bound the ranges with the field id, as the facets - // values are all in the same database and prefixed by the field id. 
- let range = match operator { - GreaterThan(val) => (Excluded((fid, val)), Included((fid, i64::MAX))), - GreaterThanOrEqual(val) => (Included((fid, val)), Included((fid, i64::MAX))), - LowerThan(val) => (Included((fid, i64::MIN)), Excluded((fid, val))), - LowerThanOrEqual(val) => (Included((fid, i64::MIN)), Included((fid, val))), - Equal(val) => (Included((fid, val)), Included((fid, val))), - Between(left, right) => (Included((fid, left)), Included((fid, right))), - }; - let mut candidates = RoaringBitmap::new(); + fn explore_facet_levels( + rtxn: &heed::RoTxn, + db: &heed::Database, + field_id: u8, + level: u8, + left: Bound, + right: Bound, + candidates: &mut RoaringBitmap, + ) -> anyhow::Result<()> + { + let mut left_found = left; + let mut right_found = right; - let db = self.index.facet_field_id_value_docids; - let db = db.remap_types::(); - for result in db.range(self.rtxn, &range)? { - let ((_fid, _value), docids) = result?; - candidates.union_with(&docids); + let range = { + let left = match left { + Included(left) => Included((field_id, level, left, i64::MIN)), + Excluded(left) => Excluded((field_id, level, left, i64::MIN)), + Bound::Unbounded => Bound::Unbounded, + }; + let right = Included((field_id, level, i64::MAX, i64::MAX)); + (left, right) + }; + + for (i, result) in db.range(rtxn, &range)?.enumerate() { + let ((_fid, _level, l, r), docids) = result?; + match right { + Included(right) if r > right => break, + Excluded(right) if r >= right => break, + _ => (), + } + + eprintln!("{} to {} (level {})", l, r, _level); + candidates.union_with(&docids); + // We save the leftest and rightest bounds we actually found at this level. + if i == 0 { left_found = Excluded(l); } + right_found = Excluded(r); + } + + // Can we go deeper? + let deeper_level = match level.checked_sub(1) { + Some(level) => level, + None => return Ok(()), + }; + + // We must refine the left and right bounds of this range by retrieving the + // missing part in a deeper level. 
+ // TODO we must avoid going at deeper when the bounds are already satisfied. + explore_facet_levels(rtxn, db, field_id, deeper_level, left, left_found, candidates)?; + explore_facet_levels(rtxn, db, field_id, deeper_level, right_found, right, candidates)?; + + Ok(()) } - Some(candidates) + // Make sure we always bound the ranges with the field id, as the facets + // values are all in the same database and prefixed by the field id. + let (left, right) = match operator { + GreaterThan(val) => (Excluded(val), Included(i64::MAX)), + GreaterThanOrEqual(val) => (Included(val), Included(i64::MAX)), + LowerThan(val) => (Included(i64::MIN), Excluded(val)), + LowerThanOrEqual(val) => (Included(i64::MIN), Included(val)), + Equal(val) => (Included(val), Included(val)), + Between(left, right) => (Included(left), Included(right)), + }; + + let db = self.index + .facet_field_id_value_docids + .remap_key_type::(); + + let biggest_level = match fid.checked_add(1) { + Some(next_fid) => { + // If we are able to find the next field id we ask the key that is before + // the first entry of it which corresponds to the last key of our field id. + let db = db.remap_data_type::(); + match db.get_lower_than(self.rtxn, &(next_fid, 0, i64::MIN, i64::MIN))? { + Some(((id, level, _left, _right), _docids)) if fid == id => Some(level), + _ => None, + } + }, + None => { + // If we can't generate a bigger field id, it must be equal to 255 and + // therefore the last key of the database must be the one we want. + match db.remap_data_type::().last(self.rtxn)? 
{ + Some(((id, level, _left, _right), _docids)) if fid == id => Some(level), + _ => None, + } + }, + }; + + match biggest_level { + Some(level) => { + let mut candidates = RoaringBitmap::new(); + explore_facet_levels(self.rtxn, &db, fid, level, left, right, &mut candidates)?; + Some(candidates) + }, + None => None, + } }, None => None, }; diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 6a69d1bad..74b1505f9 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -2,9 +2,9 @@ use std::path::PathBuf; use std::{str, io}; use anyhow::Context; -use crate::Index; use heed::EnvOpenOptions; use structopt::StructOpt; +use crate::Index; use Command::*; @@ -225,12 +225,52 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: Ok(wtr.flush()?) } +fn facet_values_iter<'txn, DC: 'txn>( + rtxn: &'txn heed::RoTxn, + db: heed::Database, + field_id: u8, + facet_type: crate::facet::FacetType, +) -> heed::Result> + 'txn>> +where + DC: heed::BytesDecode<'txn>, +{ + use crate::facet::FacetType; + use crate::heed_codec::facet::{ + FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, + }; + + let iter = db.prefix_iter(&rtxn, &[field_id])?; + match facet_type { + FacetType::String => { + let iter = iter.remap_key_type::() + .map(|r| r.map(|((_, key), value)| (key.to_string(), value))); + Ok(Box::new(iter) as Box>) + }, + FacetType::Float => { + let iter = iter.remap_key_type::() + .map(|r| r.map(|((_, level, left, right), value)| if level == 0 { + (format!("{} (level {})", left, level), value) + } else { + (format!("{} to {} (level {})", left, right, level), value) + })); + Ok(Box::new(iter)) + }, + FacetType::Integer => { + let iter = iter.remap_key_type::() + .map(|r| r.map(|((_, level, left, right), value)| if level == 0 { + (format!("{} (level {})", left, level), value) + } else { + (format!("{} to {} (level {})", left, right, level), value) + })); + Ok(Box::new(iter)) + }, + } +} + fn 
biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { use std::cmp::Reverse; use std::collections::BinaryHeap; use heed::types::{Str, ByteSlice}; - use crate::facet::FacetType; - use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; let Index { env: _env, @@ -285,25 +325,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { let facet_name = fields_ids_map.name(field_id).unwrap(); - let iter = facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; - let iter = match field_type { - FacetType::String => { - let iter = iter.remap_types::() - .map(|r| r.map(|((_, k), v)| (k.to_string(), v))); - Box::new(iter) as Box> - }, - FacetType::Float => { - let iter = iter.remap_types::() - .map(|r| r.map(|((_, k), v)| (k.to_string(), v))); - Box::new(iter) - }, - FacetType::Integer => { - let iter = iter.remap_types::() - .map(|r| r.map(|((_, k), v)| (k.to_string(), v))); - Box::new(iter) - }, - }; - for result in iter { + + let db = facet_field_id_value_docids.remap_data_type::(); + for result in facet_values_iter(rtxn, db, field_id, field_type)? 
{ let (fvalue, value) = result?; let key = format!("{} {}", facet_name, fvalue); heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name))); @@ -349,10 +373,6 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec anyhow::Result<()> { - use crate::facet::FacetType; - use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; - use heed::{BytesDecode, Error::Decoding}; - let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields(&rtxn)?; @@ -361,39 +381,12 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam let field_type = faceted_fields.get(&field_id) .with_context(|| format!("field {} is not faceted", field_name))?; - let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; - let iter = match field_type { - FacetType::String => { - let iter = iter - .map(|result| result.and_then(|(key, value)| { - let (_, key) = FacetValueStringCodec::bytes_decode(key).ok_or(Decoding)?; - Ok((key.to_string(), value)) - })); - Box::new(iter) as Box> - }, - FacetType::Float => { - let iter = iter - .map(|result| result.and_then(|(key, value)| { - let (_, key) = FacetValueF64Codec::bytes_decode(key).ok_or(Decoding)?; - Ok((key.to_string(), value)) - })); - Box::new(iter) - }, - FacetType::Integer => { - let iter = iter - .map(|result| result.and_then(|(key, value)| { - let (_, key) = FacetValueI64Codec::bytes_decode(key).ok_or(Decoding)?; - Ok((key.to_string(), value)) - })); - Box::new(iter) - }, - }; - let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["facet_value", "documents_ids"])?; - for result in iter { + let db = index.facet_field_id_value_docids; + for result in facet_values_iter(rtxn, db, field_id, *field_type)? 
{ let (value, docids) = result?; let docids = if debug { format!("{:?}", docids) diff --git a/src/update/index_documents/facet_level.rs b/src/update/index_documents/facet_level.rs new file mode 100644 index 000000000..0a87e21d6 --- /dev/null +++ b/src/update/index_documents/facet_level.rs @@ -0,0 +1,125 @@ +use std::fs::File; + +use grenad::{CompressionType, Reader, Writer, FileFuse}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesEncode, Error}; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::{facet::FacetLevelValueI64Codec, CboRoaringBitmapCodec}; +use crate::update::index_documents::{create_writer, writer_into_reader}; + +pub fn clear_field_levels( + wtxn: &mut heed::RwTxn, + db: heed::Database, + field_id: u8, +) -> heed::Result<()> +{ + let range = (field_id, 1, i64::MIN, i64::MIN)..=(field_id, u8::MAX, i64::MAX, i64::MAX); + db.remap_key_type::() + .delete_range(wtxn, &range) + .map(drop) +} + +pub fn compute_facet_levels( + rtxn: &heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + field_id: u8, + facet_type: FacetType, +) -> anyhow::Result> +{ + let last_level_size = 5; + let number_of_levels = 5; + let first_level_size = db.prefix_iter(rtxn, &[field_id])? + .remap_types::() + .fold(Ok(0u64), |count, result| result.and(count).map(|c| c + 1))?; + + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(compression_type, compression_level, file) + })?; + + let level_0_range = (field_id, 0, i64::MIN, i64::MIN)..=(field_id, 0, i64::MAX, i64::MAX); + let level_sizes_iter = levels_iterator(first_level_size, last_level_size, number_of_levels) + .enumerate() + .skip(1); + + // TODO we must not create levels with identical group sizes. 
+ for (level, size) in level_sizes_iter { + let level_entry_sizes = (first_level_size as f64 / size as f64).ceil() as usize; + let mut left = 0; + let mut right = 0; + let mut group_docids = RoaringBitmap::new(); + + let db = db.remap_key_type::(); + for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + let ((_field_id, _level, value, _right), docids) = result?; + + if i == 0 { + left = value; + } else if i % level_entry_sizes == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. + write_entry(&mut writer, field_id, level as u8, left, right, &group_docids)?; + + // We save the left bound for the new group and also reset the docids. + group_docids = RoaringBitmap::new(); + left = value; + } + + // The right bound is always the bound we run through. + group_docids.union_with(&docids); + right = value; + } + + if !group_docids.is_empty() { + write_entry(&mut writer, field_id, level as u8, left, right, &group_docids)?; + } + } + + writer_into_reader(writer, shrink_size) +} + +fn write_entry( + writer: &mut Writer, + field_id: u8, + level: u8, + left: i64, + right: i64, + ids: &RoaringBitmap, +) -> anyhow::Result<()> +{ + let key = (field_id, level, left, right); + let key = FacetLevelValueI64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} + +fn levels_iterator( + first_level_size: u64, // biggest level + last_level_size: u64, // smallest level + number_of_levels: u64, +) -> impl Iterator +{ + // Go look at the function definitions here: + // https://docs.rs/easer/0.2.1/easer/index.html + // https://easings.net/#easeOutExpo + fn ease_out_expo(t: f64, b: f64, c: f64, d: f64) -> f64 { + if t == d { + b + c + } else { + c * (-2.0_f64.powf(-10.0 * t / d) + 1.0) + b + } + } + + let b = last_level_size as f64; + let end = first_level_size as f64; + let c = end - b; 
+ let d = number_of_levels; + (0..=d).map(move |t| ((end + b) - ease_out_expo(t as f64, b, c, d as f64)) as u64) +} diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index fe51c6b2b..f6587c3a8 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -14,6 +14,7 @@ use memmap::Mmap; use rayon::prelude::*; use rayon::ThreadPool; +use crate::facet::FacetType; use crate::index::Index; use crate::update::UpdateIndexingStep; use self::store::{Store, Readers}; @@ -22,10 +23,12 @@ use self::merge_function::{ docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, }; pub use self::transform::{Transform, TransformOutput}; +pub use self::facet_level::{clear_field_levels, compute_facet_levels}; use crate::MergeFn; use super::UpdateBuilder; +mod facet_level; mod merge_function; mod store; mod transform; @@ -327,7 +330,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { enum DatabaseType { Main, WordDocids, - FacetValuesDocids, + FacetLevel0ValuesDocids, } let faceted_fields = self.index.faceted_fields(self.wtxn)?; @@ -427,7 +430,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { (DatabaseType::Main, main_readers, main_merge as MergeFn), (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), ( - DatabaseType::FacetValuesDocids, + DatabaseType::FacetLevel0ValuesDocids, facet_field_value_docids_readers, facet_field_value_docids_merge, ), @@ -475,6 +478,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the external documents ids into the main database. self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; + // We get the faceted fields to be able to create the facet levels. + let faceted_fields = self.index.faceted_fields(self.wtxn)?; + // We merge the new documents ids with the existing ones. 
documents_ids.union_with(&new_documents_ids); documents_ids.union_with(&replaced_documents_ids); @@ -557,7 +563,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, - DatabaseType::FacetValuesDocids => { + DatabaseType::FacetLevel0ValuesDocids => { debug!("Writing the facet values docids into LMDB on disk..."); let db = *self.index.facet_field_id_value_docids.as_polymorph(); write_into_lmdb_database( @@ -577,6 +583,35 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); } + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + for (field_id, facet_type) in faceted_fields { + if facet_type == FacetType::String { continue } + + clear_field_levels( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + let content = compute_facet_levels( + self.wtxn, + self.index.facet_field_id_value_docids, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + field_id, + facet_type, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_field_id_value_docids.as_polymorph(), + content, + |_, _| anyhow::bail!("invalid facet level merging"), + WriteMethod::GetMergePut, + )?; + } + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 110ab7f25..289704b1a 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -19,7 +19,7 @@ use tempfile::tempfile; use crate::facet::FacetType; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; +use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::tokenizer::{simple_tokenizer, only_token}; use crate::update::UpdateIndexingStep; use 
crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId}; @@ -337,8 +337,8 @@ impl Store { for ((field_id, value), docids) in iter { let result = match value { String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), - Float(f) => FacetValueF64Codec::bytes_encode(&(field_id, *f)).map(Cow::into_owned), - Integer(i) => FacetValueI64Codec::bytes_encode(&(field_id, i)).map(Cow::into_owned), + Float(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), + Integer(i) => FacetLevelValueI64Codec::bytes_encode(&(field_id, 0, i, i)).map(Cow::into_owned), }; let key = result.context("could not serialize facet key")?; let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) diff --git a/src/update/settings.rs b/src/update/settings.rs index 141abcf00..cddd68ca3 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -412,7 +412,8 @@ mod tests { let rtxn = index.read_txn().unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap(); assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer }); - let count = index.facet_field_id_value_docids.len(&rtxn).unwrap(); + // Only count the field_id 0 and level 0 facet values. + let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count(); assert_eq!(count, 3); drop(rtxn); @@ -425,7 +426,8 @@ mod tests { wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); - let count = index.facet_field_id_value_docids.len(&rtxn).unwrap(); + // Only count the field_id 0 and level 0 facet values. 
+ let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count(); assert_eq!(count, 4); drop(rtxn); } From 7a6e6eb5e21f6d7e57b64da6f3eb7e06603f01f3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Nov 2020 14:50:32 +0100 Subject: [PATCH 08/30] Introduce a facets stats infos subcommand --- src/subcommand/infos.rs | 63 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 74b1505f9..535045392 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -89,6 +89,12 @@ enum Command { field_name: String, }, + /// Outputs some facets statistics for the given facet name. + FacetStats { + /// The field name in the document. + field_name: String, + }, + /// Outputs the total size of all the docid-word-positions keys and values. TotalDocidWordPositionsSize, @@ -165,6 +171,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) }, + FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { @@ -399,6 +406,62 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam Ok(wtr.flush()?) 
} +fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { + use crate::facet::FacetType; + use crate::heed_codec::facet::{ + FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, + }; + + let fields_ids_map = index.fields_ids_map(&rtxn)?; + let faceted_fields = index.faceted_fields(&rtxn)?; + + let field_id = fields_ids_map.id(&field_name) + .with_context(|| format!("field {} not found", field_name))?; + let field_type = faceted_fields.get(&field_id) + .with_context(|| format!("field {} is not faceted", field_name))?; + + let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; + let iter = match field_type { + FacetType::String => { + let iter = iter.remap_key_type::() + .map(|r| r.map(|_| 0u8)); + Box::new(iter) as Box> + }, + FacetType::Float => { + let iter = iter.remap_key_type::() + .map(|r| r.map(|((_, level, _, _), _)| level)); + Box::new(iter) + }, + FacetType::Integer => { + let iter = iter.remap_key_type::() + .map(|r| r.map(|((_, level, _, _), _)| level)); + Box::new(iter) + }, + }; + + println!("The database {:?} facet stats", field_name); + + let mut level_size = 0; + let mut current_level = None; + for result in iter { + let level = result?; + if let Some(current) = current_level { + if current != level { + println!("\tnumber of groups at level {}: {}", current, level_size); + level_size = 0; + } + } + current_level = Some(level); + level_size += 1; + } + + if let Some(current) = current_level { + println!("\tnumber of groups at level {}: {}", current, level_size); + } + + Ok(()) +} + fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { use std::fs::File; use std::io::Write as _; From 45e0feab4e8e3dad6fb469c4223b6a189491c085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 17 Nov 2020 21:18:34 +0100 Subject: [PATCH 09/30] Speed up the facets stats infos subcommand --- src/subcommand/infos.rs | 7 
++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 535045392..3b9c174b8 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -407,6 +407,7 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam } fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { + use heed::types::ByteSlice; use crate::facet::FacetType; use crate::heed_codec::facet::{ FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, @@ -423,17 +424,17 @@ fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow: let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; let iter = match field_type { FacetType::String => { - let iter = iter.remap_key_type::() + let iter = iter.remap_types::() .map(|r| r.map(|_| 0u8)); Box::new(iter) as Box> }, FacetType::Float => { - let iter = iter.remap_key_type::() + let iter = iter.remap_types::() .map(|r| r.map(|((_, level, _, _), _)| level)); Box::new(iter) }, FacetType::Integer => { - let iter = iter.remap_key_type::() + let iter = iter.remap_types::() .map(|r| r.map(|((_, level, _, _), _)| level)); Box::new(iter) }, From 67d4a1b3fc415748b6744adf1340ac5d83215050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 17 Nov 2020 21:19:25 +0100 Subject: [PATCH 10/30] Introduce a new update for the facet levels --- http-ui/src/main.rs | 57 ++++- src/update/facet_levels.rs | 247 ++++++++++++++++++++++ src/update/index_documents/facet_level.rs | 125 ----------- src/update/index_documents/mod.rs | 64 ++---- src/update/mod.rs | 2 + src/update/update_builder.rs | 17 +- 6 files changed, 344 insertions(+), 168 deletions(-) create mode 100644 src/update/facet_levels.rs delete mode 100644 src/update/index_documents/facet_level.rs diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 9671576f6..e03261641 100644 --- 
a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::fs::{File, create_dir_all}; use std::net::SocketAddr; +use std::num::NonZeroUsize; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -27,7 +28,7 @@ use warp::{Filter, http::Response}; use milli::tokenizer::{simple_tokenizer, TokenType}; use milli::update::UpdateIndexingStep::*; -use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; +use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat, EasingName}; use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -196,6 +197,7 @@ enum UpdateMeta { DocumentsAddition { method: String, format: String }, ClearDocuments, Settings(Settings), + FacetLevels(FacetLevels), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -231,6 +233,15 @@ struct Settings { faceted_attributes: Option>, } +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +struct FacetLevels { + last_level_size: Option, + number_of_levels: Option, + easing_function: Option, +} + // Any value that is present is considered Some value, including null. fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> where T: Deserialize<'de>, @@ -399,6 +410,36 @@ async fn main() -> anyhow::Result<()> { Ok(_count) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) } + }, + UpdateMeta::FacetLevels(levels) => { + // We must use the write transaction of the update here. 
+ let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.facet_levels(&mut wtxn, &index_cloned); + if let Some(value) = levels.last_level_size { + builder.last_level_size(value); + } + if let Some(value) = levels.number_of_levels { + builder.number_of_levels(value); + } + if let Some(value) = levels.easing_function { + let easing_name = if value.eq_ignore_ascii_case("expo") { + EasingName::Expo + } else if value.eq_ignore_ascii_case("quart") { + EasingName::Quart + } else if value.eq_ignore_ascii_case("circ") { + EasingName::Circ + } else if value.eq_ignore_ascii_case("linear") { + EasingName::Linear + } else { + panic!("Invalid easing function name") + }; + builder.easing_function(easing_name); + } + + match builder.execute() { + Ok(()) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()) + } } }; @@ -759,6 +800,19 @@ async fn main() -> anyhow::Result<()> { Ok(warp::reply()) }); + let update_store_cloned = update_store.clone(); + let update_status_sender_cloned = update_status_sender.clone(); + let change_facet_levels_route = warp::filters::method::post() + .and(warp::path!("facet-levels")) + .and(warp::body::json()) + .map(move |levels: FacetLevels| { + let meta = UpdateMeta::FacetLevels(levels); + let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); + let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); + eprintln!("update {} registered", update_id); + warp::reply() + }); + let update_ws_route = warp::ws() .and(warp::path!("updates" / "ws")) .map(move |ws: warp::ws::Ws| { @@ -807,6 +861,7 @@ async fn main() -> anyhow::Result<()> { .or(indexing_json_stream_route) .or(clearing_route) .or(change_settings_route) + .or(change_facet_levels_route) .or(update_ws_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; diff --git a/src/update/facet_levels.rs b/src/update/facet_levels.rs new file mode 100644 index 000000000..bc8f7121f --- /dev/null +++ 
b/src/update/facet_levels.rs @@ -0,0 +1,247 @@ +use std::fs::File; +use std::num::NonZeroUsize; + +use grenad::{CompressionType, Reader, Writer, FileFuse}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesEncode, Error}; +use itertools::Itertools; +use log::debug; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::{facet::FacetLevelValueI64Codec, CboRoaringBitmapCodec}; +use crate::Index; +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; + +#[derive(Debug, Copy, Clone)] +pub enum EasingName { + Expo, + Quart, + Circ, + Linear, +} + +pub struct FacetLevels<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + number_of_levels: NonZeroUsize, + last_level_size: NonZeroUsize, + easing_function: EasingName, +} + +impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> FacetLevels<'t, 'u, 'i> { + FacetLevels { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + number_of_levels: NonZeroUsize::new(5).unwrap(), + last_level_size: NonZeroUsize::new(5).unwrap(), + easing_function: EasingName::Expo, + } + } + + pub fn number_of_levels(&mut self, value: NonZeroUsize) -> &mut Self { + self.number_of_levels = value; + self + } + + pub fn last_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.last_level_size = value; + self + } + + pub fn easing_function(&mut self, value: EasingName) -> &mut Self { + self.easing_function = value; + self + } + + pub fn execute(self) -> anyhow::Result<()> { + // We get the faceted fields to be able to create the facet levels. 
+ let faceted_fields = self.index.faceted_fields(self.wtxn)?; + + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + for (field_id, facet_type) in faceted_fields { + if facet_type == FacetType::String { continue } + + clear_field_levels( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + let content = compute_facet_levels( + self.wtxn, + self.index.facet_field_id_value_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.last_level_size, + self.number_of_levels, + self.easing_function, + field_id, + facet_type, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_field_id_value_docids.as_polymorph(), + content, + |_, _| anyhow::bail!("invalid facet level merging"), + WriteMethod::GetMergePut, + )?; + } + + Ok(()) + } +} + +fn clear_field_levels( + wtxn: &mut heed::RwTxn, + db: heed::Database, + field_id: u8, +) -> heed::Result<()> +{ + let range = (field_id, 1, i64::MIN, i64::MIN)..=(field_id, u8::MAX, i64::MAX, i64::MAX); + db.remap_key_type::() + .delete_range(wtxn, &range) + .map(drop) +} + +fn compute_facet_levels( + rtxn: &heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + last_level_size: NonZeroUsize, + number_of_levels: NonZeroUsize, + easing_function: EasingName, + field_id: u8, + facet_type: FacetType, +) -> anyhow::Result> +{ + let first_level_size = db.prefix_iter(rtxn, &[field_id])? + .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. 
+ let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(compression_type, compression_level, file) + })?; + + let level_0_range = (field_id, 0, i64::MIN, i64::MIN)..=(field_id, 0, i64::MAX, i64::MAX); + let level_sizes_iter = + levels_iterator(first_level_size, last_level_size.get(), number_of_levels.get(), easing_function) + .map(|size| (first_level_size as f64 / size as f64).ceil() as usize) + .unique() + .enumerate() + .skip(1); + + // TODO we must not create levels with identical group sizes. + for (level, level_entry_sizes) in level_sizes_iter { + let mut left = 0; + let mut right = 0; + let mut group_docids = RoaringBitmap::new(); + + dbg!(level, level_entry_sizes, first_level_size); + + let db = db.remap_key_type::(); + for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + let ((_field_id, _level, value, _right), docids) = result?; + + if i == 0 { + left = value; + } else if i % level_entry_sizes == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. + write_entry(&mut writer, field_id, level as u8, left, right, &group_docids)?; + + // We save the left bound for the new group and also reset the docids. + group_docids = RoaringBitmap::new(); + left = value; + } + + // The right bound is always the bound we run through. 
+ group_docids.union_with(&docids); + right = value; + } + + if !group_docids.is_empty() { + write_entry(&mut writer, field_id, level as u8, left, right, &group_docids)?; + } + } + + writer_into_reader(writer, shrink_size) +} + +fn write_entry( + writer: &mut Writer, + field_id: u8, + level: u8, + left: i64, + right: i64, + ids: &RoaringBitmap, +) -> anyhow::Result<()> +{ + let key = (field_id, level, left, right); + let key = FacetLevelValueI64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} + +fn levels_iterator( + first_level_size: usize, // biggest level + last_level_size: usize, // smallest level + number_of_levels: usize, + easing_function: EasingName, +) -> impl Iterator +{ + let easing_function = match easing_function { + EasingName::Expo => ease_out_expo, + EasingName::Quart => ease_out_quart, + EasingName::Circ => ease_out_circ, + EasingName::Linear => ease_out_linear, + }; + + let b = last_level_size as f64; + let end = first_level_size as f64; + let c = end - b; + let d = number_of_levels; + (0..=d).map(move |t| ((end + b) - easing_function(t as f64, b, c, d as f64)) as usize) +} + +// Go look at the function definitions here: +// https://docs.rs/easer/0.2.1/easer/index.html +// https://easings.net/#easeOutExpo +fn ease_out_expo(t: f64, b: f64, c: f64, d: f64) -> f64 { + if t == d { + b + c + } else { + c * (-2.0_f64.powf(-10.0 * t / d) + 1.0) + b + } +} + +// https://easings.net/#easeOutCirc +fn ease_out_circ(t: f64, b: f64, c: f64, d: f64) -> f64 { + let t = t / d - 1.0; + c * (1.0 - t * t).sqrt() + b +} + +// https://easings.net/#easeOutQuart +fn ease_out_quart(t: f64, b: f64, c: f64, d: f64) -> f64 { + let t = t / d - 1.0; + -c * ((t * t * t * t) - 1.0) + b +} + +fn ease_out_linear(t: f64, b: f64, c: f64, d: f64) -> f64 { + c * t / d + b +} diff --git a/src/update/index_documents/facet_level.rs 
b/src/update/index_documents/facet_level.rs deleted file mode 100644 index 0a87e21d6..000000000 --- a/src/update/index_documents/facet_level.rs +++ /dev/null @@ -1,125 +0,0 @@ -use std::fs::File; - -use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesEncode, Error}; -use roaring::RoaringBitmap; - -use crate::facet::FacetType; -use crate::heed_codec::{facet::FacetLevelValueI64Codec, CboRoaringBitmapCodec}; -use crate::update::index_documents::{create_writer, writer_into_reader}; - -pub fn clear_field_levels( - wtxn: &mut heed::RwTxn, - db: heed::Database, - field_id: u8, -) -> heed::Result<()> -{ - let range = (field_id, 1, i64::MIN, i64::MIN)..=(field_id, u8::MAX, i64::MAX, i64::MAX); - db.remap_key_type::() - .delete_range(wtxn, &range) - .map(drop) -} - -pub fn compute_facet_levels( - rtxn: &heed::RoTxn, - db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - shrink_size: Option, - field_id: u8, - facet_type: FacetType, -) -> anyhow::Result> -{ - let last_level_size = 5; - let number_of_levels = 5; - let first_level_size = db.prefix_iter(rtxn, &[field_id])? - .remap_types::() - .fold(Ok(0u64), |count, result| result.and(count).map(|c| c + 1))?; - - // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // therefore we write the facet levels entries into a grenad file before transfering them. - let mut writer = tempfile::tempfile().and_then(|file| { - create_writer(compression_type, compression_level, file) - })?; - - let level_0_range = (field_id, 0, i64::MIN, i64::MIN)..=(field_id, 0, i64::MAX, i64::MAX); - let level_sizes_iter = levels_iterator(first_level_size, last_level_size, number_of_levels) - .enumerate() - .skip(1); - - // TODO we must not create levels with identical group sizes. 
- for (level, size) in level_sizes_iter { - let level_entry_sizes = (first_level_size as f64 / size as f64).ceil() as usize; - let mut left = 0; - let mut right = 0; - let mut group_docids = RoaringBitmap::new(); - - let db = db.remap_key_type::(); - for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { - let ((_field_id, _level, value, _right), docids) = result?; - - if i == 0 { - left = value; - } else if i % level_entry_sizes == 0 { - // we found the first bound of the next group, we must store the left - // and right bounds associated with the docids. - write_entry(&mut writer, field_id, level as u8, left, right, &group_docids)?; - - // We save the left bound for the new group and also reset the docids. - group_docids = RoaringBitmap::new(); - left = value; - } - - // The right bound is always the bound we run through. - group_docids.union_with(&docids); - right = value; - } - - if !group_docids.is_empty() { - write_entry(&mut writer, field_id, level as u8, left, right, &group_docids)?; - } - } - - writer_into_reader(writer, shrink_size) -} - -fn write_entry( - writer: &mut Writer, - field_id: u8, - level: u8, - left: i64, - right: i64, - ids: &RoaringBitmap, -) -> anyhow::Result<()> -{ - let key = (field_id, level, left, right); - let key = FacetLevelValueI64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) -} - -fn levels_iterator( - first_level_size: u64, // biggest level - last_level_size: u64, // smallest level - number_of_levels: u64, -) -> impl Iterator -{ - // Go look at the function definitions here: - // https://docs.rs/easer/0.2.1/easer/index.html - // https://easings.net/#easeOutExpo - fn ease_out_expo(t: f64, b: f64, c: f64, d: f64) -> f64 { - if t == d { - b + c - } else { - c * (-2.0_f64.powf(-10.0 * t / d) + 1.0) + b - } - } - - let b = last_level_size as f64; - let end = first_level_size as f64; - let c = end - b; 
- let d = number_of_levels; - (0..=d).map(move |t| ((end + b) - ease_out_expo(t as f64, b, c, d as f64)) as u64) -} diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index f6587c3a8..50f5336fc 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; +use std::num::NonZeroUsize; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -14,32 +15,29 @@ use memmap::Mmap; use rayon::prelude::*; use rayon::ThreadPool; -use crate::facet::FacetType; use crate::index::Index; -use crate::update::UpdateIndexingStep; +use crate::update::{FacetLevels, UpdateIndexingStep}; use self::store::{Store, Readers}; use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, }; pub use self::transform::{Transform, TransformOutput}; -pub use self::facet_level::{clear_field_levels, compute_facet_levels}; use crate::MergeFn; use super::UpdateBuilder; -mod facet_level; mod merge_function; mod store; mod transform; #[derive(Debug, Copy, Clone)] -enum WriteMethod { +pub enum WriteMethod { Append, GetMergePut, } -fn create_writer(typ: CompressionType, level: Option, file: File) -> io::Result> { +pub fn create_writer(typ: CompressionType, level: Option, file: File) -> io::Result> { let mut builder = Writer::builder(); builder.compression_type(typ); if let Some(level) = level { @@ -48,7 +46,7 @@ fn create_writer(typ: CompressionType, level: Option, file: File) -> io::Re builder.build(file) } -fn create_sorter( +pub fn create_sorter( merge: MergeFn, chunk_compression_type: CompressionType, chunk_compression_level: Option, @@ -74,7 +72,7 @@ fn create_sorter( builder.build() } -fn writer_into_reader(writer: Writer, shrink_size: Option) -> anyhow::Result> { +pub fn writer_into_reader(writer: 
Writer, shrink_size: Option) -> anyhow::Result> { let mut file = writer.into_inner()?; file.seek(SeekFrom::Start(0))?; let file = if let Some(shrink_size) = shrink_size { @@ -85,13 +83,13 @@ fn writer_into_reader(writer: Writer, shrink_size: Option) -> anyhow: Reader::new(file).map_err(Into::into) } -fn merge_readers(sources: Vec>, merge: MergeFn) -> Merger { +pub fn merge_readers(sources: Vec>, merge: MergeFn) -> Merger { let mut builder = Merger::builder(merge); builder.extend(sources); builder.build() } -fn merge_into_lmdb_database( +pub fn merge_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, sources: Vec>, @@ -135,7 +133,7 @@ fn merge_into_lmdb_database( Ok(()) } -fn write_into_lmdb_database( +pub fn write_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, mut reader: Reader, @@ -210,6 +208,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + facet_number_of_levels: Option, + facet_last_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, @@ -228,6 +228,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_level: None, chunk_fusing_shrink_size: None, thread_pool: None, + facet_number_of_levels: None, + facet_last_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, autogenerate_docids: true, @@ -478,9 +480,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the external documents ids into the main database. self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - // We get the faceted fields to be able to create the facet levels. - let faceted_fields = self.index.faceted_fields(self.wtxn)?; - // We merge the new documents ids with the existing ones. 
documents_ids.union_with(&new_documents_ids); documents_ids.union_with(&replaced_documents_ids); @@ -583,34 +582,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); } - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - for (field_id, facet_type) in faceted_fields { - if facet_type == FacetType::String { continue } - - clear_field_levels( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; - - let content = compute_facet_levels( - self.wtxn, - self.index.facet_field_id_value_docids, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - field_id, - facet_type, - )?; - - write_into_lmdb_database( - self.wtxn, - *self.index.facet_field_id_value_docids.as_polymorph(), - content, - |_, _| anyhow::bail!("invalid facet level merging"), - WriteMethod::GetMergePut, - )?; + let mut builder = FacetLevels::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.facet_number_of_levels { + builder.number_of_levels(value); } + if let Some(value) = self.facet_last_level_size { + builder.last_level_size(value); + } + builder.execute()?; debug_assert_eq!(database_count, total_databases); diff --git a/src/update/mod.rs b/src/update/mod.rs index 75724269a..87035065c 100644 --- a/src/update/mod.rs +++ b/src/update/mod.rs @@ -1,6 +1,7 @@ mod available_documents_ids; mod clear_documents; mod delete_documents; +mod facet_levels; mod index_documents; mod settings; mod update_builder; @@ -11,6 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat}; +pub use self::facet_levels::{FacetLevels, EasingName}; pub 
use self::settings::Settings; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; diff --git a/src/update/update_builder.rs b/src/update/update_builder.rs index 67ea04bfc..8f7f1a0a8 100644 --- a/src/update/update_builder.rs +++ b/src/update/update_builder.rs @@ -2,7 +2,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings}; +use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, FacetLevels}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -118,4 +118,19 @@ impl<'a> UpdateBuilder<'a> { builder } + + pub fn facet_levels<'t, 'u, 'i>( + self, + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> FacetLevels<'t, 'u, 'i> + { + let mut builder = FacetLevels::new(wtxn, index); + + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + + builder + } } From 7d67c9e2e73db41eb614a61583904c6998a38619 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Nov 2020 14:50:32 +0100 Subject: [PATCH 11/30] Improve the facet search algorithm performances --- src/search.rs | 163 +++++++++++++++++++++++++++++++------------------- 1 file changed, 101 insertions(+), 62 deletions(-) diff --git a/src/search.rs b/src/search.rs index 3338d5222..cedca1085 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::fmt; +use std::ops::Bound::{self, Unbounded, Included, Excluded}; use anyhow::{bail, ensure, Context}; use fst::{IntoStreamer, Streamer}; @@ -12,7 +13,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::{facet::FacetLevelValueI64Codec, CboRoaringBitmapCodec}; +use crate::heed_codec::facet::FacetLevelValueI64Codec; use 
crate::mdfs::Mdfs; use crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, DocumentId}; @@ -226,6 +227,95 @@ impl<'a> Search<'a> { candidates } + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_levels( + &self, + field_id: u8, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> anyhow::Result<()> + { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. + (Included(l), Included(r)) if l == r && level > 0 => { + return self.explore_facet_levels(field_id, 0, left, right, output); + }, + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + + let mut left_found = None; + let mut right_found = None; + + // We must create a custom iterator to be able to iterate over the + // requested range as the range iterator cannot express some conditions. + let left_bound = match left { + Included(left) => Included((field_id, level, left, i64::MIN)), + Excluded(left) => Excluded((field_id, level, left, i64::MIN)), + Unbounded => Unbounded, + }; + let right_bound = Included((field_id, level, i64::MAX, i64::MAX)); + let db = self.index.facet_field_id_value_docids.remap_key_type::(); + let iter = db + .range(self.rtxn, &(left_bound, right_bound))? 
+ .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { + match right { + Included(right) => *r <= right, + Excluded(right) => *r < right, + Unbounded => true, + } + })); + + debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + for (i, result) in iter.enumerate() { + let ((_fid, _level, l, r), docids) = result?; + debug!("{} to {} (level {}) found {} documents", l, r, _level, docids.len()); + output.union_with(&docids); + // We save the leftest and rightest bounds we actually found at this level. + if i == 0 { left_found = Some(l); } + right_found = Some(r); + } + + // Can we go deeper? + let deeper_level = match level.checked_sub(1) { + Some(level) => level, + None => return Ok(()), + }; + + // We must refine the left and right bounds of this range by retrieving the + // missing part in a deeper level. + match left_found.zip(right_found) { + Some((left_found, right_found)) => { + // If the bound is satisfied we avoid calling this function again. + if !matches!(left, Included(l) if l == left_found) { + let sub_right = Excluded(left_found); + debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); + self.explore_facet_levels(field_id, deeper_level, left, sub_right, output)?; + } + if !matches!(right, Included(r) if r == right_found) { + let sub_left = Excluded(right_found); + debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); + self.explore_facet_levels(field_id, deeper_level, sub_left, right, output)?; + } + }, + None => { + // If we found nothing at this level it means that we must find + // the same bounds but at a deeper, more precise level. + self.explore_facet_levels(field_id, deeper_level, left, right, output)?; + }, + } + + Ok(()) + } + pub fn execute(&self) -> anyhow::Result { let limit = self.limit; let fst = self.index.words_fst(self.rtxn)?; @@ -239,64 +329,11 @@ impl<'a> Search<'a> { // We create the original candidates with the facet conditions results. 
let facet_candidates = match self.facet_condition { Some(FacetCondition::Operator(fid, operator)) => { - use std::ops::Bound::{self, Included, Excluded}; use FacetOperator::*; - fn explore_facet_levels( - rtxn: &heed::RoTxn, - db: &heed::Database, - field_id: u8, - level: u8, - left: Bound, - right: Bound, - candidates: &mut RoaringBitmap, - ) -> anyhow::Result<()> - { - let mut left_found = left; - let mut right_found = right; - - let range = { - let left = match left { - Included(left) => Included((field_id, level, left, i64::MIN)), - Excluded(left) => Excluded((field_id, level, left, i64::MIN)), - Bound::Unbounded => Bound::Unbounded, - }; - let right = Included((field_id, level, i64::MAX, i64::MAX)); - (left, right) - }; - - for (i, result) in db.range(rtxn, &range)?.enumerate() { - let ((_fid, _level, l, r), docids) = result?; - match right { - Included(right) if r > right => break, - Excluded(right) if r >= right => break, - _ => (), - } - - eprintln!("{} to {} (level {})", l, r, _level); - candidates.union_with(&docids); - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { left_found = Excluded(l); } - right_found = Excluded(r); - } - - // Can we go deeper? - let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), - }; - - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - // TODO we must avoid going at deeper when the bounds are already satisfied. - explore_facet_levels(rtxn, db, field_id, deeper_level, left, left_found, candidates)?; - explore_facet_levels(rtxn, db, field_id, deeper_level, right_found, right, candidates)?; - - Ok(()) - } - - // Make sure we always bound the ranges with the field id, as the facets - // values are all in the same database and prefixed by the field id. 
+ // Make sure we always bound the ranges with the field id and the level, + // as the facets values are all in the same database and prefixed by the + // field id and the level. let (left, right) = match operator { GreaterThan(val) => (Excluded(val), Included(i64::MAX)), GreaterThanOrEqual(val) => (Included(val), Included(i64::MAX)), @@ -316,7 +353,7 @@ impl<'a> Search<'a> { // the first entry of it which corresponds to the last key of our field id. let db = db.remap_data_type::(); match db.get_lower_than(self.rtxn, &(next_fid, 0, i64::MIN, i64::MIN))? { - Some(((id, level, _left, _right), _docids)) if fid == id => Some(level), + Some(((id, level, ..), _)) if fid == id => Some(level), _ => None, } }, @@ -324,7 +361,7 @@ impl<'a> Search<'a> { // If we can't generate a bigger field id, it must be equal to 255 and // therefore the last key of the database must be the one we want. match db.remap_data_type::().last(self.rtxn)? { - Some(((id, level, _left, _right), _docids)) if fid == id => Some(level), + Some(((id, level, ..), _)) if fid == id => Some(level), _ => None, } }, @@ -332,9 +369,9 @@ impl<'a> Search<'a> { match biggest_level { Some(level) => { - let mut candidates = RoaringBitmap::new(); - explore_facet_levels(self.rtxn, &db, fid, level, left, right, &mut candidates)?; - Some(candidates) + let mut output = RoaringBitmap::new(); + self.explore_facet_levels(fid, level, left, right, &mut output)?; + Some(output) }, None => None, } @@ -342,6 +379,8 @@ impl<'a> Search<'a> { None => None, }; + debug!("facet candidates: {:?}", facet_candidates); + let (candidates, derived_words) = match (facet_candidates, derived_words) { (Some(mut facet_candidates), Some(derived_words)) => { let words_candidates = Self::compute_candidates(&derived_words); From ced0c29c5697e1241944c1d4f3ee34c7ef6e4602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Nov 2020 16:00:23 +0100 Subject: [PATCH 12/30] Simplify getting the biggest level of a facet field --- 
src/search.rs | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/search.rs b/src/search.rs index cedca1085..e7100e148 100644 --- a/src/search.rs +++ b/src/search.rs @@ -347,25 +347,12 @@ impl<'a> Search<'a> { .facet_field_id_value_docids .remap_key_type::(); - let biggest_level = match fid.checked_add(1) { - Some(next_fid) => { - // If we are able to find the next field id we ask the key that is before - // the first entry of it which corresponds to the last key of our field id. - let db = db.remap_data_type::(); - match db.get_lower_than(self.rtxn, &(next_fid, 0, i64::MIN, i64::MIN))? { - Some(((id, level, ..), _)) if fid == id => Some(level), - _ => None, - } - }, - None => { - // If we can't generate a bigger field id, it must be equal to 255 and - // therefore the last key of the database must be the one we want. - match db.remap_data_type::().last(self.rtxn)? { - Some(((id, level, ..), _)) if fid == id => Some(level), - _ => None, - } - }, - }; + // Ask for the biggest value that can exist for this specific field, if it exists + // that's fine if it don't, the value just before will be returned instead. + let biggest_level = db + .remap_data_type::() + .get_lower_than_or_equal_to(self.rtxn, &(fid, u8::MAX, i64::MAX, i64::MAX))? 
+ .and_then(|((id, level, _, _), _)| if id == fid { Some(level) } else { None }); match biggest_level { Some(level) => { From 9e2cbe3362710120833c7ba8491cdff3a9b0225c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Nov 2020 18:40:08 +0100 Subject: [PATCH 13/30] Improve the FacetLevelF64 serialization --- .../facet/facet_level_value_f64_codec.rs | 56 +++++++++++++------ 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/src/heed_codec/facet/facet_level_value_f64_codec.rs b/src/heed_codec/facet/facet_level_value_f64_codec.rs index 55fd4b132..1ee8e6bf3 100644 --- a/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ b/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -13,8 +13,14 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { let (field_id, bytes) = bytes.split_first()?; let (level, bytes) = bytes.split_first()?; - let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; - let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; + let (left, right) = if *level != 0 { + let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; + let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; + (left, right) + } else { + let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; + (left, left) + }; Some((*field_id, *level, left, right)) } @@ -26,24 +32,38 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { let mut buffer = [0u8; 32]; - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); + let len = if *level != 0 { + // Write the globally ordered floats. 
+ let bytes = f64_into_bytes(*left)?; + buffer[..8].copy_from_slice(&bytes[..]); - let bytes = f64_into_bytes(*right)?; - buffer[8..16].copy_from_slice(&bytes[..]); + let bytes = f64_into_bytes(*right)?; + buffer[8..16].copy_from_slice(&bytes[..]); - // Then the f64 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[16..24].copy_from_slice(&bytes[..]); + // Then the f64 values just to be able to read them back. + let bytes = left.to_be_bytes(); + buffer[16..24].copy_from_slice(&bytes[..]); - let bytes = right.to_be_bytes(); - buffer[24..].copy_from_slice(&bytes[..]); + let bytes = right.to_be_bytes(); + buffer[24..].copy_from_slice(&bytes[..]); - let mut bytes = Vec::with_capacity(buffer.len() + 2); + 32 // length + } else { + // Write the globally ordered floats. + let bytes = f64_into_bytes(*left)?; + buffer[..8].copy_from_slice(&bytes[..]); + + // Then the f64 values just to be able to read them back. + let bytes = left.to_be_bytes(); + buffer[8..16].copy_from_slice(&bytes[..]); + + 16 // length + }; + + let mut bytes = Vec::with_capacity(len + 2); bytes.push(*field_id); bytes.push(*level); - bytes.extend_from_slice(&buffer[..]); + bytes.extend_from_slice(&buffer[..len]); Some(Cow::Owned(bytes)) } } @@ -55,8 +75,12 @@ mod tests { #[test] fn globally_ordered_f64() { - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, -32.0, 32.0)).unwrap(); + let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap(); let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 0, -32.0, 32.0)); + assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0)); + + let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap(); + let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); + assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0)); } } From 38c76754ef09ab709d58639a9761da637e66d236 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Nov 2020 21:52:08 +0100 Subject: [PATCH 14/30] Make the facet level search system generic on f64 and i64 --- Cargo.lock | 5 +- Cargo.toml | 1 + src/search.rs | 123 ++++++++++++++++++++++++++----------- src/update/facet_levels.rs | 113 ++++++++++++++++++++++------------ 4 files changed, 166 insertions(+), 76 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f32d2f133..330588564 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -617,6 +617,7 @@ dependencies = [ "maplit", "memmap", "near-proximity", + "num-traits", "obkv", "once_cell", "ordered-float", @@ -675,9 +676,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" dependencies = [ "autocfg", ] diff --git a/Cargo.toml b/Cargo.toml index bee4ebddc..2510cb245 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.3" memmap = "0.7.0" near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } +num-traits = "0.2.14" obkv = "0.1.0" once_cell = "1.4.0" ordered-float = "2.0.0" diff --git a/src/search.rs b/src/search.rs index e7100e148..e6fcefc62 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,7 +1,9 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; -use std::fmt; +use std::error::Error as StdError; +use std::fmt::{self, Debug}; use std::ops::Bound::{self, Unbounded, Included, Excluded}; +use std::str::FromStr; use anyhow::{bail, ensure, Context}; use fst::{IntoStreamer, Streamer}; @@ -9,11 +11,12 @@ use heed::types::DecodeIgnore; use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; 
use log::debug; +use num_traits::Bounded; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::FacetLevelValueI64Codec; +use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; use crate::mdfs::Mdfs; use crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, DocumentId}; @@ -24,20 +27,21 @@ static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); // TODO support also floats -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub enum FacetOperator { - GreaterThan(i64), - GreaterThanOrEqual(i64), - LowerThan(i64), - LowerThanOrEqual(i64), - Equal(i64), - Between(i64, i64), +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum FacetOperator { + GreaterThan(T), + GreaterThanOrEqual(T), + LowerThan(T), + LowerThanOrEqual(T), + Equal(T), + Between(T, T), } // TODO also support ANDs, ORs, NOTs. -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Copy, Clone, PartialEq)] pub enum FacetCondition { - Operator(u8, FacetOperator), + OperatorI64(u8, FacetOperator), + OperatorF64(u8, FacetOperator), } impl FacetCondition { @@ -48,7 +52,6 @@ impl FacetCondition { ) -> anyhow::Result> { use FacetCondition::*; - use FacetOperator::*; let fields_ids_map = index.fields_ids_map(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?; @@ -64,33 +67,44 @@ impl FacetCondition { let field_id = fields_ids_map.id(&field_name).with_context(|| format!("field {} not found", field_name))?; let field_type = faceted_fields.get(&field_id).with_context(|| format!("field {} is not faceted", field_name))?; - ensure!(*field_type == FacetType::Integer, "Only conditions on integer facets"); + match field_type { + FacetType::Integer => Self::parse_condition(iter).map(|op| Some(OperatorI64(field_id, op))), + FacetType::Float => Self::parse_condition(iter).map(|op| Some(OperatorF64(field_id, op))), + 
FacetType::String => bail!("invalid facet type"), + } + } + fn parse_condition<'a, T: FromStr>( + mut iter: impl Iterator, + ) -> anyhow::Result> + where T::Err: Send + Sync + StdError + 'static, + { + use FacetOperator::*; match iter.next() { Some(">") => { let param = iter.next().context("missing parameter")?; let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(Some(Operator(field_id, GreaterThan(value)))) + Ok(GreaterThan(value)) }, Some(">=") => { let param = iter.next().context("missing parameter")?; let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(Some(Operator(field_id, GreaterThanOrEqual(value)))) + Ok(GreaterThanOrEqual(value)) }, Some("<") => { let param = iter.next().context("missing parameter")?; let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(Some(Operator(field_id, LowerThan(value)))) + Ok(LowerThan(value)) }, Some("<=") => { let param = iter.next().context("missing parameter")?; let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(Some(Operator(field_id, LowerThanOrEqual(value)))) + Ok(LowerThanOrEqual(value)) }, Some("=") => { let param = iter.next().context("missing parameter")?; let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(Some(Operator(field_id, Equal(value)))) + Ok(Equal(value)) }, Some(otherwise) => { // BETWEEN or X TO Y (both inclusive) @@ -98,7 +112,7 @@ impl FacetCondition { ensure!(iter.next().map_or(false, |s| s.eq_ignore_ascii_case("to")), "TO keyword missing or invalid"); let next = iter.next().context("missing second TO parameter")?; let right_param = next.parse().with_context(|| format!("invalid second TO parameter ({:?})", next))?; - Ok(Some(Operator(field_id, Between(left_param, right_param)))) + Ok(Between(left_param, right_param)) }, None => bail!("missing facet filter first parameter"), } @@ -229,19 +243,23 @@ 
impl<'a> Search<'a> { /// Aggregates the documents ids that are part of the specified range automatically /// going deeper through the levels. - fn explore_facet_levels( + fn explore_facet_levels( &self, field_id: u8, level: u8, - left: Bound, - right: Bound, + left: Bound, + right: Bound, output: &mut RoaringBitmap, ) -> anyhow::Result<()> + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'a, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { match (left, right) { // If the request is an exact value we must go directly to the deepest level. (Included(l), Included(r)) if l == r && level > 0 => { - return self.explore_facet_levels(field_id, 0, left, right, output); + return self.explore_facet_levels::(field_id, 0, left, right, output); }, // lower TO upper when lower > upper must return no result (Included(l), Included(r)) if l > r => return Ok(()), @@ -257,12 +275,12 @@ impl<'a> Search<'a> { // We must create a custom iterator to be able to iterate over the // requested range as the range iterator cannot express some conditions. let left_bound = match left { - Included(left) => Included((field_id, level, left, i64::MIN)), - Excluded(left) => Excluded((field_id, level, left, i64::MIN)), + Included(left) => Included((field_id, level, left, T::min_value())), + Excluded(left) => Excluded((field_id, level, left, T::min_value())), Unbounded => Unbounded, }; - let right_bound = Included((field_id, level, i64::MAX, i64::MAX)); - let db = self.index.facet_field_id_value_docids.remap_key_type::(); + let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + let db = self.index.facet_field_id_value_docids.remap_key_type::(); let iter = db .range(self.rtxn, &(left_bound, right_bound))? 
.take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { @@ -277,7 +295,7 @@ impl<'a> Search<'a> { for (i, result) in iter.enumerate() { let ((_fid, _level, l, r), docids) = result?; - debug!("{} to {} (level {}) found {} documents", l, r, _level, docids.len()); + debug!("{:?} to {:?} (level {}) found {} documents", l, r, _level, docids.len()); output.union_with(&docids); // We save the leftest and rightest bounds we actually found at this level. if i == 0 { left_found = Some(l); } @@ -298,18 +316,18 @@ impl<'a> Search<'a> { if !matches!(left, Included(l) if l == left_found) { let sub_right = Excluded(left_found); debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); - self.explore_facet_levels(field_id, deeper_level, left, sub_right, output)?; + self.explore_facet_levels::(field_id, deeper_level, left, sub_right, output)?; } if !matches!(right, Included(r) if r == right_found) { let sub_left = Excluded(right_found); debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); - self.explore_facet_levels(field_id, deeper_level, sub_left, right, output)?; + self.explore_facet_levels::(field_id, deeper_level, sub_left, right, output)?; } }, None => { // If we found nothing at this level it means that we must find // the same bounds but at a deeper, more precise level. - self.explore_facet_levels(field_id, deeper_level, left, right, output)?; + self.explore_facet_levels::(field_id, deeper_level, left, right, output)?; }, } @@ -327,10 +345,10 @@ impl<'a> Search<'a> { }; // We create the original candidates with the facet conditions results. + use FacetOperator::*; let facet_candidates = match self.facet_condition { - Some(FacetCondition::Operator(fid, operator)) => { - use FacetOperator::*; - + // TODO make that generic over floats and integers. 
+ Some(FacetCondition::OperatorI64(fid, operator)) => { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. @@ -357,7 +375,40 @@ impl<'a> Search<'a> { match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - self.explore_facet_levels(fid, level, left, right, &mut output)?; + self.explore_facet_levels::(fid, level, left, right, &mut output)?; + Some(output) + }, + None => None, + } + }, + Some(FacetCondition::OperatorF64(fid, operator)) => { + // Make sure we always bound the ranges with the field id and the level, + // as the facets values are all in the same database and prefixed by the + // field id and the level. + let (left, right) = match operator { + GreaterThan(val) => (Excluded(val), Included(f64::MAX)), + GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)), + LowerThan(val) => (Included(f64::MIN), Excluded(val)), + LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)), + Equal(val) => (Included(val), Included(val)), + Between(left, right) => (Included(left), Included(right)), + }; + + let db = self.index + .facet_field_id_value_docids + .remap_key_type::(); + + // Ask for the biggest value that can exist for this specific field, if it exists + // that's fine if it don't, the value just before will be returned instead. + let biggest_level = db + .remap_data_type::() + .get_lower_than_or_equal_to(self.rtxn, &(fid, u8::MAX, f64::MAX, f64::MAX))? 
+ .and_then(|((id, level, _, _), _)| if id == fid { Some(level) } else { None }); + + match biggest_level { + Some(level) => { + let mut output = RoaringBitmap::new(); + self.explore_facet_levels::(fid, level, left, right, &mut output)?; Some(output) }, None => None, diff --git a/src/update/facet_levels.rs b/src/update/facet_levels.rs index bc8f7121f..4a7769b7a 100644 --- a/src/update/facet_levels.rs +++ b/src/update/facet_levels.rs @@ -6,10 +6,12 @@ use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; use itertools::Itertools; use log::debug; +use num_traits::{Bounded, Zero}; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::{facet::FacetLevelValueI64Codec, CboRoaringBitmapCodec}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; use crate::Index; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; @@ -68,26 +70,47 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); for (field_id, facet_type) in faceted_fields { - if facet_type == FacetType::String { continue } + let content = match facet_type { + FacetType::Integer => { + clear_field_levels::( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; - clear_field_levels( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; + compute_facet_levels::( + self.wtxn, + self.index.facet_field_id_value_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.last_level_size, + self.number_of_levels, + self.easing_function, + field_id, + )? 
+ }, + FacetType::Float => { + clear_field_levels::( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; - let content = compute_facet_levels( - self.wtxn, - self.index.facet_field_id_value_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.last_level_size, - self.number_of_levels, - self.easing_function, - field_id, - facet_type, - )?; + compute_facet_levels::( + self.wtxn, + self.index.facet_field_id_value_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.last_level_size, + self.number_of_levels, + self.easing_function, + field_id, + )? + }, + FacetType::String => continue, + }; write_into_lmdb_database( self.wtxn, @@ -102,20 +125,26 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { } } -fn clear_field_levels( - wtxn: &mut heed::RwTxn, +fn clear_field_levels<'t, T: 't, KC>( + wtxn: &'t mut heed::RwTxn, db: heed::Database, field_id: u8, ) -> heed::Result<()> +where + T: Copy + Bounded, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { - let range = (field_id, 1, i64::MIN, i64::MIN)..=(field_id, u8::MAX, i64::MAX, i64::MAX); - db.remap_key_type::() + let left = (field_id, 1, T::min_value(), T::min_value()); + let right = (field_id, u8::MAX, T::max_value(), T::max_value()); + let range = left..=right; + db.remap_key_type::() .delete_range(wtxn, &range) .map(drop) } -fn compute_facet_levels( - rtxn: &heed::RoTxn, +fn compute_facet_levels<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, db: heed::Database, compression_type: CompressionType, compression_level: Option, @@ -124,8 +153,11 @@ fn compute_facet_levels( number_of_levels: NonZeroUsize, easing_function: EasingName, field_id: u8, - facet_type: FacetType, ) -> anyhow::Result> +where + T: Copy + PartialEq + PartialOrd + Bounded + Zero, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> 
heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { let first_level_size = db.prefix_iter(rtxn, &[field_id])? .remap_types::() @@ -137,7 +169,12 @@ fn compute_facet_levels( create_writer(compression_type, compression_level, file) })?; - let level_0_range = (field_id, 0, i64::MIN, i64::MIN)..=(field_id, 0, i64::MAX, i64::MAX); + let level_0_range = { + let left = (field_id, 0, T::min_value(), T::min_value()); + let right = (field_id, 0, T::max_value(), T::max_value()); + left..=right + }; + let level_sizes_iter = levels_iterator(first_level_size, last_level_size.get(), number_of_levels.get(), easing_function) .map(|size| (first_level_size as f64 / size as f64).ceil() as usize) @@ -147,13 +184,11 @@ fn compute_facet_levels( // TODO we must not create levels with identical group sizes. for (level, level_entry_sizes) in level_sizes_iter { - let mut left = 0; - let mut right = 0; + let mut left = T::zero(); + let mut right = T::zero(); let mut group_docids = RoaringBitmap::new(); - dbg!(level, level_entry_sizes, first_level_size); - - let db = db.remap_key_type::(); + let db = db.remap_key_type::(); for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { let ((_field_id, _level, value, _right), docids) = result?; @@ -162,7 +197,7 @@ fn compute_facet_levels( } else if i % level_entry_sizes == 0 { // we found the first bound of the next group, we must store the left // and right bounds associated with the docids. - write_entry(&mut writer, field_id, level as u8, left, right, &group_docids)?; + write_entry::(&mut writer, field_id, level as u8, left, right, &group_docids)?; // We save the left bound for the new group and also reset the docids. 
group_docids = RoaringBitmap::new(); @@ -175,24 +210,26 @@ fn compute_facet_levels( } if !group_docids.is_empty() { - write_entry(&mut writer, field_id, level as u8, left, right, &group_docids)?; + write_entry::(&mut writer, field_id, level as u8, left, right, &group_docids)?; } } writer_into_reader(writer, shrink_size) } -fn write_entry( +fn write_entry( writer: &mut Writer, field_id: u8, level: u8, - left: i64, - right: i64, + left: T, + right: T, ids: &RoaringBitmap, ) -> anyhow::Result<()> +where + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { let key = (field_id, level, left, right); - let key = FacetLevelValueI64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) From 0694cc49164e34ae97e553e9605fd49fa69f749b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Nov 2020 11:17:53 +0100 Subject: [PATCH 15/30] Drastically speed up documents deletion updates --- src/update/delete_documents.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index 1913ac033..9924080f2 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -1,4 +1,5 @@ use fst::IntoStreamer; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; @@ -132,11 +133,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; if let Some((key, mut docids)) = iter.next().transpose()? 
{ if key == word.as_ref() { + let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { iter.del_current()?; *must_remove = true; - } else { + } else if docids.len() != previous_len { iter.put_current(key, &docids)?; } } @@ -168,14 +170,15 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We delete the documents ids that are under the pairs of words, // it is faster and use no memory to iterate over all the words pairs than // to compute the cartesian product of every words of the deleted documents. - let mut iter = word_pair_proximity_docids.iter_mut(self.wtxn)?; + let mut iter = word_pair_proximity_docids.remap_key_type::().iter_mut(self.wtxn)?; while let Some(result) = iter.next() { - let ((w1, w2, prox), mut docids) = result?; + let (bytes, mut docids) = result?; + let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { iter.del_current()?; - } else { - iter.put_current(&(w1, w2, prox), &docids)?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; } } @@ -185,10 +188,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; while let Some(result) = iter.next() { let (bytes, mut docids) = result?; + let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { iter.del_current()?; - } else { + } else if docids.len() != previous_len { iter.put_current(bytes, &docids)?; } } From 59ca4b9fe496c1cb83cdfa74ca63bebb41a7e8c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Nov 2020 11:18:52 +0100 Subject: [PATCH 16/30] Introduce a little bit of debug when deleting documents --- src/update/index_documents/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 50f5336fc..d48696e94 100644 --- a/src/update/index_documents/mod.rs 
+++ b/src/update/index_documents/mod.rs @@ -158,7 +158,7 @@ pub fn write_into_lmdb_database( match iter.next().transpose()? { Some((key, old_val)) if key == k => { let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).expect("merge failed"); + let val = merge(k, &vals)?; iter.put_current(k, &val)?; }, _ => { @@ -313,8 +313,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { thread_pool: self.thread_pool, }; let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; + debug!("documents to delete {:?}", replaced_documents_ids); deletion_builder.delete_documents(&replaced_documents_ids); - let _deleted_documents_count = deletion_builder.execute()?; + let deleted_documents_count = deletion_builder.execute()?; + debug!("{} documents actually deleted", deleted_documents_count); } let mmap; From 07a0c827907241099b1aacfaadc24b0606920b08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Nov 2020 15:53:13 +0100 Subject: [PATCH 17/30] Bump heed to 0.10.4 to use be able to lazily decode roaring bitmaps --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- src/search.rs | 14 +++++++++----- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 330588564..884bc19d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,9 +378,9 @@ dependencies = [ [[package]] name = "heed" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d2740ccbbfb2a6e6ff0c43e0fc14981ed668fb45be5a4e7b2bc03fc8cca3d3e" +checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616" dependencies = [ "byteorder", "heed-traits", diff --git a/Cargo.toml b/Cargo.toml index 2510cb245..b77cf4a44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ flate2 = "1.0.17" fst = "0.4.4" fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } -heed = { version 
= "0.10.3", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { version = "0.10.4", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index e8d30a5ce..b30fb95c2 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" [dependencies] anyhow = "1.0.28" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } -heed = "0.10.3" +heed = "0.10.4" memmap = "0.7.0" milli = { path = ".." } once_cell = "1.4.1" diff --git a/src/search.rs b/src/search.rs index e6fcefc62..3a781847f 100644 --- a/src/search.rs +++ b/src/search.rs @@ -280,8 +280,11 @@ impl<'a> Search<'a> { Unbounded => Unbounded, }; let right_bound = Included((field_id, level, T::max_value(), T::max_value())); - let db = self.index.facet_field_id_value_docids.remap_key_type::(); - let iter = db + // We also make sure that we don't decode the data before we are sure we must return it. + let iter = self.index + .facet_field_id_value_docids + .remap_key_type::() + .lazily_decode_data() .range(self.rtxn, &(left_bound, right_bound))? .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { match right { @@ -289,13 +292,14 @@ impl<'a> Search<'a> { Excluded(right) => *r < right, Unbounded => true, } - })); + })) + .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); debug!("Iterating between {:?} and {:?} (level {})", left, right, level); for (i, result) in iter.enumerate() { - let ((_fid, _level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, _level, docids.len()); + let ((_fid, level, l, r), docids) = result?; + debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); output.union_with(&docids); // We save the leftest and rightest bounds we actually found at this level. 
if i == 0 { left_found = Some(l); } From d40dd3e4dab98e820ba9c995ca58ad1e1c555717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Nov 2020 16:39:59 +0100 Subject: [PATCH 18/30] Reduce the amount of duplicated code to iterate over facet values --- src/subcommand/infos.rs | 93 ++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 3b9c174b8..4153b97b4 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -1,5 +1,5 @@ use std::path::PathBuf; -use std::{str, io}; +use std::{str, io, fmt}; use anyhow::Context; use heed::EnvOpenOptions; @@ -232,12 +232,17 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: Ok(wtr.flush()?) } -fn facet_values_iter<'txn, DC: 'txn>( +/// Helper function that converts the facet value key to a unique type +/// that can be used to log or display purposes. +fn facet_values_iter<'txn, DC: 'txn, T>( rtxn: &'txn heed::RoTxn, db: heed::Database, field_id: u8, facet_type: crate::facet::FacetType, -) -> heed::Result> + 'txn>> + string_fn: impl Fn(&str) -> T + 'txn, + float_fn: impl Fn(u8, f64, f64) -> T + 'txn, + integer_fn: impl Fn(u8, i64, i64) -> T + 'txn, +) -> heed::Result> + 'txn>> where DC: heed::BytesDecode<'txn>, { @@ -250,30 +255,34 @@ where match facet_type { FacetType::String => { let iter = iter.remap_key_type::() - .map(|r| r.map(|((_, key), value)| (key.to_string(), value))); + .map(move |r| r.map(|((_, key), value)| (string_fn(key), value))); Ok(Box::new(iter) as Box>) }, FacetType::Float => { let iter = iter.remap_key_type::() - .map(|r| r.map(|((_, level, left, right), value)| if level == 0 { - (format!("{} (level {})", left, level), value) - } else { - (format!("{} to {} (level {})", left, right, level), value) + .map(move |r| r.map(|((_, level, left, right), value)| { + (float_fn(level, left, right), value) })); Ok(Box::new(iter)) }, 
FacetType::Integer => { let iter = iter.remap_key_type::() - .map(|r| r.map(|((_, level, left, right), value)| if level == 0 { - (format!("{} (level {})", left, level), value) - } else { - (format!("{} to {} (level {})", left, right, level), value) + .map(move |r| r.map(|((_, level, left, right), value)| { + (integer_fn(level, left, right), value) })); Ok(Box::new(iter)) }, } } +fn facet_number_value_to_string(level: u8, left: T, right: T) -> String { + if level == 0 { + format!("{:?} (level {})", left, level) + } else { + format!("{:?} to {:?} (level {})", left, right, level) + } +} + fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { use std::cmp::Reverse; use std::collections::BinaryHeap; @@ -334,7 +343,17 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let facet_name = fields_ids_map.name(field_id).unwrap(); let db = facet_field_id_value_docids.remap_data_type::(); - for result in facet_values_iter(rtxn, db, field_id, field_type)? { + let iter = facet_values_iter( + rtxn, + db, + field_id, + field_type, + |key| key.to_owned(), + facet_number_value_to_string, + facet_number_value_to_string, + )?; + + for result in iter { let (fvalue, value) = result?; let key = format!("{} {}", facet_name, fvalue); heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name))); @@ -393,7 +412,17 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam wtr.write_record(&["facet_value", "documents_ids"])?; let db = index.facet_field_id_value_docids; - for result in facet_values_iter(rtxn, db, field_id, *field_type)? 
{ + let iter = facet_values_iter( + rtxn, + db, + field_id, + *field_type, + |key| key.to_owned(), + facet_number_value_to_string, + facet_number_value_to_string, + )?; + + for result in iter { let (value, docids) = result?; let docids = if debug { format!("{:?}", docids) @@ -407,12 +436,6 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam } fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { - use heed::types::ByteSlice; - use crate::facet::FacetType; - use crate::heed_codec::facet::{ - FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, - }; - let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields(&rtxn)?; @@ -421,31 +444,23 @@ fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow: let field_type = faceted_fields.get(&field_id) .with_context(|| format!("field {} is not faceted", field_name))?; - let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; - let iter = match field_type { - FacetType::String => { - let iter = iter.remap_types::() - .map(|r| r.map(|_| 0u8)); - Box::new(iter) as Box> - }, - FacetType::Float => { - let iter = iter.remap_types::() - .map(|r| r.map(|((_, level, _, _), _)| level)); - Box::new(iter) - }, - FacetType::Integer => { - let iter = iter.remap_types::() - .map(|r| r.map(|((_, level, _, _), _)| level)); - Box::new(iter) - }, - }; + let db = index.facet_field_id_value_docids; + let iter = facet_values_iter( + rtxn, + db, + field_id, + *field_type, + |_key| 0u8, + |level, _left, _right| level, + |level, _left, _right| level, + )?; println!("The database {:?} facet stats", field_name); let mut level_size = 0; let mut current_level = None; for result in iter { - let level = result?; + let (level, _) = result?; if let Some(current) = current_level { if current != level { println!("\tnumber of groups at level {}: {}", current, level_size); From 
531bd6ddc7d72e415c298cabb880b8198e3459cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Nov 2020 19:28:20 +0100 Subject: [PATCH 19/30] Make the facet operator evaluation code generic --- src/search.rs | 326 +++++++++++++++++++++++++------------------------- 1 file changed, 160 insertions(+), 166 deletions(-) diff --git a/src/search.rs b/src/search.rs index 3a781847f..5cd998ffe 100644 --- a/src/search.rs +++ b/src/search.rs @@ -7,7 +7,7 @@ use std::str::FromStr; use anyhow::{bail, ensure, Context}; use fst::{IntoStreamer, Streamer}; -use heed::types::DecodeIgnore; +use heed::types::{ByteSlice, DecodeIgnore}; use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; @@ -17,6 +17,7 @@ use roaring::bitmap::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; +use crate::heed_codec::CboRoaringBitmapCodec; use crate::mdfs::Mdfs; use crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, DocumentId}; @@ -80,6 +81,7 @@ impl FacetCondition { where T::Err: Send + Sync + StdError + 'static, { use FacetOperator::*; + match iter.next() { Some(">") => { let param = iter.next().context("missing parameter")?; @@ -117,6 +119,161 @@ impl FacetCondition { None => bail!("missing facet filter first parameter"), } } + + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. 
+ fn explore_facet_levels<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + field_id: u8, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> anyhow::Result<()> + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. + (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_levels::(rtxn, db, field_id, 0, left, right, output); + }, + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + + let mut left_found = None; + let mut right_found = None; + + // We must create a custom iterator to be able to iterate over the + // requested range as the range iterator cannot express some conditions. + let left_bound = match left { + Included(left) => Included((field_id, level, left, T::min_value())), + Excluded(left) => Excluded((field_id, level, left, T::min_value())), + Unbounded => Unbounded, + }; + let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + // We also make sure that we don't decode the data before we are sure we must return it. + let iter = db + .remap_key_type::() + .lazily_decode_data() + .range(rtxn, &(left_bound, right_bound))? 
+ .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { + match right { + Included(right) => *r <= right, + Excluded(right) => *r < right, + Unbounded => true, + } + })) + .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); + + debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + for (i, result) in iter.enumerate() { + let ((_fid, level, l, r), docids) = result?; + debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); + output.union_with(&docids); + // We save the leftest and rightest bounds we actually found at this level. + if i == 0 { left_found = Some(l); } + right_found = Some(r); + } + + // Can we go deeper? + let deeper_level = match level.checked_sub(1) { + Some(level) => level, + None => return Ok(()), + }; + + // We must refine the left and right bounds of this range by retrieving the + // missing part in a deeper level. + match left_found.zip(right_found) { + Some((left_found, right_found)) => { + // If the bound is satisfied we avoid calling this function again. + if !matches!(left, Included(l) if l == left_found) { + let sub_right = Excluded(left_found); + debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, sub_right, output)?; + } + if !matches!(right, Included(r) if r == right_found) { + let sub_left = Excluded(right_found); + debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + } + }, + None => { + // If we found nothing at this level it means that we must find + // the same bounds but at a deeper, more precise level. 
+ Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, right, output)?; + }, + } + + Ok(()) + } + + fn evaluate_operator<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + field_id: u8, + operator: FacetOperator, + ) -> anyhow::Result + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + use FacetOperator::*; + + // Make sure we always bound the ranges with the field id and the level, + // as the facets values are all in the same database and prefixed by the + // field id and the level. + let (left, right) = match operator { + GreaterThan(val) => (Excluded(val), Included(T::max_value())), + GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), + LowerThan(val) => (Included(T::min_value()), Excluded(val)), + LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), + Equal(val) => (Included(val), Included(val)), + Between(left, right) => (Included(left), Included(right)), + }; + + // Ask for the biggest value that can exist for this specific field, if it exists + // that's fine if it don't, the value just before will be returned instead. + let biggest_level = db + .remap_types::() + .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))? 
+ .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); + + match biggest_level { + Some(level) => { + let mut output = RoaringBitmap::new(); + Self::explore_facet_levels::(rtxn, db, field_id, level, left, right, &mut output)?; + Ok(output) + }, + None => Ok(RoaringBitmap::new()), + } + } + + fn evaluate( + &self, + rtxn: &heed::RoTxn, + db: heed::Database, + ) -> anyhow::Result + { + match *self { + FacetCondition::OperatorI64(fid, operator) => { + Self::evaluate_operator::(rtxn, db, fid, operator) + }, + FacetCondition::OperatorF64(fid, operator) => { + Self::evaluate_operator::(rtxn, db, fid, operator) + } + } + } } pub struct Search<'a> { @@ -241,103 +398,6 @@ impl<'a> Search<'a> { candidates } - /// Aggregates the documents ids that are part of the specified range automatically - /// going deeper through the levels. - fn explore_facet_levels( - &self, - field_id: u8, - level: u8, - left: Bound, - right: Bound, - output: &mut RoaringBitmap, - ) -> anyhow::Result<()> - where - T: Copy + PartialEq + PartialOrd + Bounded + Debug, - KC: heed::BytesDecode<'a, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, - { - match (left, right) { - // If the request is an exact value we must go directly to the deepest level. - (Included(l), Included(r)) if l == r && level > 0 => { - return self.explore_facet_levels::(field_id, 0, left, right, output); - }, - // lower TO upper when lower > upper must return no result - (Included(l), Included(r)) if l > r => return Ok(()), - (Included(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Included(r)) if l >= r => return Ok(()), - (_, _) => (), - } - - let mut left_found = None; - let mut right_found = None; - - // We must create a custom iterator to be able to iterate over the - // requested range as the range iterator cannot express some conditions. 
- let left_bound = match left { - Included(left) => Included((field_id, level, left, T::min_value())), - Excluded(left) => Excluded((field_id, level, left, T::min_value())), - Unbounded => Unbounded, - }; - let right_bound = Included((field_id, level, T::max_value(), T::max_value())); - // We also make sure that we don't decode the data before we are sure we must return it. - let iter = self.index - .facet_field_id_value_docids - .remap_key_type::() - .lazily_decode_data() - .range(self.rtxn, &(left_bound, right_bound))? - .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { - match right { - Included(right) => *r <= right, - Excluded(right) => *r < right, - Unbounded => true, - } - })) - .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); - - debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - for (i, result) in iter.enumerate() { - let ((_fid, level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - output.union_with(&docids); - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { left_found = Some(l); } - right_found = Some(r); - } - - // Can we go deeper? - let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), - }; - - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - match left_found.zip(right_found) { - Some((left_found, right_found)) => { - // If the bound is satisfied we avoid calling this function again. 
- if !matches!(left, Included(l) if l == left_found) { - let sub_right = Excluded(left_found); - debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); - self.explore_facet_levels::(field_id, deeper_level, left, sub_right, output)?; - } - if !matches!(right, Included(r) if r == right_found) { - let sub_left = Excluded(right_found); - debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); - self.explore_facet_levels::(field_id, deeper_level, sub_left, right, output)?; - } - }, - None => { - // If we found nothing at this level it means that we must find - // the same bounds but at a deeper, more precise level. - self.explore_facet_levels::(field_id, deeper_level, left, right, output)?; - }, - } - - Ok(()) - } - pub fn execute(&self) -> anyhow::Result { let limit = self.limit; let fst = self.index.words_fst(self.rtxn)?; @@ -349,75 +409,9 @@ impl<'a> Search<'a> { }; // We create the original candidates with the facet conditions results. - use FacetOperator::*; + let facet_db = self.index.facet_field_id_value_docids; let facet_candidates = match self.facet_condition { - // TODO make that generic over floats and integers. - Some(FacetCondition::OperatorI64(fid, operator)) => { - // Make sure we always bound the ranges with the field id and the level, - // as the facets values are all in the same database and prefixed by the - // field id and the level. 
- let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(i64::MAX)), - GreaterThanOrEqual(val) => (Included(val), Included(i64::MAX)), - LowerThan(val) => (Included(i64::MIN), Excluded(val)), - LowerThanOrEqual(val) => (Included(i64::MIN), Included(val)), - Equal(val) => (Included(val), Included(val)), - Between(left, right) => (Included(left), Included(right)), - }; - - let db = self.index - .facet_field_id_value_docids - .remap_key_type::(); - - // Ask for the biggest value that can exist for this specific field, if it exists - // that's fine if it don't, the value just before will be returned instead. - let biggest_level = db - .remap_data_type::() - .get_lower_than_or_equal_to(self.rtxn, &(fid, u8::MAX, i64::MAX, i64::MAX))? - .and_then(|((id, level, _, _), _)| if id == fid { Some(level) } else { None }); - - match biggest_level { - Some(level) => { - let mut output = RoaringBitmap::new(); - self.explore_facet_levels::(fid, level, left, right, &mut output)?; - Some(output) - }, - None => None, - } - }, - Some(FacetCondition::OperatorF64(fid, operator)) => { - // Make sure we always bound the ranges with the field id and the level, - // as the facets values are all in the same database and prefixed by the - // field id and the level. - let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(f64::MAX)), - GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)), - LowerThan(val) => (Included(f64::MIN), Excluded(val)), - LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)), - Equal(val) => (Included(val), Included(val)), - Between(left, right) => (Included(left), Included(right)), - }; - - let db = self.index - .facet_field_id_value_docids - .remap_key_type::(); - - // Ask for the biggest value that can exist for this specific field, if it exists - // that's fine if it don't, the value just before will be returned instead. 
- let biggest_level = db - .remap_data_type::() - .get_lower_than_or_equal_to(self.rtxn, &(fid, u8::MAX, f64::MAX, f64::MAX))? - .and_then(|((id, level, _, _), _)| if id == fid { Some(level) } else { None }); - - match biggest_level { - Some(level) => { - let mut output = RoaringBitmap::new(); - self.explore_facet_levels::(fid, level, left, right, &mut output)?; - Some(output) - }, - None => None, - } - }, + Some(condition) => Some(condition.evaluate(self.rtxn, facet_db)?), None => None, }; From 278391d961f2e06147ff1aa01362da1bf7e2b980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 20 Nov 2020 10:54:41 +0100 Subject: [PATCH 20/30] Move the facets related system into the new search module --- src/{search.rs => search/facet.rs} | 235 +---------------------------- src/search/mod.rs | 228 ++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 229 deletions(-) rename src/{search.rs => search/facet.rs} (55%) create mode 100644 src/search/mod.rs diff --git a/src/search.rs b/src/search/facet.rs similarity index 55% rename from src/search.rs rename to src/search/facet.rs index 5cd998ffe..22352ab48 100644 --- a/src/search.rs +++ b/src/search/facet.rs @@ -1,33 +1,21 @@ -use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; use std::error::Error as StdError; -use std::fmt::{self, Debug}; +use std::fmt::Debug; use std::ops::Bound::{self, Unbounded, Included, Excluded}; use std::str::FromStr; use anyhow::{bail, ensure, Context}; -use fst::{IntoStreamer, Streamer}; use heed::types::{ByteSlice, DecodeIgnore}; -use levenshtein_automata::DFA; -use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; use num_traits::Bounded; -use once_cell::sync::Lazy; -use roaring::bitmap::RoaringBitmap; +use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::mdfs::Mdfs; -use 
crate::query_tokens::{QueryTokens, QueryToken}; -use crate::{Index, DocumentId}; +use crate::{Index, CboRoaringBitmapCodec}; -// Building these factories is not free. -static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); -static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); -static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); +use self::FacetCondition::*; +use self::FacetOperator::*; -// TODO support also floats #[derive(Debug, Copy, Clone, PartialEq)] pub enum FacetOperator { GreaterThan(T), @@ -52,8 +40,6 @@ impl FacetCondition { string: &str, ) -> anyhow::Result> { - use FacetCondition::*; - let fields_ids_map = index.fields_ids_map(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?; @@ -80,8 +66,6 @@ impl FacetCondition { ) -> anyhow::Result> where T::Err: Send + Sync + StdError + 'static, { - use FacetOperator::*; - match iter.next() { Some(">") => { let param = iter.next().context("missing parameter")?; @@ -228,8 +212,6 @@ impl FacetCondition { KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { - use FacetOperator::*; - // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. 
@@ -259,7 +241,7 @@ impl FacetCondition { } } - fn evaluate( + pub fn evaluate( &self, rtxn: &heed::RoTxn, db: heed::Database, @@ -275,208 +257,3 @@ impl FacetCondition { } } } - -pub struct Search<'a> { - query: Option, - facet_condition: Option, - offset: usize, - limit: usize, - rtxn: &'a heed::RoTxn<'a>, - index: &'a Index, -} - -impl<'a> Search<'a> { - pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { - Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index } - } - - pub fn query(&mut self, query: impl Into) -> &mut Search<'a> { - self.query = Some(query.into()); - self - } - - pub fn offset(&mut self, offset: usize) -> &mut Search<'a> { - self.offset = offset; - self - } - - pub fn limit(&mut self, limit: usize) -> &mut Search<'a> { - self.limit = limit; - self - } - - pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { - self.facet_condition = Some(condition); - self - } - - /// Extracts the query words from the query string and returns the DFAs accordingly. - /// TODO introduce settings for the number of typos regarding the words lengths. 
- fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { - let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2); - - let words: Vec<_> = QueryTokens::new(query).collect(); - let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let number_of_words = words.len(); - - words.into_iter().enumerate().map(|(i, word)| { - let (word, quoted) = match word { - QueryToken::Free(word) => (word.to_lowercase(), word.len() <= 3), - QueryToken::Quoted(word) => (word.to_lowercase(), true), - }; - let is_last = i + 1 == number_of_words; - let is_prefix = is_last && !ends_with_whitespace && !quoted; - let lev = match word.len() { - 0..=4 => if quoted { lev0 } else { lev0 }, - 5..=8 => if quoted { lev0 } else { lev1 }, - _ => if quoted { lev0 } else { lev2 }, - }; - - let dfa = if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - }; - - (word, is_prefix, dfa) - }) - .collect() - } - - /// Fetch the words from the given FST related to the given DFAs along with - /// the associated documents ids. - fn fetch_words_docids( - &self, - fst: &fst::Set>, - dfas: Vec<(String, bool, DFA)>, - ) -> anyhow::Result, RoaringBitmap)>> - { - // A Vec storing all the derived words from the original query words, associated - // with the distance from the original word and the docids where the words appears. 
- let mut derived_words = Vec::<(HashMap::, RoaringBitmap)>::with_capacity(dfas.len()); - - for (_word, _is_prefix, dfa) in dfas { - - let mut acc_derived_words = HashMap::new(); - let mut unions_docids = RoaringBitmap::new(); - let mut stream = fst.search_with_state(&dfa).into_stream(); - while let Some((word, state)) = stream.next() { - - let word = std::str::from_utf8(word)?; - let docids = self.index.word_docids.get(self.rtxn, word)?.unwrap(); - let distance = dfa.distance(state); - unions_docids.union_with(&docids); - acc_derived_words.insert(word.to_string(), (distance.to_u8(), docids)); - } - derived_words.push((acc_derived_words, unions_docids)); - } - - Ok(derived_words) - } - - /// Returns the set of docids that contains all of the query words. - fn compute_candidates( - derived_words: &[(HashMap, RoaringBitmap)], - ) -> RoaringBitmap - { - // We sort the derived words by inverse popularity, this way intersections are faster. - let mut derived_words: Vec<_> = derived_words.iter().collect(); - derived_words.sort_unstable_by_key(|(_, docids)| docids.len()); - - // we do a union between all the docids of each of the derived words, - // we got N unions (the number of original query words), we then intersect them. - let mut candidates = RoaringBitmap::new(); - - for (i, (_, union_docids)) in derived_words.iter().enumerate() { - if i == 0 { - candidates = union_docids.clone(); - } else { - candidates.intersect_with(&union_docids); - } - } - - candidates - } - - pub fn execute(&self) -> anyhow::Result { - let limit = self.limit; - let fst = self.index.words_fst(self.rtxn)?; - - // Construct the DFAs related to the query words. - let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { - Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), - _otherwise => None, - }; - - // We create the original candidates with the facet conditions results. 
- let facet_db = self.index.facet_field_id_value_docids; - let facet_candidates = match self.facet_condition { - Some(condition) => Some(condition.evaluate(self.rtxn, facet_db)?), - None => None, - }; - - debug!("facet candidates: {:?}", facet_candidates); - - let (candidates, derived_words) = match (facet_candidates, derived_words) { - (Some(mut facet_candidates), Some(derived_words)) => { - let words_candidates = Self::compute_candidates(&derived_words); - facet_candidates.intersect_with(&words_candidates); - (facet_candidates, derived_words) - }, - (None, Some(derived_words)) => { - (Self::compute_candidates(&derived_words), derived_words) - }, - (Some(facet_candidates), None) => { - // If the query is not set or results in no DFAs but - // there is some facet conditions we return a placeholder. - let documents_ids = facet_candidates.iter().take(limit).collect(); - return Ok(SearchResult { documents_ids, ..Default::default() }) - }, - (None, None) => { - // If the query is not set or results in no DFAs we return a placeholder. - let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect(); - return Ok(SearchResult { documents_ids, ..Default::default() }) - }, - }; - - debug!("candidates: {:?}", candidates); - - // The mana depth first search is a revised DFS that explore - // solutions in the order of their proximities. - let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates); - let mut documents = Vec::new(); - - // We execute the Mdfs iterator until we find enough documents. - while documents.iter().map(RoaringBitmap::len).sum::() < limit as u64 { - match mdfs.next().transpose()? 
{ - Some((proximity, answer)) => { - debug!("answer with a proximity of {}: {:?}", proximity, answer); - documents.push(answer); - }, - None => break, - } - } - - let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); - let documents_ids = documents.into_iter().flatten().take(limit).collect(); - Ok(SearchResult { found_words, documents_ids }) - } -} - -impl fmt::Debug for Search<'_> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("Search") - .field("query", &self.query) - .field("facet_condition", &self.facet_condition) - .field("offset", &self.offset) - .field("limit", &self.limit) - .finish() - } -} - -#[derive(Default)] -pub struct SearchResult { - pub found_words: HashSet, - // TODO those documents ids should be associated with their criteria scores. - pub documents_ids: Vec, -} diff --git a/src/search/mod.rs b/src/search/mod.rs new file mode 100644 index 000000000..8ee8461a8 --- /dev/null +++ b/src/search/mod.rs @@ -0,0 +1,228 @@ +use std::borrow::Cow; +use std::collections::{HashMap, HashSet}; +use std::fmt; + +use fst::{IntoStreamer, Streamer}; +use levenshtein_automata::DFA; +use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; +use log::debug; +use once_cell::sync::Lazy; +use roaring::bitmap::RoaringBitmap; + +use crate::mdfs::Mdfs; +use crate::query_tokens::{QueryTokens, QueryToken}; +use crate::{Index, DocumentId}; + +pub use self::facet::FacetCondition; + +// Building these factories is not free. 
+static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); +static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); +static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); + +mod facet; + +pub struct Search<'a> { + query: Option, + facet_condition: Option, + offset: usize, + limit: usize, + rtxn: &'a heed::RoTxn<'a>, + index: &'a Index, +} + +impl<'a> Search<'a> { + pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { + Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index } + } + + pub fn query(&mut self, query: impl Into) -> &mut Search<'a> { + self.query = Some(query.into()); + self + } + + pub fn offset(&mut self, offset: usize) -> &mut Search<'a> { + self.offset = offset; + self + } + + pub fn limit(&mut self, limit: usize) -> &mut Search<'a> { + self.limit = limit; + self + } + + pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { + self.facet_condition = Some(condition); + self + } + + /// Extracts the query words from the query string and returns the DFAs accordingly. + /// TODO introduce settings for the number of typos regarding the words lengths. 
+ fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { + let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2); + + let words: Vec<_> = QueryTokens::new(query).collect(); + let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace); + let number_of_words = words.len(); + + words.into_iter().enumerate().map(|(i, word)| { + let (word, quoted) = match word { + QueryToken::Free(word) => (word.to_lowercase(), word.len() <= 3), + QueryToken::Quoted(word) => (word.to_lowercase(), true), + }; + let is_last = i + 1 == number_of_words; + let is_prefix = is_last && !ends_with_whitespace && !quoted; + let lev = match word.len() { + 0..=4 => if quoted { lev0 } else { lev0 }, + 5..=8 => if quoted { lev0 } else { lev1 }, + _ => if quoted { lev0 } else { lev2 }, + }; + + let dfa = if is_prefix { + lev.build_prefix_dfa(&word) + } else { + lev.build_dfa(&word) + }; + + (word, is_prefix, dfa) + }) + .collect() + } + + /// Fetch the words from the given FST related to the given DFAs along with + /// the associated documents ids. + fn fetch_words_docids( + &self, + fst: &fst::Set>, + dfas: Vec<(String, bool, DFA)>, + ) -> anyhow::Result, RoaringBitmap)>> + { + // A Vec storing all the derived words from the original query words, associated + // with the distance from the original word and the docids where the words appears. 
+ let mut derived_words = Vec::<(HashMap::, RoaringBitmap)>::with_capacity(dfas.len()); + + for (_word, _is_prefix, dfa) in dfas { + + let mut acc_derived_words = HashMap::new(); + let mut unions_docids = RoaringBitmap::new(); + let mut stream = fst.search_with_state(&dfa).into_stream(); + while let Some((word, state)) = stream.next() { + + let word = std::str::from_utf8(word)?; + let docids = self.index.word_docids.get(self.rtxn, word)?.unwrap(); + let distance = dfa.distance(state); + unions_docids.union_with(&docids); + acc_derived_words.insert(word.to_string(), (distance.to_u8(), docids)); + } + derived_words.push((acc_derived_words, unions_docids)); + } + + Ok(derived_words) + } + + /// Returns the set of docids that contains all of the query words. + fn compute_candidates( + derived_words: &[(HashMap, RoaringBitmap)], + ) -> RoaringBitmap + { + // We sort the derived words by inverse popularity, this way intersections are faster. + let mut derived_words: Vec<_> = derived_words.iter().collect(); + derived_words.sort_unstable_by_key(|(_, docids)| docids.len()); + + // we do a union between all the docids of each of the derived words, + // we got N unions (the number of original query words), we then intersect them. + let mut candidates = RoaringBitmap::new(); + + for (i, (_, union_docids)) in derived_words.iter().enumerate() { + if i == 0 { + candidates = union_docids.clone(); + } else { + candidates.intersect_with(&union_docids); + } + } + + candidates + } + + pub fn execute(&self) -> anyhow::Result { + let limit = self.limit; + let fst = self.index.words_fst(self.rtxn)?; + + // Construct the DFAs related to the query words. + let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { + Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), + _otherwise => None, + }; + + // We create the original candidates with the facet conditions results. 
+ let facet_db = self.index.facet_field_id_value_docids; + let facet_candidates = match self.facet_condition { + Some(condition) => Some(condition.evaluate(self.rtxn, facet_db)?), + None => None, + }; + + debug!("facet candidates: {:?}", facet_candidates); + + let (candidates, derived_words) = match (facet_candidates, derived_words) { + (Some(mut facet_candidates), Some(derived_words)) => { + let words_candidates = Self::compute_candidates(&derived_words); + facet_candidates.intersect_with(&words_candidates); + (facet_candidates, derived_words) + }, + (None, Some(derived_words)) => { + (Self::compute_candidates(&derived_words), derived_words) + }, + (Some(facet_candidates), None) => { + // If the query is not set or results in no DFAs but + // there is some facet conditions we return a placeholder. + let documents_ids = facet_candidates.iter().take(limit).collect(); + return Ok(SearchResult { documents_ids, ..Default::default() }) + }, + (None, None) => { + // If the query is not set or results in no DFAs we return a placeholder. + let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect(); + return Ok(SearchResult { documents_ids, ..Default::default() }) + }, + }; + + debug!("candidates: {:?}", candidates); + + // The mana depth first search is a revised DFS that explore + // solutions in the order of their proximities. + let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates); + let mut documents = Vec::new(); + + // We execute the Mdfs iterator until we find enough documents. + while documents.iter().map(RoaringBitmap::len).sum::() < limit as u64 { + match mdfs.next().transpose()? 
{ + Some((proximity, answer)) => { + debug!("answer with a proximity of {}: {:?}", proximity, answer); + documents.push(answer); + }, + None => break, + } + } + + let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); + let documents_ids = documents.into_iter().flatten().take(limit).collect(); + Ok(SearchResult { found_words, documents_ids }) + } +} + +impl fmt::Debug for Search<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Search") + .field("query", &self.query) + .field("facet_condition", &self.facet_condition) + .field("offset", &self.offset) + .field("limit", &self.limit) + .finish() + } +} + +#[derive(Default)] +pub struct SearchResult { + pub found_words: HashSet, + // TODO those documents ids should be associated with their criteria scores. + pub documents_ids: Vec, +} From 498f0d8539c20ea4c314afd67387d6a113164cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 20 Nov 2020 12:09:21 +0100 Subject: [PATCH 21/30] Output the documents count for each facet value in the infos subcommand --- src/subcommand/infos.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 4153b97b4..f8138660b 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -409,7 +409,7 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["facet_value", "documents_ids"])?; + wtr.write_record(&["facet_value", "documents_count", "documents_ids"])?; let db = index.facet_field_id_value_docids; let iter = facet_values_iter( @@ -424,12 +424,13 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam for result in iter { let (value, docids) = result?; + let count = docids.len(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", 
docids.iter().collect::>()) }; - wtr.write_record(&[value, docids])?; + wtr.write_record(&[value, count.to_string(), docids])?; } Ok(wtr.flush()?) From c52d09d5b12584141846500fb0009f3d91a80a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 20 Nov 2020 12:59:29 +0100 Subject: [PATCH 22/30] Support a basic version of the string facet query system --- src/search/facet.rs | 84 ++++++++++++++++++++++------- src/search/mod.rs | 2 +- src/update/index_documents/store.rs | 2 +- 3 files changed, 68 insertions(+), 20 deletions(-) diff --git a/src/search/facet.rs b/src/search/facet.rs index 22352ab48..08daf5fbc 100644 --- a/src/search/facet.rs +++ b/src/search/facet.rs @@ -5,19 +5,21 @@ use std::str::FromStr; use anyhow::{bail, ensure, Context}; use heed::types::{ByteSlice, DecodeIgnore}; +use itertools::Itertools; use log::debug; use num_traits::Bounded; use roaring::RoaringBitmap; use crate::facet::FacetType; +use crate::heed_codec::facet::FacetValueStringCodec; use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; use crate::{Index, CboRoaringBitmapCodec}; use self::FacetCondition::*; -use self::FacetOperator::*; +use self::FacetNumberOperator::*; #[derive(Debug, Copy, Clone, PartialEq)] -pub enum FacetOperator { +pub enum FacetNumberOperator { GreaterThan(T), GreaterThanOrEqual(T), LowerThan(T), @@ -26,11 +28,17 @@ pub enum FacetOperator { Between(T, T), } +#[derive(Debug, Clone, PartialEq)] +pub enum FacetStringOperator { + Equal(String), +} + // TODO also support ANDs, ORs, NOTs. 
-#[derive(Debug, Copy, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq)] pub enum FacetCondition { - OperatorI64(u8, FacetOperator), - OperatorF64(u8, FacetOperator), + OperatorI64(u8, FacetNumberOperator), + OperatorF64(u8, FacetNumberOperator), + OperatorString(u8, FacetStringOperator), } impl FacetCondition { @@ -55,15 +63,34 @@ impl FacetCondition { let field_type = faceted_fields.get(&field_id).with_context(|| format!("field {} is not faceted", field_name))?; match field_type { - FacetType::Integer => Self::parse_condition(iter).map(|op| Some(OperatorI64(field_id, op))), - FacetType::Float => Self::parse_condition(iter).map(|op| Some(OperatorF64(field_id, op))), - FacetType::String => bail!("invalid facet type"), + FacetType::Integer => Self::parse_number_condition(iter).map(|op| Some(OperatorI64(field_id, op))), + FacetType::Float => Self::parse_number_condition(iter).map(|op| Some(OperatorF64(field_id, op))), + FacetType::String => Self::parse_string_condition(iter).map(|op| Some(OperatorString(field_id, op))), } } - fn parse_condition<'a, T: FromStr>( + fn parse_string_condition<'a>( mut iter: impl Iterator, - ) -> anyhow::Result> + ) -> anyhow::Result + { + match iter.next() { + Some("=") | Some(":") => { + match iter.next() { + Some(q @ "\"") | Some(q @ "\'") => { + let string: String = iter.take_while(|&c| c != q).intersperse(" ").collect(); + Ok(FacetStringOperator::Equal(string.to_lowercase())) + }, + Some(param) => Ok(FacetStringOperator::Equal(param.to_lowercase())), + None => bail!("missing parameter"), + } + }, + _ => bail!("invalid facet string operator"), + } + } + + fn parse_number_condition<'a, T: FromStr>( + mut iter: impl Iterator, + ) -> anyhow::Result> where T::Err: Send + Sync + StdError + 'static, { match iter.next() { @@ -201,11 +228,11 @@ impl FacetCondition { Ok(()) } - fn evaluate_operator<'t, T: 't, KC>( + fn evaluate_number_operator<'t, T: 't, KC>( rtxn: &'t heed::RoTxn, db: heed::Database, field_id: u8, - operator: 
FacetOperator, + operator: FacetNumberOperator, ) -> anyhow::Result where T: Copy + PartialEq + PartialOrd + Bounded + Debug, @@ -241,19 +268,40 @@ impl FacetCondition { } } + fn evaluate_string_operator( + rtxn: &heed::RoTxn, + db: heed::Database, + field_id: u8, + operator: &FacetStringOperator, + ) -> anyhow::Result + { + match operator { + FacetStringOperator::Equal(string) => { + match db.get(rtxn, &(field_id, string))? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()) + } + } + } + } + pub fn evaluate( &self, rtxn: &heed::RoTxn, db: heed::Database, ) -> anyhow::Result { - match *self { - FacetCondition::OperatorI64(fid, operator) => { - Self::evaluate_operator::(rtxn, db, fid, operator) + match self { + OperatorI64(fid, op) => { + Self::evaluate_number_operator::(rtxn, db, *fid, *op) + }, + OperatorF64(fid, op) => { + Self::evaluate_number_operator::(rtxn, db, *fid, *op) + }, + OperatorString(fid, op) => { + let db = db.remap_key_type::(); + Self::evaluate_string_operator(rtxn, db, *fid, op) }, - FacetCondition::OperatorF64(fid, operator) => { - Self::evaluate_operator::(rtxn, db, fid, operator) - } } } } diff --git a/src/search/mod.rs b/src/search/mod.rs index 8ee8461a8..d236e396a 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -156,7 +156,7 @@ impl<'a> Search<'a> { // We create the original candidates with the facet conditions results. 
let facet_db = self.index.facet_field_id_value_docids; - let facet_candidates = match self.facet_condition { + let facet_candidates = match &self.facet_condition { Some(condition) => Some(condition.evaluate(self.rtxn, facet_db)?), None => None, }; diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 289704b1a..6fb07b345 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -586,7 +586,7 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result { - let string = string.trim(); + let string = string.trim().to_lowercase(); if string.is_empty() { return Ok(()) } match ftype { FacetType::String => { From a0adfb5e8ea9b200f0d331606948c3d2e2aa0e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 21 Nov 2020 13:09:49 +0100 Subject: [PATCH 23/30] Introduce a real pest parser and support every facet filter conditions --- Cargo.lock | 134 ++++++++++ Cargo.toml | 4 + http-ui/Cargo.lock | 70 ++++- http-ui/src/main.rs | 3 +- src/lib.rs | 2 + src/search/facet.rs | 307 ---------------------- src/search/facet/grammar.pest | 29 +++ src/search/facet/mod.rs | 476 ++++++++++++++++++++++++++++++++++ src/search/facet/parser.rs | 12 + src/search/mod.rs | 3 +- 10 files changed, 728 insertions(+), 312 deletions(-) delete mode 100644 src/search/facet.rs create mode 100644 src/search/facet/grammar.pest create mode 100644 src/search/facet/mod.rs create mode 100644 src/search/facet/parser.rs diff --git a/Cargo.lock b/Cargo.lock index 884bc19d9..70128cfa9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -45,6 +45,27 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "block-buffer" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" 
+dependencies = [ + "block-padding", + "byte-tools", + "byteorder", + "generic-array", +] + +[[package]] +name = "block-padding" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" +dependencies = [ + "byte-tools", +] + [[package]] name = "bstr" version = "0.2.13" @@ -63,6 +84,12 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820" +[[package]] +name = "byte-tools" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" + [[package]] name = "byteorder" version = "1.3.4" @@ -285,12 +312,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "digest" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" +dependencies = [ + "generic-array", +] + [[package]] name = "either" version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "fake-simd" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" + [[package]] name = "flate2" version = "1.0.17" @@ -324,6 +366,15 @@ dependencies = [ "byteorder", ] +[[package]] +name = "generic-array" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec" +dependencies = [ + "typenum", +] + [[package]] name = "getrandom" version = "0.1.14" @@ -621,6 +672,8 @@ dependencies = [ "obkv", "once_cell", "ordered-float", + "pest 2.1.3 
(git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", + "pest_derive", "rayon", "ringtail", "roaring", @@ -717,6 +770,12 @@ version = "11.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c" +[[package]] +name = "opaque-debug" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" + [[package]] name = "ordered-float" version = "2.0.0" @@ -742,6 +801,57 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" +[[package]] +name = "pest" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "pest" +version = "2.1.3" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" +dependencies = [ + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" +dependencies = [ + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "pest_meta", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pest_meta" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" +dependencies = [ + "maplit", + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "sha-1", +] + [[package]] name = "pkg-config" version = "0.3.19" @@ -1026,6 +1136,18 @@ dependencies = [ "serde", ] +[[package]] +name = "sha-1" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" +dependencies = [ + "block-buffer", + "digest", + "fake-simd", + "opaque-debug", +] + [[package]] name = "slice-group-by" version = "0.2.6" @@ -1234,6 +1356,18 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" +[[package]] +name = "typenum" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" + +[[package]] +name = "ucd-trie" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" + [[package]] name = "unicode-bidi" version = "0.3.4" diff --git a/Cargo.toml b/Cargo.toml index b77cf4a44..37c83b4f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,10 @@ structopt = { version = "0.3.14", default-features = false, features = ["wrap_he tempfile = "3.1.0" uuid = { version = "0.8.1", features = ["v4"] } +# facet filter parser +pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } +pest_derive = "2.1.0" + # documents words self-join itertools = "0.9.0" diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index 162ca96b2..b15700ce5 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -654,9 +654,9 @@ dependencies = [ [[package]] name = "heed" -version = "0.10.3" +version = "0.10.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d2740ccbbfb2a6e6ff0c43e0fc14981ed668fb45be5a4e7b2bc03fc8cca3d3e" +checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616" dependencies = [ "byteorder", "heed-traits", @@ -934,6 +934,12 @@ dependencies = [ "cfg-if 0.1.10", ] +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + [[package]] name = "matches" version = "0.1.8" @@ -987,9 +993,12 @@ dependencies = [ "log", "memmap", "near-proximity", + "num-traits", "obkv", "once_cell", "ordered-float", + "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", + "pest_derive", "rayon", "ringtail", "roaring", @@ -1231,6 +1240,57 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" +[[package]] +name = "pest" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "pest" +version = "2.1.3" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" +dependencies = [ + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" 
+dependencies = [ + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "pest_meta", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pest_meta" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" +dependencies = [ + "maplit", + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "sha-1 0.8.2", +] + [[package]] name = "pin-project" version = "0.4.27" @@ -2024,6 +2084,12 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" +[[package]] +name = "ucd-trie" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" + [[package]] name = "unicase" version = "2.6.0" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index e03261641..ca1ddcd45 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -614,7 +614,8 @@ async fn main() -> anyhow::Result<()> { search.query(query); } if let Some(condition) = query.facet_condition { - if let Some(condition) = FacetCondition::from_str(&rtxn, &index, &condition).unwrap() { + if !condition.trim().is_empty() { + let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap(); search.facet_condition(condition); } } diff --git a/src/lib.rs b/src/lib.rs index ff578dd4b..320077b86 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +#[macro_use] extern crate pest_derive; + mod criterion; mod external_documents_ids; mod fields_ids_map; diff --git a/src/search/facet.rs b/src/search/facet.rs deleted file mode 100644 index 08daf5fbc..000000000 --- a/src/search/facet.rs +++ /dev/null @@ -1,307 +0,0 @@ -use std::error::Error as StdError; -use std::fmt::Debug; -use std::ops::Bound::{self, Unbounded, Included, Excluded}; -use 
std::str::FromStr; - -use anyhow::{bail, ensure, Context}; -use heed::types::{ByteSlice, DecodeIgnore}; -use itertools::Itertools; -use log::debug; -use num_traits::Bounded; -use roaring::RoaringBitmap; - -use crate::facet::FacetType; -use crate::heed_codec::facet::FacetValueStringCodec; -use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; -use crate::{Index, CboRoaringBitmapCodec}; - -use self::FacetCondition::*; -use self::FacetNumberOperator::*; - -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum FacetNumberOperator { - GreaterThan(T), - GreaterThanOrEqual(T), - LowerThan(T), - LowerThanOrEqual(T), - Equal(T), - Between(T, T), -} - -#[derive(Debug, Clone, PartialEq)] -pub enum FacetStringOperator { - Equal(String), -} - -// TODO also support ANDs, ORs, NOTs. -#[derive(Debug, Clone, PartialEq)] -pub enum FacetCondition { - OperatorI64(u8, FacetNumberOperator), - OperatorF64(u8, FacetNumberOperator), - OperatorString(u8, FacetStringOperator), -} - -impl FacetCondition { - pub fn from_str( - rtxn: &heed::RoTxn, - index: &Index, - string: &str, - ) -> anyhow::Result> - { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; - - // TODO use a better parsing technic - let mut iter = string.split_whitespace(); - - let field_name = match iter.next() { - Some(field_name) => field_name, - None => return Ok(None), - }; - - let field_id = fields_ids_map.id(&field_name).with_context(|| format!("field {} not found", field_name))?; - let field_type = faceted_fields.get(&field_id).with_context(|| format!("field {} is not faceted", field_name))?; - - match field_type { - FacetType::Integer => Self::parse_number_condition(iter).map(|op| Some(OperatorI64(field_id, op))), - FacetType::Float => Self::parse_number_condition(iter).map(|op| Some(OperatorF64(field_id, op))), - FacetType::String => Self::parse_string_condition(iter).map(|op| Some(OperatorString(field_id, op))), - } - } - - fn 
parse_string_condition<'a>( - mut iter: impl Iterator, - ) -> anyhow::Result - { - match iter.next() { - Some("=") | Some(":") => { - match iter.next() { - Some(q @ "\"") | Some(q @ "\'") => { - let string: String = iter.take_while(|&c| c != q).intersperse(" ").collect(); - Ok(FacetStringOperator::Equal(string.to_lowercase())) - }, - Some(param) => Ok(FacetStringOperator::Equal(param.to_lowercase())), - None => bail!("missing parameter"), - } - }, - _ => bail!("invalid facet string operator"), - } - } - - fn parse_number_condition<'a, T: FromStr>( - mut iter: impl Iterator, - ) -> anyhow::Result> - where T::Err: Send + Sync + StdError + 'static, - { - match iter.next() { - Some(">") => { - let param = iter.next().context("missing parameter")?; - let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(GreaterThan(value)) - }, - Some(">=") => { - let param = iter.next().context("missing parameter")?; - let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(GreaterThanOrEqual(value)) - }, - Some("<") => { - let param = iter.next().context("missing parameter")?; - let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(LowerThan(value)) - }, - Some("<=") => { - let param = iter.next().context("missing parameter")?; - let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(LowerThanOrEqual(value)) - }, - Some("=") => { - let param = iter.next().context("missing parameter")?; - let value = param.parse().with_context(|| format!("invalid parameter ({:?})", param))?; - Ok(Equal(value)) - }, - Some(otherwise) => { - // BETWEEN or X TO Y (both inclusive) - let left_param = otherwise.parse().with_context(|| format!("invalid first TO parameter ({:?})", otherwise))?; - ensure!(iter.next().map_or(false, |s| s.eq_ignore_ascii_case("to")), "TO keyword missing or invalid"); - let next = iter.next().context("missing second TO 
parameter")?; - let right_param = next.parse().with_context(|| format!("invalid second TO parameter ({:?})", next))?; - Ok(Between(left_param, right_param)) - }, - None => bail!("missing facet filter first parameter"), - } - } - - /// Aggregates the documents ids that are part of the specified range automatically - /// going deeper through the levels. - fn explore_facet_levels<'t, T: 't, KC>( - rtxn: &'t heed::RoTxn, - db: heed::Database, - field_id: u8, - level: u8, - left: Bound, - right: Bound, - output: &mut RoaringBitmap, - ) -> anyhow::Result<()> - where - T: Copy + PartialEq + PartialOrd + Bounded + Debug, - KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, - { - match (left, right) { - // If the request is an exact value we must go directly to the deepest level. - (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_levels::(rtxn, db, field_id, 0, left, right, output); - }, - // lower TO upper when lower > upper must return no result - (Included(l), Included(r)) if l > r => return Ok(()), - (Included(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Included(r)) if l >= r => return Ok(()), - (_, _) => (), - } - - let mut left_found = None; - let mut right_found = None; - - // We must create a custom iterator to be able to iterate over the - // requested range as the range iterator cannot express some conditions. - let left_bound = match left { - Included(left) => Included((field_id, level, left, T::min_value())), - Excluded(left) => Excluded((field_id, level, left, T::min_value())), - Unbounded => Unbounded, - }; - let right_bound = Included((field_id, level, T::max_value(), T::max_value())); - // We also make sure that we don't decode the data before we are sure we must return it. - let iter = db - .remap_key_type::() - .lazily_decode_data() - .range(rtxn, &(left_bound, right_bound))? 
- .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { - match right { - Included(right) => *r <= right, - Excluded(right) => *r < right, - Unbounded => true, - } - })) - .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); - - debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - for (i, result) in iter.enumerate() { - let ((_fid, level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - output.union_with(&docids); - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { left_found = Some(l); } - right_found = Some(r); - } - - // Can we go deeper? - let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), - }; - - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - match left_found.zip(right_found) { - Some((left_found, right_found)) => { - // If the bound is satisfied we avoid calling this function again. - if !matches!(left, Included(l) if l == left_found) { - let sub_right = Excluded(left_found); - debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); - Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, sub_right, output)?; - } - if !matches!(right, Included(r) if r == right_found) { - let sub_left = Excluded(right_found); - debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); - Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, sub_left, right, output)?; - } - }, - None => { - // If we found nothing at this level it means that we must find - // the same bounds but at a deeper, more precise level. 
- Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, right, output)?; - }, - } - - Ok(()) - } - - fn evaluate_number_operator<'t, T: 't, KC>( - rtxn: &'t heed::RoTxn, - db: heed::Database, - field_id: u8, - operator: FacetNumberOperator, - ) -> anyhow::Result - where - T: Copy + PartialEq + PartialOrd + Bounded + Debug, - KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, - { - // Make sure we always bound the ranges with the field id and the level, - // as the facets values are all in the same database and prefixed by the - // field id and the level. - let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(T::max_value())), - GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), - LowerThan(val) => (Included(T::min_value()), Excluded(val)), - LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), - Equal(val) => (Included(val), Included(val)), - Between(left, right) => (Included(left), Included(right)), - }; - - // Ask for the biggest value that can exist for this specific field, if it exists - // that's fine if it don't, the value just before will be returned instead. - let biggest_level = db - .remap_types::() - .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))? - .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); - - match biggest_level { - Some(level) => { - let mut output = RoaringBitmap::new(); - Self::explore_facet_levels::(rtxn, db, field_id, level, left, right, &mut output)?; - Ok(output) - }, - None => Ok(RoaringBitmap::new()), - } - } - - fn evaluate_string_operator( - rtxn: &heed::RoTxn, - db: heed::Database, - field_id: u8, - operator: &FacetStringOperator, - ) -> anyhow::Result - { - match operator { - FacetStringOperator::Equal(string) => { - match db.get(rtxn, &(field_id, string))? 
{ - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()) - } - } - } - } - - pub fn evaluate( - &self, - rtxn: &heed::RoTxn, - db: heed::Database, - ) -> anyhow::Result - { - match self { - OperatorI64(fid, op) => { - Self::evaluate_number_operator::(rtxn, db, *fid, *op) - }, - OperatorF64(fid, op) => { - Self::evaluate_number_operator::(rtxn, db, *fid, *op) - }, - OperatorString(fid, op) => { - let db = db.remap_key_type::(); - Self::evaluate_string_operator(rtxn, db, *fid, op) - }, - } - } -} diff --git a/src/search/facet/grammar.pest b/src/search/facet/grammar.pest new file mode 100644 index 000000000..2096517d3 --- /dev/null +++ b/src/search/facet/grammar.pest @@ -0,0 +1,29 @@ +key = _{quoted | word} +value = _{quoted | word} +quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP } +string = {char*} +word = ${(LETTER | NUMBER | "_" | "-" | ".")+} + +char = _{ !(PEEK | "\\") ~ ANY + | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") + | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} + +condition = _{between | eq | greater | less | geq | leq | neq} +between = {key ~ value ~ "TO" ~ value} +geq = {key ~ ">=" ~ value} +leq = {key ~ "<=" ~ value} +neq = {key ~ "!=" ~ value} +eq = {key ~ "=" ~ value} +greater = {key ~ ">" ~ value} +less = {key ~ "<" ~ value} + +prgm = {SOI ~ expr ~ EOI} +expr = _{ ( term ~ (operation ~ term)* ) } +term = { ("(" ~ expr ~ ")") | condition | not } +operation = _{ and | or } +and = {"AND"} +or = {"OR"} + +not = {"NOT" ~ term} + +WHITESPACE = _{ " " } diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs new file mode 100644 index 000000000..b1d527337 --- /dev/null +++ b/src/search/facet/mod.rs @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fmt::Debug; +use std::ops::Bound::{self, Unbounded, Included, Excluded}; + +use heed::types::{ByteSlice, DecodeIgnore}; +use log::debug; +use num_traits::Bounded; +use parser::{PREC_CLIMBER, FilterParser}; +use pest::error::{Error as PestError, ErrorVariant}; +use 
pest::iterators::{Pair, Pairs}; +use pest::Parser; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::FacetValueStringCodec; +use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; +use crate::{Index, FieldsIdsMap, CboRoaringBitmapCodec}; + +use self::FacetCondition::*; +use self::FacetNumberOperator::*; +use self::parser::Rule; + +mod parser; + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum FacetNumberOperator { + GreaterThan(T), + GreaterThanOrEqual(T), + LowerThan(T), + LowerThanOrEqual(T), + Equal(T), + Between(T, T), +} + +#[derive(Debug, Clone, PartialEq)] +pub enum FacetStringOperator { + Equal(String), +} + +#[derive(Debug, Clone, PartialEq)] +pub enum FacetCondition { + OperatorI64(u8, FacetNumberOperator), + OperatorF64(u8, FacetNumberOperator), + OperatorString(u8, FacetStringOperator), + Or(Box, Box), + And(Box, Box), + Not(Box), +} + +fn get_field_id_facet_type<'a>( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + items: &mut Pairs<'a, Rule>, +) -> Result<(u8, FacetType), PestError> +{ + // lexing ensures that we at least have a key + let key = items.next().unwrap(); + let field_id = fields_ids_map + .id(key.as_str()) + .ok_or_else(|| { + PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` not found, available attributes are: {}", + key.as_str(), + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") + ), + }, + key.as_span(), + ) + })?; + + let facet_type = faceted_fields + .get(&field_id) + .copied() + .ok_or_else(|| { + PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` is not faceted, available faceted attributes are: {}", + key.as_str(), + faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::>().join(", ") + ), + }, + key.as_span(), + ) + })?; + + Ok((field_id, facet_type)) +} + +impl FacetCondition { + pub fn from_str( + rtxn: &heed::RoTxn, + index: 
&Index, + expression: &str, + ) -> anyhow::Result + { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let faceted_fields = index.faceted_fields(rtxn)?; + let lexed = FilterParser::parse(Rule::prgm, expression)?; + FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed) + } + + fn from_pairs( + fim: &FieldsIdsMap, + ff: &HashMap, + expression: Pairs, + ) -> anyhow::Result + { + PREC_CLIMBER.climb( + expression, + |pair: Pair| match pair.as_rule() { + Rule::between => Ok(FacetCondition::between(fim, ff, pair)?), + Rule::eq => Ok(FacetCondition::equal(fim, ff, pair)?), + Rule::neq => Ok(Not(Box::new(FacetCondition::equal(fim, ff, pair)?))), + Rule::greater => Ok(FacetCondition::greater_than(fim, ff, pair)?), + Rule::geq => Ok(FacetCondition::greater_than_or_equal(fim, ff, pair)?), + Rule::less => Ok(FacetCondition::lower_than(fim, ff, pair)?), + Rule::leq => Ok(FacetCondition::lower_than_or_equal(fim, ff, pair)?), + Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), + Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), + Rule::not => Ok(Not(Box::new(Self::from_pairs(fim, ff, pair.into_inner())?))), + _ => unreachable!(), + }, + |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { + match op.as_rule() { + Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), + Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), + _ => unreachable!(), + } + }, + ) + } + + fn between( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let lvalue = items.next().unwrap(); + let rvalue = items.next().unwrap(); + match ftype { + FacetType::Integer => { + let lvalue = lvalue.as_str().parse()?; + let rvalue = rvalue.as_str().parse()?; + Ok(OperatorI64(fid, Between(lvalue, rvalue))) + }, + FacetType::Float => { + let lvalue = 
lvalue.as_str().parse()?; + let rvalue = rvalue.as_str().parse()?; + Ok(OperatorF64(fid, Between(lvalue, rvalue))) + }, + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } + + fn equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, Equal(value.as_str().parse()?))), + FacetType::Float => Ok(OperatorF64(fid, Equal(value.as_str().parse()?))), + FacetType::String => { + Ok(OperatorString(fid, FacetStringOperator::Equal(value.as_str().to_string()))) + }, + } + } + + fn greater_than( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(value.as_str().parse()?))), + FacetType::Float => Ok(OperatorF64(fid, GreaterThan(value.as_str().parse()?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } + + fn greater_than_or_equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, 
GreaterThanOrEqual(value.as_str().parse()?))), + FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(value.as_str().parse()?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } + + fn lower_than( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, LowerThan(value.as_str().parse()?))), + FacetType::Float => Ok(OperatorF64(fid, LowerThan(value.as_str().parse()?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } + + fn lower_than_or_equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(value.as_str().parse()?))), + FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(value.as_str().parse()?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } +} + +impl FacetCondition { + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. 
+ fn explore_facet_levels<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + field_id: u8, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> anyhow::Result<()> + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. + (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_levels::(rtxn, db, field_id, 0, left, right, output); + }, + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + + let mut left_found = None; + let mut right_found = None; + + // We must create a custom iterator to be able to iterate over the + // requested range as the range iterator cannot express some conditions. + let left_bound = match left { + Included(left) => Included((field_id, level, left, T::min_value())), + Excluded(left) => Excluded((field_id, level, left, T::min_value())), + Unbounded => Unbounded, + }; + let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + // We also make sure that we don't decode the data before we are sure we must return it. + let iter = db + .remap_key_type::() + .lazily_decode_data() + .range(rtxn, &(left_bound, right_bound))? 
+ .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { + match right { + Included(right) => *r <= right, + Excluded(right) => *r < right, + Unbounded => true, + } + })) + .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); + + debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + for (i, result) in iter.enumerate() { + let ((_fid, level, l, r), docids) = result?; + debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); + output.union_with(&docids); + // We save the leftest and rightest bounds we actually found at this level. + if i == 0 { left_found = Some(l); } + right_found = Some(r); + } + + // Can we go deeper? + let deeper_level = match level.checked_sub(1) { + Some(level) => level, + None => return Ok(()), + }; + + // We must refine the left and right bounds of this range by retrieving the + // missing part in a deeper level. + match left_found.zip(right_found) { + Some((left_found, right_found)) => { + // If the bound is satisfied we avoid calling this function again. + if !matches!(left, Included(l) if l == left_found) { + let sub_right = Excluded(left_found); + debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, sub_right, output)?; + } + if !matches!(right, Included(r) if r == right_found) { + let sub_left = Excluded(right_found); + debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + } + }, + None => { + // If we found nothing at this level it means that we must find + // the same bounds but at a deeper, more precise level. 
+                Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, right, output)?;
+            },
+        }
+
+        Ok(())
+    }
+
+    fn evaluate_number_operator<'t, T: 't, KC>(
+        rtxn: &'t heed::RoTxn,
+        db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
+        field_id: u8,
+        operator: FacetNumberOperator<T>,
+    ) -> anyhow::Result<RoaringBitmap>
+    where
+        T: Copy + PartialEq + PartialOrd + Bounded + Debug,
+        KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
+        KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
+    {
+        // Make sure we always bound the ranges with the field id and the level,
+        // as the facets values are all in the same database and prefixed by the
+        // field id and the level.
+        let (left, right) = match operator {
+            GreaterThan(val) => (Excluded(val), Included(T::max_value())),
+            GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())),
+            LowerThan(val) => (Included(T::min_value()), Excluded(val)),
+            LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)),
+            Equal(val) => (Included(val), Included(val)),
+            Between(left, right) => (Included(left), Included(right)),
+        };
+
+        // Ask for the biggest value that can exist for this specific field, if it exists
+        // that's fine; if it doesn't, the value just before will be returned instead.
+        let biggest_level = db
+            .remap_types::<KC, DecodeIgnore>()
+            .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))?
+            .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None });
+
+        match biggest_level {
+            Some(level) => {
+                let mut output = RoaringBitmap::new();
+                Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, level, left, right, &mut output)?;
+                Ok(output)
+            },
+            None => Ok(RoaringBitmap::new()),
+        }
+    }
+
+    fn evaluate_string_operator(
+        rtxn: &heed::RoTxn,
+        db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
+        field_id: u8,
+        operator: &FacetStringOperator,
+    ) -> anyhow::Result<RoaringBitmap>
+    {
+        match operator {
+            FacetStringOperator::Equal(string) => {
+                match db.get(rtxn, &(field_id, string))?
{ + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()) + } + } + } + } + + pub fn evaluate( + &self, + rtxn: &heed::RoTxn, + index: &Index, + ) -> anyhow::Result + { + let db = index.facet_field_id_value_docids; + match self { + OperatorI64(fid, op) => { + Self::evaluate_number_operator::(rtxn, db, *fid, *op) + }, + OperatorF64(fid, op) => { + Self::evaluate_number_operator::(rtxn, db, *fid, *op) + }, + OperatorString(fid, op) => { + let db = db.remap_key_type::(); + Self::evaluate_string_operator(rtxn, db, *fid, op) + }, + Or(lhs, rhs) => { + let lhs = lhs.evaluate(rtxn, index)?; + let rhs = rhs.evaluate(rtxn, index)?; + Ok(lhs | rhs) + }, + And(lhs, rhs) => { + let lhs = lhs.evaluate(rtxn, index)?; + let rhs = rhs.evaluate(rtxn, index)?; + Ok(lhs & rhs) + }, + Not(op) => { + // TODO is this right or is this wrong? because all documents ids are not faceted + // so doing that can return documents that are not faceted at all. + let all_documents_ids = index.documents_ids(rtxn)?; + let documents_ids = op.evaluate(rtxn, index)?; + Ok(all_documents_ids - documents_ids) + }, + } + } +} diff --git a/src/search/facet/parser.rs b/src/search/facet/parser.rs new file mode 100644 index 000000000..0e8bd23ac --- /dev/null +++ b/src/search/facet/parser.rs @@ -0,0 +1,12 @@ +use once_cell::sync::Lazy; +use pest::prec_climber::{Operator, Assoc, PrecClimber}; + +pub static PREC_CLIMBER: Lazy> = Lazy::new(|| { + use Assoc::*; + use Rule::*; + pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)]) +}); + +#[derive(Parser)] +#[grammar = "search/facet/grammar.pest"] +pub struct FilterParser; diff --git a/src/search/mod.rs b/src/search/mod.rs index d236e396a..7020fa838 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -155,9 +155,8 @@ impl<'a> Search<'a> { }; // We create the original candidates with the facet conditions results. 
-        let facet_db = self.index.facet_field_id_value_docids;
         let facet_candidates = match &self.facet_condition {
-            Some(condition) => Some(condition.evaluate(self.rtxn, facet_db)?),
+            Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?),
             None => None,
         };

From fc242f6e1f88c332db843a8d6af667a59ad6ea6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 22 Nov 2020 15:40:11 +0100
Subject: [PATCH 24/30] Rewrite the FacetCondition Debug impl in a defensive way

---
 src/search/mod.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/search/mod.rs b/src/search/mod.rs
index 7020fa838..af6ccaf26 100644
--- a/src/search/mod.rs
+++ b/src/search/mod.rs
@@ -210,11 +210,12 @@ impl<'a> Search<'a> {
 
 impl fmt::Debug for Search<'_> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let Search { query, facet_condition, offset, limit, rtxn: _, index: _ } = self;
         f.debug_struct("Search")
-            .field("query", &self.query)
-            .field("facet_condition", &self.facet_condition)
-            .field("offset", &self.offset)
-            .field("limit", &self.limit)
+            .field("query", query)
+            .field("facet_condition", facet_condition)
+            .field("offset", offset)
+            .field("limit", limit)
             .finish()
     }
 }

From 7370ef8c5ef940ad6b0979312616790f3f28fcd3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 22 Nov 2020 15:40:41 +0100
Subject: [PATCH 25/30] Add two simple tests to the facet FacetCondition struct
 construction

---
 src/search/facet/mod.rs | 50 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs
index b1d527337..c39de878b 100644
--- a/src/search/facet/mod.rs
+++ b/src/search/facet/mod.rs
@@ -474,3 +474,53 @@ impl FacetCondition {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::update::Settings;
+    use heed::EnvOpenOptions;
+    use maplit::hashmap;
+
+    #[test]
+    fn simple_string_equal() {
+        let path = tempfile::tempdir().unwrap();
+ let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); + let expected = FacetCondition::OperatorString(1, FacetStringOperator::Equal("ponce".into())); + assert_eq!(condition, expected); + } + + #[test] + fn simple_between_i64() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. 
+ let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); + let expected = FacetCondition::OperatorI64(1, FacetNumberOperator::Between(22, 44)); + assert_eq!(condition, expected); + } +} From fc686aaca781c36df7d856d10535151b96277a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Nov 2020 13:08:57 +0100 Subject: [PATCH 26/30] Use the De Morgan law to simplify the NOT operation --- http-ui/src/main.rs | 12 +-- src/index.rs | 22 ++++++ src/search/facet/mod.rs | 96 +++++++++++++++++------ src/update/clear_documents.rs | 6 ++ src/update/delete_documents.rs | 8 ++ src/update/{facet_levels.rs => facets.rs} | 80 ++++++++++++++----- src/update/index_documents/mod.rs | 4 +- src/update/mod.rs | 4 +- src/update/update_builder.rs | 8 +- 9 files changed, 182 insertions(+), 58 deletions(-) rename src/update/{facet_levels.rs => facets.rs} (80%) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ca1ddcd45..1c5385b14 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -197,7 +197,7 @@ enum UpdateMeta { DocumentsAddition { method: String, format: String }, ClearDocuments, Settings(Settings), - FacetLevels(FacetLevels), + Facets(Facets), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -236,7 +236,7 @@ struct Settings { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] -struct FacetLevels { +struct Facets { last_level_size: Option, number_of_levels: Option, easing_function: Option, @@ -411,10 +411,10 @@ async fn main() -> anyhow::Result<()> { Err(e) => Err(e.into()) } }, - UpdateMeta::FacetLevels(levels) => { + UpdateMeta::Facets(levels) => { // We must use the write transaction of the update here. 
let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.facet_levels(&mut wtxn, &index_cloned); + let mut builder = update_builder.facets(&mut wtxn, &index_cloned); if let Some(value) = levels.last_level_size { builder.last_level_size(value); } @@ -806,8 +806,8 @@ async fn main() -> anyhow::Result<()> { let change_facet_levels_route = warp::filters::method::post() .and(warp::path!("facet-levels")) .and(warp::body::json()) - .map(move |levels: FacetLevels| { - let meta = UpdateMeta::FacetLevels(levels); + .map(move |levels: Facets| { + let meta = UpdateMeta::Facets(levels); let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); eprintln!("update {} registered", update_id); diff --git a/src/index.rs b/src/index.rs index ccaba4ca6..b21c7d39b 100644 --- a/src/index.rs +++ b/src/index.rs @@ -18,6 +18,7 @@ use crate::{ pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; +pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -224,6 +225,27 @@ impl Index { Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) } + /* faceted documents ids */ + + /// Writes the documents ids that are faceted under this field id. 
+ pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: u8, docids: &RoaringBitmap) -> heed::Result<()> { + let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + *buffer.last_mut().unwrap() = field_id; + self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) + } + + /// Retrieve all the documents ids that faceted under this field id. + pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: u8) -> heed::Result { + let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + *buffer.last_mut().unwrap() = field_id; + match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* words fst */ /// Writes the FST which is the words dictionnary of the engine. diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index c39de878b..1d20ca1bd 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -26,15 +26,42 @@ mod parser; pub enum FacetNumberOperator { GreaterThan(T), GreaterThanOrEqual(T), + Equal(T), + NotEqual(T), LowerThan(T), LowerThanOrEqual(T), - Equal(T), Between(T, T), } +impl FacetNumberOperator { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). 
+ fn negate(self) -> (Self, Option) { + match self { + GreaterThan(x) => (LowerThanOrEqual(x), None), + GreaterThanOrEqual(x) => (LowerThan(x), None), + Equal(x) => (NotEqual(x), None), + NotEqual(x) => (Equal(x), None), + LowerThan(x) => (GreaterThanOrEqual(x), None), + LowerThanOrEqual(x) => (GreaterThan(x), None), + Between(x, y) => (LowerThan(x), Some(GreaterThan(y))), + } + } +} + #[derive(Debug, Clone, PartialEq)] pub enum FacetStringOperator { Equal(String), + NotEqual(String), +} + +impl FacetStringOperator { + fn negate(self) -> Self { + match self { + FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x), + FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x), + } + } } #[derive(Debug, Clone, PartialEq)] @@ -44,7 +71,6 @@ pub enum FacetCondition { OperatorString(u8, FacetStringOperator), Or(Box, Box), And(Box, Box), - Not(Box), } fn get_field_id_facet_type<'a>( @@ -106,24 +132,24 @@ impl FacetCondition { fim: &FieldsIdsMap, ff: &HashMap, expression: Pairs, - ) -> anyhow::Result + ) -> anyhow::Result { PREC_CLIMBER.climb( expression, |pair: Pair| match pair.as_rule() { - Rule::between => Ok(FacetCondition::between(fim, ff, pair)?), - Rule::eq => Ok(FacetCondition::equal(fim, ff, pair)?), - Rule::neq => Ok(Not(Box::new(FacetCondition::equal(fim, ff, pair)?))), - Rule::greater => Ok(FacetCondition::greater_than(fim, ff, pair)?), - Rule::geq => Ok(FacetCondition::greater_than_or_equal(fim, ff, pair)?), - Rule::less => Ok(FacetCondition::lower_than(fim, ff, pair)?), - Rule::leq => Ok(FacetCondition::lower_than_or_equal(fim, ff, pair)?), + Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), + Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), + Rule::eq => Ok(Self::equal(fim, ff, pair)?), + Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), + Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), + Rule::less => Ok(Self::lower_than(fim, ff, pair)?), + Rule::between => Ok(Self::between(fim, ff, pair)?), + 
Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), - Rule::not => Ok(Not(Box::new(Self::from_pairs(fim, ff, pair.into_inner())?))), _ => unreachable!(), }, - |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { + |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { match op.as_rule() { Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), @@ -133,6 +159,22 @@ impl FacetCondition { ) } + fn negate(self) -> FacetCondition { + match self { + OperatorI64(fid, op) => match op.negate() { + (op, None) => OperatorI64(fid, op), + (a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))), + }, + OperatorF64(fid, op) => match op.negate() { + (op, None) => OperatorF64(fid, op), + (a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))), + }, + OperatorString(fid, op) => OperatorString(fid, op.negate()), + Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), + And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), + } + } + fn between( fields_ids_map: &FieldsIdsMap, faceted_fields: &HashMap, @@ -381,6 +423,7 @@ impl FacetCondition { fn evaluate_number_operator<'t, T: 't, KC>( rtxn: &'t heed::RoTxn, + index: &Index, db: heed::Database, field_id: u8, operator: FacetNumberOperator, @@ -396,9 +439,14 @@ impl FacetCondition { let (left, right) = match operator { GreaterThan(val) => (Excluded(val), Included(T::max_value())), GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), + Equal(val) => (Included(val), Included(val)), + NotEqual(val) => { + let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; + let docids = Self::evaluate_number_operator::(rtxn, index, db, field_id, Equal(val))?; + return Ok(all_documents_ids - docids); + }, LowerThan(val) => (Included(T::min_value()), Excluded(val)), 
LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), - Equal(val) => (Included(val), Included(val)), Between(left, right) => (Included(left), Included(right)), }; @@ -421,6 +469,7 @@ impl FacetCondition { fn evaluate_string_operator( rtxn: &heed::RoTxn, + index: &Index, db: heed::Database, field_id: u8, operator: &FacetStringOperator, @@ -432,7 +481,13 @@ impl FacetCondition { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()) } - } + }, + FacetStringOperator::NotEqual(string) => { + let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; + let op = FacetStringOperator::Equal(string.clone()); + let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?; + return Ok(all_documents_ids - docids); + }, } } @@ -445,14 +500,14 @@ impl FacetCondition { let db = index.facet_field_id_value_docids; match self { OperatorI64(fid, op) => { - Self::evaluate_number_operator::(rtxn, db, *fid, *op) + Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) }, OperatorF64(fid, op) => { - Self::evaluate_number_operator::(rtxn, db, *fid, *op) + Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) }, OperatorString(fid, op) => { let db = db.remap_key_type::(); - Self::evaluate_string_operator(rtxn, db, *fid, op) + Self::evaluate_string_operator(rtxn, index, db, *fid, op) }, Or(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; @@ -464,13 +519,6 @@ impl FacetCondition { let rhs = rhs.evaluate(rtxn, index)?; Ok(lhs & rhs) }, - Not(op) => { - // TODO is this right or is this wrong? because all documents ids are not faceted - // so doing that can return documents that are not faceted at all. 
- let all_documents_ids = index.documents_ids(rtxn)?; - let documents_ids = op.evaluate(rtxn, index)?; - Ok(all_documents_ids - documents_ids) - }, } } } diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index 447dca8b4..5dc14f97d 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -24,12 +24,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We retrieve the number of documents ids that we are deleting. let number_of_documents = self.index.number_of_documents(self.wtxn)?; + let faceted_fields = self.index.faceted_fields(self.wtxn)?; // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; + // We clean all the faceted documents ids. + for (field_id, _) in faceted_fields { + self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?; + } + // Clear the other databases. word_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index 9924080f2..b1db4f94c 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -184,6 +184,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // Remove the documents ids from the faceted documents ids. + let faceted_fields = self.index.faceted_fields(self.wtxn)?; + for (field_id, _) in faceted_fields { + let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; + docids.difference_with(&self.documents_ids); + self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; + } + // We delete the documents ids that are under the facet field id values. 
let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; while let Some(result) = iter.next() { diff --git a/src/update/facet_levels.rs b/src/update/facets.rs similarity index 80% rename from src/update/facet_levels.rs rename to src/update/facets.rs index 4a7769b7a..96a7e825e 100644 --- a/src/update/facet_levels.rs +++ b/src/update/facets.rs @@ -24,7 +24,7 @@ pub enum EasingName { Linear, } -pub struct FacetLevels<'t, 'u, 'i> { +pub struct Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, pub(crate) chunk_compression_type: CompressionType, @@ -35,9 +35,9 @@ pub struct FacetLevels<'t, 'u, 'i> { easing_function: EasingName, } -impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> FacetLevels<'t, 'u, 'i> { - FacetLevels { +impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { + Facets { wtxn, index, chunk_compression_type: CompressionType::None, @@ -70,7 +70,7 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); for (field_id, facet_type) in faceted_fields { - let content = match facet_type { + let (content, documents_ids) = match facet_type { FacetType::Integer => { clear_field_levels::( self.wtxn, @@ -78,7 +78,13 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { field_id, )?; - compute_facet_levels::( + let documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + let content = compute_facet_levels::( self.wtxn, self.index.facet_field_id_value_docids, self.chunk_compression_type, @@ -88,7 +94,9 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { self.number_of_levels, self.easing_function, field_id, - )? 
+ )?; + + (Some(content), documents_ids) }, FacetType::Float => { clear_field_levels::( @@ -97,7 +105,13 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { field_id, )?; - compute_facet_levels::( + let documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + let content = compute_facet_levels::( self.wtxn, self.index.facet_field_id_value_docids, self.chunk_compression_type, @@ -107,18 +121,32 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { self.number_of_levels, self.easing_function, field_id, - )? + )?; + + (Some(content), documents_ids) + }, + FacetType::String => { + let documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + (None, documents_ids) }, - FacetType::String => continue, }; - write_into_lmdb_database( - self.wtxn, - *self.index.facet_field_id_value_docids.as_polymorph(), - content, - |_, _| anyhow::bail!("invalid facet level merging"), - WriteMethod::GetMergePut, - )?; + if let Some(content) = content { + write_into_lmdb_database( + self.wtxn, + *self.index.facet_field_id_value_docids.as_polymorph(), + content, + |_, _| anyhow::bail!("invalid facet level merging"), + WriteMethod::GetMergePut, + )?; + } + + self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?; } Ok(()) @@ -138,9 +166,7 @@ where let left = (field_id, 1, T::min_value(), T::min_value()); let right = (field_id, u8::MAX, T::max_value(), T::max_value()); let range = left..=right; - db.remap_key_type::() - .delete_range(wtxn, &range) - .map(drop) + db.remap_key_type::().delete_range(wtxn, &range).map(drop) } fn compute_facet_levels<'t, T: 't, KC>( @@ -217,6 +243,20 @@ where writer_into_reader(writer, shrink_size) } +fn compute_faceted_documents_ids( + rtxn: &heed::RoTxn, + db: heed::Database, + field_id: u8, +) -> anyhow::Result +{ + let mut documents_ids = RoaringBitmap::new(); + for result in db.prefix_iter(rtxn, &[field_id])? 
{ + let (_key, docids) = result?; + documents_ids.union_with(&docids); + } + Ok(documents_ids) +} + fn write_entry( writer: &mut Writer, field_id: u8, diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index d48696e94..362175ce5 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -16,7 +16,7 @@ use rayon::prelude::*; use rayon::ThreadPool; use crate::index::Index; -use crate::update::{FacetLevels, UpdateIndexingStep}; +use crate::update::{Facets, UpdateIndexingStep}; use self::store::{Store, Readers}; use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, @@ -584,7 +584,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); } - let mut builder = FacetLevels::new(self.wtxn, self.index); + let mut builder = Facets::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; diff --git a/src/update/mod.rs b/src/update/mod.rs index 87035065c..416e88464 100644 --- a/src/update/mod.rs +++ b/src/update/mod.rs @@ -1,7 +1,7 @@ mod available_documents_ids; mod clear_documents; mod delete_documents; -mod facet_levels; +mod facets; mod index_documents; mod settings; mod update_builder; @@ -12,7 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat}; -pub use self::facet_levels::{FacetLevels, EasingName}; +pub use self::facets::{Facets, EasingName}; pub use self::settings::Settings; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; diff --git a/src/update/update_builder.rs b/src/update/update_builder.rs index 8f7f1a0a8..b973bd535 100644 --- 
a/src/update/update_builder.rs +++ b/src/update/update_builder.rs @@ -2,7 +2,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, FacetLevels}; +use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -119,13 +119,13 @@ impl<'a> UpdateBuilder<'a> { builder } - pub fn facet_levels<'t, 'u, 'i>( + pub fn facets<'t, 'u, 'i>( self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> FacetLevels<'t, 'u, 'i> + ) -> Facets<'t, 'u, 'i> { - let mut builder = FacetLevels::new(wtxn, index); + let mut builder = Facets::new(wtxn, index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; From 54d5cec582ccb9759f6cc36c30fbe3083eefa58c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Nov 2020 15:13:47 +0100 Subject: [PATCH 27/30] Transform numbers into strings when faceted and necessary --- src/update/index_documents/store.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 6fb07b345..25c343910 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -571,7 +571,10 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result Ok(()), Value::Bool(b) => Ok(output.push(Integer(*b as i64))), Value::Number(number) => match ftype { - FacetType::String => bail!("invalid facet type, expecting {} found number", ftype), + FacetType::String => { + let string = SmallString32::from(number.to_string()); + Ok(output.push(String(string))) + }, FacetType::Float => match number.as_f64() { Some(float) => Ok(output.push(Float(OrderedFloat(float)))), None => bail!("invalid facet type, expecting {} found integer", ftype), From 
a50f63840f298bd5addb1a34df0c65cdfd326958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Nov 2020 15:42:12 +0100 Subject: [PATCH 28/30] Return spanned pest error while parsing numbers in facet filters --- src/search/facet/mod.rs | 44 ++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index 1d20ca1bd..f1286b964 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fmt::Debug; use std::ops::Bound::{self, Unbounded, Included, Excluded}; +use std::str::FromStr; use heed::types::{ByteSlice, DecodeIgnore}; use log::debug; @@ -115,6 +116,21 @@ fn get_field_id_facet_type<'a>( Ok((field_id, facet_type)) } +fn pest_parse(pair: Pair) -> Result> +where T: FromStr, + T::Err: ToString, +{ + match pair.as_str().parse() { + Ok(value) => Ok(value), + Err(e) => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { message: e.to_string() }, + pair.as_span(), + )) + } + } +} + impl FacetCondition { pub fn from_str( rtxn: &heed::RoTxn, @@ -188,13 +204,13 @@ impl FacetCondition { let rvalue = items.next().unwrap(); match ftype { FacetType::Integer => { - let lvalue = lvalue.as_str().parse()?; - let rvalue = rvalue.as_str().parse()?; + let lvalue = pest_parse(lvalue)?; + let rvalue = pest_parse(rvalue)?; Ok(OperatorI64(fid, Between(lvalue, rvalue))) }, FacetType::Float => { - let lvalue = lvalue.as_str().parse()?; - let rvalue = rvalue.as_str().parse()?; + let lvalue = pest_parse(lvalue)?; + let rvalue = pest_parse(rvalue)?; Ok(OperatorF64(fid, Between(lvalue, rvalue))) }, FacetType::String => { @@ -218,8 +234,8 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, Equal(value.as_str().parse()?))), - FacetType::Float => 
Ok(OperatorF64(fid, Equal(value.as_str().parse()?))), + FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))), FacetType::String => { Ok(OperatorString(fid, FacetStringOperator::Equal(value.as_str().to_string()))) }, @@ -237,8 +253,8 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(value.as_str().parse()?))), - FacetType::Float => Ok(OperatorF64(fid, GreaterThan(value.as_str().parse()?))), + FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))), FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { @@ -261,8 +277,8 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(value.as_str().parse()?))), - FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(value.as_str().parse()?))), + FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))), FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { @@ -285,8 +301,8 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, LowerThan(value.as_str().parse()?))), - FacetType::Float => Ok(OperatorF64(fid, LowerThan(value.as_str().parse()?))), + FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))), 
FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { @@ -309,8 +325,8 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(value.as_str().parse()?))), - FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(value.as_str().parse()?))), + FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(pest_parse(value)?))), FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { From 276c87af6875f4b85ae584cc0a3a36c9df09dce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Nov 2020 16:37:55 +0100 Subject: [PATCH 29/30] Introduce more test to the FacetCondition struct --- src/search/facet/mod.rs | 73 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index f1286b964..06f543dcc 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -547,7 +547,7 @@ mod tests { use maplit::hashmap; #[test] - fn simple_string_equal() { + fn string() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -563,12 +563,20 @@ mod tests { // Test that the facet condition is correctly generated. 
let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); - let expected = FacetCondition::OperatorString(1, FacetStringOperator::Equal("ponce".into())); + let expected = OperatorString(1, FacetStringOperator::Equal("ponce".into())); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); + let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into())); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); + let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into())); assert_eq!(condition, expected); } #[test] - fn simple_between_i64() { + fn i64() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -584,7 +592,64 @@ mod tests { // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = FacetCondition::OperatorI64(1, FacetNumberOperator::Between(22, 44)); + let expected = OperatorI64(1, Between(22, 44)); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); + let expected = Or( + Box::new(OperatorI64(1, LowerThan(22))), + Box::new(OperatorI64(1, GreaterThan(44))), + ); + assert_eq!(condition, expected); + } + + #[test] + fn parentheses() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. 
+ let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order + builder.set_faceted_fields(hashmap!{ + "channel".into() => "string".into(), + "timestamp".into() => "integer".into(), + }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str( + &rtxn, &index, + "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", + ).unwrap(); + let expected = Or( + Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))), + Box::new(And( + Box::new(OperatorI64(1, Between(22, 44))), + Box::new(OperatorString(0, FacetStringOperator::NotEqual("ponce".into()))), + )) + ); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str( + &rtxn, &index, + "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", + ).unwrap(); + let expected = Or( + Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))), + Box::new(Or( + Box::new(Or( + Box::new(OperatorI64(1, LowerThan(22))), + Box::new(OperatorI64(1, GreaterThan(44))), + )), + Box::new(OperatorString(0, FacetStringOperator::Equal("ponce".into()))), + )), + ); assert_eq!(condition, expected); } } From ba4ba685f99c5543d0d16393a6851352899149ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 28 Nov 2020 12:43:43 +0100 Subject: [PATCH 30/30] Make the facet levels maps to previous level groups and don't split them --- http-ui/src/main.rs | 32 +++------ src/update/facets.rs | 116 +++++++----------------------- src/update/index_documents/mod.rs | 16 ++--- src/update/mod.rs | 2 +- 4 files changed, 41 insertions(+), 125 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 1c5385b14..80402f0a0 100644 --- a/http-ui/src/main.rs +++ 
b/http-ui/src/main.rs @@ -28,7 +28,7 @@ use warp::{Filter, http::Response}; use milli::tokenizer::{simple_tokenizer, TokenType}; use milli::update::UpdateIndexingStep::*; -use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat, EasingName}; +use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -237,9 +237,8 @@ struct Settings { #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] struct Facets { - last_level_size: Option, - number_of_levels: Option, - easing_function: Option, + level_group_size: Option, + min_level_size: Option, } // Any value that is present is considered Some value, including null. @@ -415,27 +414,12 @@ async fn main() -> anyhow::Result<()> { // We must use the write transaction of the update here. let mut wtxn = index_cloned.write_txn()?; let mut builder = update_builder.facets(&mut wtxn, &index_cloned); - if let Some(value) = levels.last_level_size { - builder.last_level_size(value); + if let Some(value) = levels.level_group_size { + builder.level_group_size(value); } - if let Some(value) = levels.number_of_levels { - builder.number_of_levels(value); + if let Some(value) = levels.min_level_size { + builder.min_level_size(value); } - if let Some(value) = levels.easing_function { - let easing_name = if value.eq_ignore_ascii_case("expo") { - EasingName::Expo - } else if value.eq_ignore_ascii_case("quart") { - EasingName::Quart - } else if value.eq_ignore_ascii_case("circ") { - EasingName::Circ - } else if value.eq_ignore_ascii_case("linear") { - EasingName::Linear - } else { - panic!("Invalid easing function name") - }; - builder.easing_function(easing_name); - } - match builder.execute() { Ok(()) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) @@ -804,7 +788,7 @@ async fn main() -> anyhow::Result<()> { let update_store_cloned = update_store.clone(); let 
update_status_sender_cloned = update_status_sender.clone(); let change_facet_levels_route = warp::filters::method::post() - .and(warp::path!("facet-levels")) + .and(warp::path!("facet-level-sizes")) .and(warp::body::json()) .map(move |levels: Facets| { let meta = UpdateMeta::Facets(levels); diff --git a/src/update/facets.rs b/src/update/facets.rs index 96a7e825e..e26f030df 100644 --- a/src/update/facets.rs +++ b/src/update/facets.rs @@ -1,10 +1,10 @@ +use std::cmp; use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; -use itertools::Itertools; use log::debug; use num_traits::{Bounded, Zero}; use roaring::RoaringBitmap; @@ -16,23 +16,14 @@ use crate::Index; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; -#[derive(Debug, Copy, Clone)] -pub enum EasingName { - Expo, - Quart, - Circ, - Linear, -} - pub struct Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, - number_of_levels: NonZeroUsize, - last_level_size: NonZeroUsize, - easing_function: EasingName, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, } impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { @@ -43,24 +34,18 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { chunk_compression_type: CompressionType::None, chunk_compression_level: None, chunk_fusing_shrink_size: None, - number_of_levels: NonZeroUsize::new(5).unwrap(), - last_level_size: NonZeroUsize::new(5).unwrap(), - easing_function: EasingName::Expo, + level_group_size: NonZeroUsize::new(4).unwrap(), + min_level_size: NonZeroUsize::new(5).unwrap(), } } - pub fn number_of_levels(&mut self, value: NonZeroUsize) -> &mut Self { - self.number_of_levels = value; + pub fn 
level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); self } - pub fn last_level_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.last_level_size = value; - self - } - - pub fn easing_function(&mut self, value: EasingName) -> &mut Self { - self.easing_function = value; + pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.min_level_size = value; self } @@ -90,9 +75,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, - self.last_level_size, - self.number_of_levels, - self.easing_function, + self.level_group_size, + self.min_level_size, field_id, )?; @@ -117,9 +101,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, - self.last_level_size, - self.number_of_levels, - self.easing_function, + self.level_group_size, + self.min_level_size, field_id, )?; @@ -175,9 +158,8 @@ fn compute_facet_levels<'t, T: 't, KC>( compression_type: CompressionType, compression_level: Option, shrink_size: Option, - last_level_size: NonZeroUsize, - number_of_levels: NonZeroUsize, - easing_function: EasingName, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, field_id: u8, ) -> anyhow::Result> where @@ -201,15 +183,13 @@ where left..=right }; - let level_sizes_iter = - levels_iterator(first_level_size, last_level_size.get(), number_of_levels.get(), easing_function) - .map(|size| (first_level_size as f64 / size as f64).ceil() as usize) - .unique() - .enumerate() - .skip(1); + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) 
+ .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - // TODO we must not create levels with identical group sizes. - for (level, level_entry_sizes) in level_sizes_iter { + for (level, group_size) in group_size_iter { let mut left = T::zero(); let mut right = T::zero(); let mut group_docids = RoaringBitmap::new(); @@ -220,10 +200,10 @@ where if i == 0 { left = value; - } else if i % level_entry_sizes == 0 { + } else if i % group_size == 0 { // we found the first bound of the next group, we must store the left // and right bounds associated with the docids. - write_entry::(&mut writer, field_id, level as u8, left, right, &group_docids)?; + write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; // We save the left bound for the new group and also reset the docids. group_docids = RoaringBitmap::new(); @@ -236,7 +216,7 @@ where } if !group_docids.is_empty() { - write_entry::(&mut writer, field_id, level as u8, left, right, &group_docids)?; + write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; } } @@ -274,51 +254,3 @@ where writer.insert(&key, &data)?; Ok(()) } - -fn levels_iterator( - first_level_size: usize, // biggest level - last_level_size: usize, // smallest level - number_of_levels: usize, - easing_function: EasingName, -) -> impl Iterator -{ - let easing_function = match easing_function { - EasingName::Expo => ease_out_expo, - EasingName::Quart => ease_out_quart, - EasingName::Circ => ease_out_circ, - EasingName::Linear => ease_out_linear, - }; - - let b = last_level_size as f64; - let end = first_level_size as f64; - let c = end - b; - let d = number_of_levels; - (0..=d).map(move |t| ((end + b) - easing_function(t as f64, b, c, d as f64)) as usize) -} - -// Go look at the function definitions here: -// https://docs.rs/easer/0.2.1/easer/index.html -// https://easings.net/#easeOutExpo -fn ease_out_expo(t: f64, b: f64, c: f64, d: f64) -> f64 { - if t 
== d { - b + c - } else { - c * (-2.0_f64.powf(-10.0 * t / d) + 1.0) + b - } -} - -// https://easings.net/#easeOutCirc -fn ease_out_circ(t: f64, b: f64, c: f64, d: f64) -> f64 { - let t = t / d - 1.0; - c * (1.0 - t * t).sqrt() + b -} - -// https://easings.net/#easeOutQuart -fn ease_out_quart(t: f64, b: f64, c: f64, d: f64) -> f64 { - let t = t / d - 1.0; - -c * ((t * t * t * t) - 1.0) + b -} - -fn ease_out_linear(t: f64, b: f64, c: f64, d: f64) -> f64 { - c * t / d + b -} diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 362175ce5..4a3ec43f9 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -208,8 +208,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, - facet_number_of_levels: Option, - facet_last_level_size: Option, + facet_level_group_size: Option, + facet_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, @@ -228,8 +228,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_level: None, chunk_fusing_shrink_size: None, thread_pool: None, - facet_number_of_levels: None, - facet_last_level_size: None, + facet_level_group_size: None, + facet_min_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, autogenerate_docids: true, @@ -588,11 +588,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - if let Some(value) = self.facet_number_of_levels { - builder.number_of_levels(value); + if let Some(value) = self.facet_level_group_size { + builder.level_group_size(value); } - if let Some(value) = self.facet_last_level_size { - 
builder.last_level_size(value); + if let Some(value) = self.facet_min_level_size { + builder.min_level_size(value); } builder.execute()?; diff --git a/src/update/mod.rs b/src/update/mod.rs index 416e88464..d05396f00 100644 --- a/src/update/mod.rs +++ b/src/update/mod.rs @@ -12,7 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat}; -pub use self::facets::{Facets, EasingName}; +pub use self::facets::Facets; pub use self::settings::Settings; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep;