diff --git a/Cargo.lock b/Cargo.lock
index ddb2e9ec5..70128cfa9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -45,6 +45,27 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
 
+[[package]]
+name = "block-buffer"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b"
+dependencies = [
+ "block-padding",
+ "byte-tools",
+ "byteorder",
+ "generic-array",
+]
+
+[[package]]
+name = "block-padding"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5"
+dependencies = [
+ "byte-tools",
+]
+
 [[package]]
 name = "bstr"
 version = "0.2.13"
@@ -63,6 +84,12 @@ version = "3.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820"
 
+[[package]]
+name = "byte-tools"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7"
+
 [[package]]
 name = "byteorder"
 version = "1.3.4"
@@ -285,12 +312,27 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "digest"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5"
+dependencies = [
+ "generic-array",
+]
+
 [[package]]
 name = "either"
 version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
 
+[[package]]
+name = "fake-simd"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
+
 [[package]]
 name = "flate2"
 version = "1.0.17"
@@ -324,6 +366,15 @@ dependencies = [
  "byteorder",
 ]
 
+[[package]]
+name = "generic-array"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec"
+dependencies = [
+ "typenum",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.1.14"
@@ -378,9 +429,9 @@ dependencies = [
 
 [[package]]
 name = "heed"
-version = "0.10.1"
+version = "0.10.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797"
+checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616"
 dependencies = [
  "byteorder",
  "heed-traits",
@@ -617,9 +668,12 @@ dependencies = [
  "maplit",
  "memmap",
  "near-proximity",
+ "num-traits",
  "obkv",
  "once_cell",
  "ordered-float",
+ "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)",
+ "pest_derive",
  "rayon",
  "ringtail",
  "roaring",
@@ -675,9 +729,9 @@ dependencies = [
 
 [[package]]
 name = "num-traits"
-version = "0.2.12"
+version = "0.2.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611"
+checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
 dependencies = [
  "autocfg",
 ]
@@ -716,6 +770,12 @@ version = "11.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c"
 
+[[package]]
+name = "opaque-debug"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c"
+
 [[package]]
 name = "ordered-float"
 version = "2.0.0"
@@ -741,6 +801,57 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
 
+[[package]]
+name = "pest"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
+dependencies = [
+ "ucd-trie",
+]
+
+[[package]]
+name = "pest"
+version = "2.1.3"
+source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
+dependencies = [
+ "ucd-trie",
+]
+
+[[package]]
+name = "pest_derive"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0"
+dependencies = [
+ "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "pest_generator",
+]
+
+[[package]]
+name = "pest_generator"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55"
+dependencies = [
+ "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "pest_meta",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pest_meta"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d"
+dependencies = [
+ "maplit",
+ "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "sha-1",
+]
+
 [[package]]
 name = "pkg-config"
 version = "0.3.19"
@@ -1025,6 +1136,18 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "sha-1"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df"
+dependencies = [
+ "block-buffer",
+ "digest",
+ "fake-simd",
+ "opaque-debug",
+]
+
 [[package]]
 name = "slice-group-by"
 version = "0.2.6"
@@ -1233,6 +1356,18 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
 
+[[package]]
+name = "typenum"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33"
+
+[[package]]
+name = "ucd-trie"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
+
 [[package]]
 name = "unicode-bidi"
 version = "0.3.4"
diff --git a/Cargo.toml b/Cargo.toml
index fd453a5f2..37c83b4f0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,13 +14,14 @@ flate2 = "1.0.17"
 fst = "0.4.4"
 fxhash = "0.2.1"
 grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
-heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
+heed = { version = "0.10.4", default-features = false, features = ["lmdb", "sync-read-txn"] }
 human_format = "1.0.3"
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.3"
 memmap = "0.7.0"
 near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
+num-traits = "0.2.14"
 obkv = "0.1.0"
 once_cell = "1.4.0"
 ordered-float = "2.0.0"
@@ -36,6 +37,10 @@ structopt = { version = "0.3.14", default-features = false, features = ["wrap_he
 tempfile = "3.1.0"
 uuid = { version = "0.8.1", features = ["v4"] }
 
+# facet filter parser
+pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" }
+pest_derive = "2.1.0"
+
 # documents words self-join
 itertools = "0.9.0"
diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock
index 23fed5bbe..b15700ce5 100644
--- a/http-ui/Cargo.lock
+++ b/http-ui/Cargo.lock
@@ -654,9 +654,9 @@ dependencies = [
 
 [[package]]
 name = "heed"
-version = "0.10.1"
+version = "0.10.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797"
+checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616"
 dependencies = [
  "byteorder",
  "heed-traits",
@@ -934,6 +934,12 @@ dependencies = [
  "cfg-if 0.1.10",
 ]
 
+[[package]]
+name = "maplit"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
+
 [[package]]
 name = "matches"
 version = "0.1.8"
@@ -987,9 +993,12 @@ dependencies = [
  "log",
  "memmap",
  "near-proximity",
+ "num-traits",
  "obkv",
  "once_cell",
  "ordered-float",
+ "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)",
+ "pest_derive",
  "rayon",
  "ringtail",
  "roaring",
@@ -1231,6 +1240,57 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
 
+[[package]]
+name = "pest"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
+dependencies = [
+ "ucd-trie",
+]
+
+[[package]]
+name = "pest"
+version = "2.1.3"
+source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
+dependencies = [
+ "ucd-trie",
+]
+
+[[package]]
+name = "pest_derive"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0"
+dependencies = [
+ "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "pest_generator",
+]
+
+[[package]]
+name = "pest_generator"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55"
+dependencies = [
+ "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "pest_meta",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pest_meta"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d"
+dependencies = [
+ "maplit",
+ "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "sha-1 0.8.2",
+]
+
 [[package]]
 name = "pin-project"
 version = "0.4.27"
@@ -2024,6 +2084,12 @@ version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33"
 
+[[package]]
+name = "ucd-trie"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
+
 [[package]]
 name = "unicase"
 version = "2.6.0"
diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 7e28e1211..b30fb95c2 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -8,7 +8,7 @@ edition = "2018"
 [dependencies]
 anyhow = "1.0.28"
 grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
-heed = "0.10.1"
+heed = "0.10.4"
 memmap = "0.7.0"
 milli = { path = ".." }
 once_cell = "1.4.1"
diff --git a/http-ui/public/script.js b/http-ui/public/script.js
index 70b9e4da1..fb7a95cc9 100644
--- a/http-ui/public/script.js
+++ b/http-ui/public/script.js
@@ -1,8 +1,9 @@
 var request = null;
 var timeoutID = null;
 
-$('#search').on('input', function () {
-    var query = $(this).val();
+$('#query, #facet').on('input', function () {
+    var query = $('#query').val();
+    var facet = $('#facet').val();
     var timeoutMs = 100;
 
     if (timeoutID !== null) {
@@ -14,7 +15,7 @@
             type: "POST",
             url: "query",
             contentType: 'application/json',
-            data: JSON.stringify({ 'query': query }),
+            data: JSON.stringify({ 'query': query, 'facetCondition': facet }),
             contentType: 'application/json',
             success: function (data, textStatus, request) {
                 results.innerHTML = '';
@@ -77,5 +78,5 @@ $('#db-size').text(function(index, text) {
 // We trigger the input when we load the script, this way
 // we execute a placeholder search when the input is empty.
 $(window).on('load', function () {
-    $('#search').trigger('input');
+    $('#query').trigger('input');
 });
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index b730344f2..80402f0a0 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -2,6 +2,7 @@ use std::borrow::Cow;
 use std::collections::{HashMap, HashSet};
 use std::fs::{File, create_dir_all};
 use std::net::SocketAddr;
+use std::num::NonZeroUsize;
 use std::path::PathBuf;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -28,7 +29,7 @@ use warp::{Filter, http::Response};
 use milli::tokenizer::{simple_tokenizer, TokenType};
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
-use milli::{obkv_to_json, Index, UpdateStore, SearchResult};
+use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
 
 static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
 
@@ -196,6 +197,7 @@ enum UpdateMeta {
     DocumentsAddition { method: String, format: String },
     ClearDocuments,
     Settings(Settings),
+    Facets(Facets),
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -231,6 +233,14 @@ struct Settings {
     faceted_attributes: Option<HashMap<String, String>>,
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(deny_unknown_fields)]
+#[serde(rename_all = "camelCase")]
+struct Facets {
+    level_group_size: Option<NonZeroUsize>,
+    min_level_size: Option<NonZeroUsize>,
+}
+
 // Any value that is present is considered Some value, including null.
 fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
 where T: Deserialize<'de>,
@@ -399,6 +409,21 @@ async fn main() -> anyhow::Result<()> {
                     Ok(_count) => wtxn.commit().map_err(Into::into),
                     Err(e) => Err(e.into())
                 }
+            },
+            UpdateMeta::Facets(levels) => {
+                // We must use the write transaction of the update here.
+                let mut wtxn = index_cloned.write_txn()?;
+                let mut builder = update_builder.facets(&mut wtxn, &index_cloned);
+                if let Some(value) = levels.level_group_size {
+                    builder.level_group_size(value);
+                }
+                if let Some(value) = levels.min_level_size {
+                    builder.min_level_size(value);
+                }
+                match builder.execute() {
+                    Ok(()) => wtxn.commit().map_err(Into::into),
+                    Err(e) => Err(e.into())
+                }
             }
         };
@@ -550,9 +575,12 @@ async fn main() -> anyhow::Result<()> {
             .body(include_str!("../public/logo-black.svg"))
     );
 
-    #[derive(Deserialize)]
+    #[derive(Debug, Deserialize)]
+    #[serde(deny_unknown_fields)]
+    #[serde(rename_all = "camelCase")]
     struct QueryBody {
         query: Option<String>,
+        facet_condition: Option<String>,
     }
 
     let disable_highlighting = opt.disable_highlighting;
@@ -569,6 +597,12 @@ async fn main() -> anyhow::Result<()> {
             if let Some(query) = query.query {
                 search.query(query);
             }
+            if let Some(condition) = query.facet_condition {
+                if !condition.trim().is_empty() {
+                    let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap();
+                    search.facet_condition(condition);
+                }
+            }
 
             let SearchResult { found_words, documents_ids } = search.execute().unwrap();
@@ -751,6 +785,19 @@ async fn main() -> anyhow::Result<()> {
             Ok(warp::reply())
         });
 
+    let update_store_cloned = update_store.clone();
+    let update_status_sender_cloned = update_status_sender.clone();
+    let change_facet_levels_route = warp::filters::method::post()
+        .and(warp::path!("facet-level-sizes"))
+        .and(warp::body::json())
+        .map(move |levels: Facets| {
+            let meta = UpdateMeta::Facets(levels);
+            let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
+            let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
+            eprintln!("update {} registered", update_id);
+            warp::reply()
+        });
+
     let update_ws_route = warp::ws()
         .and(warp::path!("updates" / "ws"))
         .map(move |ws: warp::ws::Ws| {
@@ -799,6 +846,7 @@ async fn main() -> anyhow::Result<()> {
         .or(indexing_json_stream_route)
         .or(clearing_route)
         .or(change_settings_route)
+        .or(change_facet_levels_route)
         .or(update_ws_route);
 
     let addr = SocketAddr::from_str(&opt.http_listen_addr)?;
diff --git a/http-ui/templates/index.html b/http-ui/templates/index.html
index f2161457d..0ef239622 100644
--- a/http-ui/templates/index.html
+++ b/http-ui/templates/index.html
@@ -55,7 +55,8 @@
-                        <input id="search" class="input" type="text" autocomplete="off" placeholder="e.g. George Clooney">
+                        <input id="query" class="input" type="text" autocomplete="off" placeholder="e.g. George Clooney">
+                        <input id="facet" class="input" type="text" autocomplete="off" placeholder="e.g. channel = Ponce">
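
Editor's note: the codec below depends on `crate::facet::value_encoding::f64_into_bytes`, which is not part of this diff, producing bytes whose lexicographic order matches the numeric order of the floats (the raw big-endian bits are then appended only so the values can be read back). As a hedged illustration only, here is a minimal sketch of the usual IEEE-754 sign-flipping technique such a helper relies on; the hypothetical name `f64_into_ordered_bytes` stands in for the real helper, which may differ (for example by returning an Option and rejecting NaN):

    // Hypothetical stand-in for the f64_into_bytes helper used by the codec.
    fn f64_into_ordered_bytes(x: f64) -> [u8; 8] {
        let bits = x.to_bits();
        // Positives: set the sign bit so they sort above all negatives.
        // Negatives: flip every bit so that more-negative values sort lower.
        let ordered = if bits >> 63 == 0 { bits | (1 << 63) } else { !bits };
        ordered.to_be_bytes()
    }

    fn main() {
        let values = [-32.0_f64, -1.0, 0.0, 3.5, 32.0]; // already in numeric order
        let encoded: Vec<[u8; 8]> = values.iter().map(|&v| f64_into_ordered_bytes(v)).collect();
        let mut sorted = encoded.clone();
        sorted.sort();
        // Lexicographic byte order agrees with numeric order.
        assert_eq!(encoded, sorted);
    }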
diff --git a/src/heed_codec/facet/facet_level_value_f64_codec.rs b/src/heed_codec/facet/facet_level_value_f64_codec.rs new file mode 100644 index 000000000..1ee8e6bf3 --- /dev/null +++ b/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -0,0 +1,86 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::facet::value_encoding::f64_into_bytes; + +// TODO do not de/serialize right bound when level = 0 +pub struct FacetLevelValueF64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { + type DItem = (u8, u8, f64, f64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (level, bytes) = bytes.split_first()?; + + let (left, right) = if *level != 0 { + let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; + let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; + (left, right) + } else { + let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; + (left, left) + }; + + Some((*field_id, *level, left, right)) + } +} + +impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { + type EItem = (u8, u8, f64, f64); + + fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { + let mut buffer = [0u8; 32]; + + let len = if *level != 0 { + // Write the globally ordered floats. + let bytes = f64_into_bytes(*left)?; + buffer[..8].copy_from_slice(&bytes[..]); + + let bytes = f64_into_bytes(*right)?; + buffer[8..16].copy_from_slice(&bytes[..]); + + // Then the f64 values just to be able to read them back. + let bytes = left.to_be_bytes(); + buffer[16..24].copy_from_slice(&bytes[..]); + + let bytes = right.to_be_bytes(); + buffer[24..].copy_from_slice(&bytes[..]); + + 32 // length + } else { + // Write the globally ordered floats. + let bytes = f64_into_bytes(*left)?; + buffer[..8].copy_from_slice(&bytes[..]); + + // Then the f64 values just to be able to read them back. 
+ let bytes = left.to_be_bytes(); + buffer[8..16].copy_from_slice(&bytes[..]); + + 16 // length + }; + + let mut bytes = Vec::with_capacity(len + 2); + bytes.push(*field_id); + bytes.push(*level); + bytes.extend_from_slice(&buffer[..len]); + Some(Cow::Owned(bytes)) + } +} + +#[cfg(test)] +mod tests { + use heed::{BytesEncode, BytesDecode}; + use super::*; + + #[test] + fn globally_ordered_f64() { + let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap(); + let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); + assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0)); + + let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap(); + let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); + assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0)); + } +} diff --git a/src/heed_codec/facet/facet_level_value_i64_codec.rs b/src/heed_codec/facet/facet_level_value_i64_codec.rs new file mode 100644 index 000000000..7cf9a714b --- /dev/null +++ b/src/heed_codec/facet/facet_level_value_i64_codec.rs @@ -0,0 +1,43 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; + +pub struct FacetLevelValueI64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec { + type DItem = (u8, u8, i64, i64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (level, bytes) = bytes.split_first()?; + + let left = bytes[..8].try_into().map(i64_from_bytes).ok()?; + let right = if *level != 0 { + bytes[8..].try_into().map(i64_from_bytes).ok()? + } else { + left + }; + + Some((*field_id, *level, left, right)) + } +} + +impl heed::BytesEncode<'_> for FacetLevelValueI64Codec { + type EItem = (u8, u8, i64, i64); + + fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { + let left = i64_into_bytes(*left); + let right = i64_into_bytes(*right); + + let mut bytes = Vec::with_capacity(2 + left.len() + right.len()); + bytes.push(*field_id); + bytes.push(*level); + bytes.extend_from_slice(&left[..]); + if *level != 0 { + bytes.extend_from_slice(&right[..]); + } + + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/facet_value_f64_codec.rs b/src/heed_codec/facet/facet_value_f64_codec.rs deleted file mode 100644 index 228514de5..000000000 --- a/src/heed_codec/facet/facet_value_f64_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; - -pub struct FacetValueF64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec { - type DItem = (u8, f64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, buffer) = bytes.split_first()?; - let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?; - Some((*field_id, value)) - } -} - -impl heed::BytesEncode<'_> for FacetValueF64Codec { - type EItem = (u8, f64); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut buffer = [0u8; 16]; - - // Write the globally ordered float. - let bytes = f64_into_bytes(*value)?; - buffer[..8].copy_from_slice(&bytes[..]); - - // Then the f64 value just to be able to read it back. 
- let bytes = value.to_be_bytes(); - buffer[8..].copy_from_slice(&bytes[..]); - - let mut bytes = Vec::with_capacity(buffer.len() + 1); - bytes.push(*field_id); - bytes.extend_from_slice(&buffer[..]); - Some(Cow::Owned(bytes)) - } -} - -#[cfg(test)] -mod tests { - use heed::{BytesEncode, BytesDecode}; - use super::*; - - #[test] - fn globally_ordered_f64() { - let bytes = FacetValueF64Codec::bytes_encode(&(3, -32.0)).unwrap(); - let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, value), (3, -32.0)); - } -} diff --git a/src/heed_codec/facet/facet_value_i64_codec.rs b/src/heed_codec/facet/facet_value_i64_codec.rs deleted file mode 100644 index f99b8a3ea..000000000 --- a/src/heed_codec/facet/facet_value_i64_codec.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; - -pub struct FacetValueI64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec { - type DItem = (u8, i64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, buffer) = bytes.split_first()?; - let value = buffer.try_into().map(i64_from_bytes).ok()?; - Some((*field_id, value)) - } -} - -impl heed::BytesEncode<'_> for FacetValueI64Codec { - type EItem = (u8, i64); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let value = i64_into_bytes(*value); - let mut bytes = Vec::with_capacity(value.len() + 1); - bytes.push(*field_id); - bytes.extend_from_slice(&value[..]); - Some(Cow::Owned(bytes)) - } -} diff --git a/src/heed_codec/facet/mod.rs b/src/heed_codec/facet/mod.rs index abe2c1d8a..ef97e6add 100644 --- a/src/heed_codec/facet/mod.rs +++ b/src/heed_codec/facet/mod.rs @@ -1,7 +1,7 @@ -mod facet_value_f64_codec; -mod facet_value_i64_codec; +mod facet_level_value_f64_codec; +mod facet_level_value_i64_codec; mod facet_value_string_codec; -pub use self::facet_value_f64_codec::FacetValueF64Codec; -pub use self::facet_value_i64_codec::FacetValueI64Codec; +pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; +pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec; pub use self::facet_value_string_codec::FacetValueStringCodec; diff --git a/src/index.rs b/src/index.rs index ccaba4ca6..b21c7d39b 100644 --- a/src/index.rs +++ b/src/index.rs @@ -18,6 +18,7 @@ use crate::{ pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; +pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -224,6 +225,27 @@ impl Index { Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) } + /* faceted documents ids */ + + /// Writes the documents ids that are faceted under this field id. + pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: u8, docids: &RoaringBitmap) -> heed::Result<()> { + let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + *buffer.last_mut().unwrap() = field_id; + self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) + } + + /// Retrieve all the documents ids that faceted under this field id. 
+ pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: u8) -> heed::Result { + let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + *buffer.last_mut().unwrap() = field_id; + match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* words fst */ /// Writes the FST which is the words dictionnary of the engine. diff --git a/src/lib.rs b/src/lib.rs index 12a24a59c..320077b86 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +#[macro_use] extern crate pest_derive; + mod criterion; mod external_documents_ids; mod fields_ids_map; @@ -24,7 +26,7 @@ pub use self::criterion::{Criterion, default_criteria}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::index::Index; -pub use self::search::{Search, SearchResult}; +pub use self::search::{Search, FacetCondition, SearchResult}; pub use self::heed_codec::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, diff --git a/src/search/facet/grammar.pest b/src/search/facet/grammar.pest new file mode 100644 index 000000000..2096517d3 --- /dev/null +++ b/src/search/facet/grammar.pest @@ -0,0 +1,29 @@ +key = _{quoted | word} +value = _{quoted | word} +quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP } +string = {char*} +word = ${(LETTER | NUMBER | "_" | "-" | ".")+} + +char = _{ !(PEEK | "\\") ~ ANY + | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") + | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} + +condition = _{between | eq | greater | less | geq | leq | neq} +between = {key ~ value ~ "TO" ~ value} +geq = {key ~ ">=" ~ value} +leq = {key ~ "<=" ~ value} +neq = {key ~ "!=" ~ value} +eq = {key ~ "=" ~ value} +greater = {key ~ ">" ~ value} +less = {key ~ "<" ~ value} + +prgm = {SOI ~ expr ~ EOI} +expr = _{ ( term ~ (operation ~ term)* ) } +term = { ("(" ~ expr ~ ")") | condition | not } +operation = _{ and | or } +and = {"AND"} +or = {"OR"} + +not = {"NOT" ~ term} + +WHITESPACE = _{ " " } diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs new file mode 100644 index 000000000..06f543dcc --- /dev/null +++ b/src/search/facet/mod.rs @@ -0,0 +1,655 @@ +use std::collections::HashMap; +use std::fmt::Debug; +use std::ops::Bound::{self, Unbounded, Included, Excluded}; +use std::str::FromStr; + +use heed::types::{ByteSlice, DecodeIgnore}; +use log::debug; +use num_traits::Bounded; +use parser::{PREC_CLIMBER, FilterParser}; +use pest::error::{Error as PestError, ErrorVariant}; +use pest::iterators::{Pair, Pairs}; +use pest::Parser; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::FacetValueStringCodec; +use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; +use crate::{Index, FieldsIdsMap, CboRoaringBitmapCodec}; + +use self::FacetCondition::*; +use self::FacetNumberOperator::*; +use self::parser::Rule; + +mod parser; + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum FacetNumberOperator { + GreaterThan(T), + GreaterThanOrEqual(T), + Equal(T), + NotEqual(T), + LowerThan(T), + LowerThanOrEqual(T), + Between(T, T), +} + +impl FacetNumberOperator { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). 
+ fn negate(self) -> (Self, Option) { + match self { + GreaterThan(x) => (LowerThanOrEqual(x), None), + GreaterThanOrEqual(x) => (LowerThan(x), None), + Equal(x) => (NotEqual(x), None), + NotEqual(x) => (Equal(x), None), + LowerThan(x) => (GreaterThanOrEqual(x), None), + LowerThanOrEqual(x) => (GreaterThan(x), None), + Between(x, y) => (LowerThan(x), Some(GreaterThan(y))), + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum FacetStringOperator { + Equal(String), + NotEqual(String), +} + +impl FacetStringOperator { + fn negate(self) -> Self { + match self { + FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x), + FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x), + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum FacetCondition { + OperatorI64(u8, FacetNumberOperator), + OperatorF64(u8, FacetNumberOperator), + OperatorString(u8, FacetStringOperator), + Or(Box, Box), + And(Box, Box), +} + +fn get_field_id_facet_type<'a>( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + items: &mut Pairs<'a, Rule>, +) -> Result<(u8, FacetType), PestError> +{ + // lexing ensures that we at least have a key + let key = items.next().unwrap(); + let field_id = fields_ids_map + .id(key.as_str()) + .ok_or_else(|| { + PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` not found, available attributes are: {}", + key.as_str(), + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") + ), + }, + key.as_span(), + ) + })?; + + let facet_type = faceted_fields + .get(&field_id) + .copied() + .ok_or_else(|| { + PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` is not faceted, available faceted attributes are: {}", + key.as_str(), + faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::>().join(", ") + ), + }, + key.as_span(), + ) + })?; + + Ok((field_id, facet_type)) +} + +fn pest_parse(pair: Pair) -> Result> +where T: FromStr, + T::Err: ToString, +{ + match pair.as_str().parse() { + Ok(value) => Ok(value), + Err(e) => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { message: e.to_string() }, + pair.as_span(), + )) + } + } +} + +impl FacetCondition { + pub fn from_str( + rtxn: &heed::RoTxn, + index: &Index, + expression: &str, + ) -> anyhow::Result + { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let faceted_fields = index.faceted_fields(rtxn)?; + let lexed = FilterParser::parse(Rule::prgm, expression)?; + FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed) + } + + fn from_pairs( + fim: &FieldsIdsMap, + ff: &HashMap, + expression: Pairs, + ) -> anyhow::Result + { + PREC_CLIMBER.climb( + expression, + |pair: Pair| match pair.as_rule() { + Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), + Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), + Rule::eq => Ok(Self::equal(fim, ff, pair)?), + Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), + Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), + Rule::less => Ok(Self::lower_than(fim, ff, pair)?), + Rule::between => Ok(Self::between(fim, ff, pair)?), + Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), + Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), + Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), + _ => unreachable!(), + }, + |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { + match op.as_rule() { + Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), + Rule::and => Ok(And(Box::new(lhs?), 
Box::new(rhs?))), + _ => unreachable!(), + } + }, + ) + } + + fn negate(self) -> FacetCondition { + match self { + OperatorI64(fid, op) => match op.negate() { + (op, None) => OperatorI64(fid, op), + (a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))), + }, + OperatorF64(fid, op) => match op.negate() { + (op, None) => OperatorF64(fid, op), + (a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))), + }, + OperatorString(fid, op) => OperatorString(fid, op.negate()), + Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), + And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), + } + } + + fn between( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let lvalue = items.next().unwrap(); + let rvalue = items.next().unwrap(); + match ftype { + FacetType::Integer => { + let lvalue = pest_parse(lvalue)?; + let rvalue = pest_parse(rvalue)?; + Ok(OperatorI64(fid, Between(lvalue, rvalue))) + }, + FacetType::Float => { + let lvalue = pest_parse(lvalue)?; + let rvalue = pest_parse(rvalue)?; + Ok(OperatorF64(fid, Between(lvalue, rvalue))) + }, + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } + + fn equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))), + FacetType::String => { + Ok(OperatorString(fid, FacetStringOperator::Equal(value.as_str().to_string()))) + }, + } + } + + fn greater_than( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } + + fn greater_than_or_equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } + + fn lower_than( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + 
item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } + + fn lower_than_or_equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(pest_parse(value)?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: format!("invalid operator on a faceted string"), + }, + item_span, + ).into()) + }, + } + } +} + +impl FacetCondition { + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_levels<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + field_id: u8, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> anyhow::Result<()> + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. + (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_levels::(rtxn, db, field_id, 0, left, right, output); + }, + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + + let mut left_found = None; + let mut right_found = None; + + // We must create a custom iterator to be able to iterate over the + // requested range as the range iterator cannot express some conditions. + let left_bound = match left { + Included(left) => Included((field_id, level, left, T::min_value())), + Excluded(left) => Excluded((field_id, level, left, T::min_value())), + Unbounded => Unbounded, + }; + let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + // We also make sure that we don't decode the data before we are sure we must return it. + let iter = db + .remap_key_type::() + .lazily_decode_data() + .range(rtxn, &(left_bound, right_bound))? 
+ .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { + match right { + Included(right) => *r <= right, + Excluded(right) => *r < right, + Unbounded => true, + } + })) + .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); + + debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + for (i, result) in iter.enumerate() { + let ((_fid, level, l, r), docids) = result?; + debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); + output.union_with(&docids); + // We save the leftest and rightest bounds we actually found at this level. + if i == 0 { left_found = Some(l); } + right_found = Some(r); + } + + // Can we go deeper? + let deeper_level = match level.checked_sub(1) { + Some(level) => level, + None => return Ok(()), + }; + + // We must refine the left and right bounds of this range by retrieving the + // missing part in a deeper level. + match left_found.zip(right_found) { + Some((left_found, right_found)) => { + // If the bound is satisfied we avoid calling this function again. + if !matches!(left, Included(l) if l == left_found) { + let sub_right = Excluded(left_found); + debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, sub_right, output)?; + } + if !matches!(right, Included(r) if r == right_found) { + let sub_left = Excluded(right_found); + debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + } + }, + None => { + // If we found nothing at this level it means that we must find + // the same bounds but at a deeper, more precise level. + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, right, output)?; + }, + } + + Ok(()) + } + + fn evaluate_number_operator<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + index: &Index, + db: heed::Database, + field_id: u8, + operator: FacetNumberOperator, + ) -> anyhow::Result + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + // Make sure we always bound the ranges with the field id and the level, + // as the facets values are all in the same database and prefixed by the + // field id and the level. + let (left, right) = match operator { + GreaterThan(val) => (Excluded(val), Included(T::max_value())), + GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), + Equal(val) => (Included(val), Included(val)), + NotEqual(val) => { + let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; + let docids = Self::evaluate_number_operator::(rtxn, index, db, field_id, Equal(val))?; + return Ok(all_documents_ids - docids); + }, + LowerThan(val) => (Included(T::min_value()), Excluded(val)), + LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), + Between(left, right) => (Included(left), Included(right)), + }; + + // Ask for the biggest value that can exist for this specific field, if it exists + // that's fine if it don't, the value just before will be returned instead. + let biggest_level = db + .remap_types::() + .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))? 
+ .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); + + match biggest_level { + Some(level) => { + let mut output = RoaringBitmap::new(); + Self::explore_facet_levels::(rtxn, db, field_id, level, left, right, &mut output)?; + Ok(output) + }, + None => Ok(RoaringBitmap::new()), + } + } + + fn evaluate_string_operator( + rtxn: &heed::RoTxn, + index: &Index, + db: heed::Database, + field_id: u8, + operator: &FacetStringOperator, + ) -> anyhow::Result + { + match operator { + FacetStringOperator::Equal(string) => { + match db.get(rtxn, &(field_id, string))? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()) + } + }, + FacetStringOperator::NotEqual(string) => { + let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; + let op = FacetStringOperator::Equal(string.clone()); + let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?; + return Ok(all_documents_ids - docids); + }, + } + } + + pub fn evaluate( + &self, + rtxn: &heed::RoTxn, + index: &Index, + ) -> anyhow::Result + { + let db = index.facet_field_id_value_docids; + match self { + OperatorI64(fid, op) => { + Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) + }, + OperatorF64(fid, op) => { + Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) + }, + OperatorString(fid, op) => { + let db = db.remap_key_type::(); + Self::evaluate_string_operator(rtxn, index, db, *fid, op) + }, + Or(lhs, rhs) => { + let lhs = lhs.evaluate(rtxn, index)?; + let rhs = rhs.evaluate(rtxn, index)?; + Ok(lhs | rhs) + }, + And(lhs, rhs) => { + let lhs = lhs.evaluate(rtxn, index)?; + let rhs = rhs.evaluate(rtxn, index)?; + Ok(lhs & rhs) + }, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::update::Settings; + use heed::EnvOpenOptions; + use maplit::hashmap; + + #[test] + fn string() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); + let expected = OperatorString(1, FacetStringOperator::Equal("ponce".into())); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); + let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into())); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); + let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into())); + assert_eq!(condition, expected); + } + + #[test] + fn i64() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. 
+ let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); + let expected = OperatorI64(1, Between(22, 44)); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); + let expected = Or( + Box::new(OperatorI64(1, LowerThan(22))), + Box::new(OperatorI64(1, GreaterThan(44))), + ); + assert_eq!(condition, expected); + } + + #[test] + fn parentheses() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order + builder.set_faceted_fields(hashmap!{ + "channel".into() => "string".into(), + "timestamp".into() => "integer".into(), + }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str( + &rtxn, &index, + "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", + ).unwrap(); + let expected = Or( + Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))), + Box::new(And( + Box::new(OperatorI64(1, Between(22, 44))), + Box::new(OperatorString(0, FacetStringOperator::NotEqual("ponce".into()))), + )) + ); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str( + &rtxn, &index, + "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", + ).unwrap(); + let expected = Or( + Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))), + Box::new(Or( + Box::new(Or( + Box::new(OperatorI64(1, LowerThan(22))), + Box::new(OperatorI64(1, GreaterThan(44))), + )), + Box::new(OperatorString(0, FacetStringOperator::Equal("ponce".into()))), + )), + ); + assert_eq!(condition, expected); + } +} diff --git a/src/search/facet/parser.rs b/src/search/facet/parser.rs new file mode 100644 index 000000000..0e8bd23ac --- /dev/null +++ b/src/search/facet/parser.rs @@ -0,0 +1,12 @@ +use once_cell::sync::Lazy; +use pest::prec_climber::{Operator, Assoc, PrecClimber}; + +pub static PREC_CLIMBER: Lazy> = Lazy::new(|| { + use Assoc::*; + use Rule::*; + pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)]) +}); + +#[derive(Parser)] +#[grammar = "search/facet/grammar.pest"] +pub struct FilterParser; diff --git a/src/search.rs b/src/search/mod.rs similarity index 75% rename from src/search.rs rename to src/search/mod.rs index ae2b5d127..af6ccaf26 100644 --- a/src/search.rs +++ b/src/search/mod.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; +use std::fmt; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; @@ -8,17 +9,22 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::query_tokens::{QueryTokens, QueryToken}; use crate::mdfs::Mdfs; +use 
crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, DocumentId}; +pub use self::facet::FacetCondition; + // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); +mod facet; + pub struct Search<'a> { query: Option, + facet_condition: Option, offset: usize, limit: usize, rtxn: &'a heed::RoTxn<'a>, @@ -27,7 +33,7 @@ pub struct Search<'a> { impl<'a> Search<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { - Search { query: None, offset: 0, limit: 20, rtxn, index } + Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index } } pub fn query(&mut self, query: impl Into) -> &mut Search<'a> { @@ -45,6 +51,11 @@ impl<'a> Search<'a> { self } + pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { + self.facet_condition = Some(condition); + self + } + /// Extracts the query words from the query string and returns the DFAs accordingly. /// TODO introduce settings for the number of typos regarding the words lengths. fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { @@ -135,22 +146,44 @@ impl<'a> Search<'a> { pub fn execute(&self) -> anyhow::Result { let limit = self.limit; - let fst = self.index.words_fst(self.rtxn)?; // Construct the DFAs related to the query words. - let dfas = match self.query.as_deref().map(Self::generate_query_dfas) { - Some(dfas) if !dfas.is_empty() => dfas, - _ => { + let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { + Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), + _otherwise => None, + }; + + // We create the original candidates with the facet conditions results. + let facet_candidates = match &self.facet_condition { + Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?), + None => None, + }; + + debug!("facet candidates: {:?}", facet_candidates); + + let (candidates, derived_words) = match (facet_candidates, derived_words) { + (Some(mut facet_candidates), Some(derived_words)) => { + let words_candidates = Self::compute_candidates(&derived_words); + facet_candidates.intersect_with(&words_candidates); + (facet_candidates, derived_words) + }, + (None, Some(derived_words)) => { + (Self::compute_candidates(&derived_words), derived_words) + }, + (Some(facet_candidates), None) => { + // If the query is not set or results in no DFAs but + // there is some facet conditions we return a placeholder. + let documents_ids = facet_candidates.iter().take(limit).collect(); + return Ok(SearchResult { documents_ids, ..Default::default() }) + }, + (None, None) => { // If the query is not set or results in no DFAs we return a placeholder. 
let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect(); return Ok(SearchResult { documents_ids, ..Default::default() }) }, }; - let derived_words = self.fetch_words_docids(&fst, dfas)?; - let candidates = Self::compute_candidates(&derived_words); - debug!("candidates: {:?}", candidates); // The mana depth first search is a revised DFS that explore @@ -175,6 +208,18 @@ impl<'a> Search<'a> { } } +impl fmt::Debug for Search<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let Search { query, facet_condition, offset, limit, rtxn: _, index: _ } = self; + f.debug_struct("Search") + .field("query", query) + .field("facet_condition", facet_condition) + .field("offset", offset) + .field("limit", limit) + .finish() + } +} + #[derive(Default)] pub struct SearchResult { pub found_words: HashSet, diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 0c7fa36c6..f8138660b 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -1,10 +1,10 @@ use std::path::PathBuf; -use std::{str, io}; +use std::{str, io, fmt}; use anyhow::Context; -use crate::Index; use heed::EnvOpenOptions; use structopt::StructOpt; +use crate::Index; use Command::*; @@ -89,6 +89,12 @@ enum Command { field_name: String, }, + /// Outputs some facets statistics for the given facet name. + FacetStats { + /// The field name in the document. + field_name: String, + }, + /// Outputs the total size of all the docid-word-positions keys and values. TotalDocidWordPositionsSize, @@ -165,6 +171,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) }, + FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { @@ -225,46 +232,140 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: Ok(wtr.flush()?) } +/// Helper function that converts the facet value key to a unique type +/// that can be used to log or display purposes. 
+fn facet_values_iter<'txn, DC: 'txn, T>( + rtxn: &'txn heed::RoTxn, + db: heed::Database, + field_id: u8, + facet_type: crate::facet::FacetType, + string_fn: impl Fn(&str) -> T + 'txn, + float_fn: impl Fn(u8, f64, f64) -> T + 'txn, + integer_fn: impl Fn(u8, i64, i64) -> T + 'txn, +) -> heed::Result> + 'txn>> +where + DC: heed::BytesDecode<'txn>, +{ + use crate::facet::FacetType; + use crate::heed_codec::facet::{ + FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, + }; + + let iter = db.prefix_iter(&rtxn, &[field_id])?; + match facet_type { + FacetType::String => { + let iter = iter.remap_key_type::() + .map(move |r| r.map(|((_, key), value)| (string_fn(key), value))); + Ok(Box::new(iter) as Box>) + }, + FacetType::Float => { + let iter = iter.remap_key_type::() + .map(move |r| r.map(|((_, level, left, right), value)| { + (float_fn(level, left, right), value) + })); + Ok(Box::new(iter)) + }, + FacetType::Integer => { + let iter = iter.remap_key_type::() + .map(move |r| r.map(|((_, level, left, right), value)| { + (integer_fn(level, left, right), value) + })); + Ok(Box::new(iter)) + }, + } +} + +fn facet_number_value_to_string(level: u8, left: T, right: T) -> String { + if level == 0 { + format!("{:?} (level {})", left, level) + } else { + format!("{:?} to {:?} (level {})", left, right, level) + } +} + fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { use std::cmp::Reverse; use std::collections::BinaryHeap; use heed::types::{Str, ByteSlice}; - use crate::heed_codec::BEU32StrCodec; + + let Index { + env: _env, + main, + word_docids, + docid_word_positions, + word_pair_proximity_docids, + facet_field_id_value_docids, + documents, + } = index; let main_name = "main"; let word_docids_name = "word_docids"; let docid_word_positions_name = "docid_word_positions"; + let word_pair_proximity_docids_name = "word_pair_proximity_docids"; + let facet_field_id_value_docids_name = "facet_field_id_value_docids"; + let documents_name = "documents"; let mut heap = BinaryHeap::with_capacity(limit + 1); if limit > 0 { let words_fst = index.words_fst(rtxn)?; - heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name))); if heap.len() > limit { heap.pop(); } - if let Some(documents) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents")? { - heap.push(Reverse((documents.len(), format!("documents"), main_name))); - if heap.len() > limit { heap.pop(); } - } - - if let Some(documents_ids) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { + if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); if heap.len() > limit { heap.pop(); } } - for result in index.word_docids.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? { + for result in word_docids.remap_data_type::().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); if heap.len() > limit { heap.pop(); } } - for result in index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, ByteSlice>(rtxn)? { + for result in docid_word_positions.remap_data_type::().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); heap.push(Reverse((value.len(), key, docid_word_positions_name))); if heap.len() > limit { heap.pop(); } } + + for result in word_pair_proximity_docids.remap_data_type::().iter(rtxn)? 
+
 fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
     use std::cmp::Reverse;
     use std::collections::BinaryHeap;
     use heed::types::{Str, ByteSlice};
-    use crate::heed_codec::BEU32StrCodec;
+
+    let Index {
+        env: _env,
+        main,
+        word_docids,
+        docid_word_positions,
+        word_pair_proximity_docids,
+        facet_field_id_value_docids,
+        documents,
+    } = index;
 
     let main_name = "main";
     let word_docids_name = "word_docids";
     let docid_word_positions_name = "docid_word_positions";
+    let word_pair_proximity_docids_name = "word_pair_proximity_docids";
+    let facet_field_id_value_docids_name = "facet_field_id_value_docids";
+    let documents_name = "documents";
 
     let mut heap = BinaryHeap::with_capacity(limit + 1);
 
     if limit > 0 {
         let words_fst = index.words_fst(rtxn)?;
-
         heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name)));
         if heap.len() > limit { heap.pop(); }
 
-        if let Some(documents) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents")? {
-            heap.push(Reverse((documents.len(), format!("documents"), main_name)));
-            if heap.len() > limit { heap.pop(); }
-        }
-
-        if let Some(documents_ids) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
+        if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
             heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name)));
             if heap.len() > limit { heap.pop(); }
         }
 
-        for result in index.word_docids.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? {
+        for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
             let (word, value) = result?;
             heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
             if heap.len() > limit { heap.pop(); }
         }
 
-        for result in index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, ByteSlice>(rtxn)? {
+        for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
             let ((docid, word), value) = result?;
             let key = format!("{} {}", docid, word);
             heap.push(Reverse((value.len(), key, docid_word_positions_name)));
             if heap.len() > limit { heap.pop(); }
         }
+
+        for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
+            let ((word1, word2, prox), value) = result?;
+            let key = format!("{} {} {}", word1, word2, prox);
+            heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name)));
+            if heap.len() > limit { heap.pop(); }
+        }
+
+        let faceted_fields = index.faceted_fields(rtxn)?;
+        let fields_ids_map = index.fields_ids_map(rtxn)?;
+        for (field_id, field_type) in faceted_fields {
+            let facet_name = fields_ids_map.name(field_id).unwrap();
+
+            let db = facet_field_id_value_docids.remap_data_type::<ByteSlice>();
+            let iter = facet_values_iter(
+                rtxn,
+                db,
+                field_id,
+                field_type,
+                |key| key.to_owned(),
+                facet_number_value_to_string,
+                facet_number_value_to_string,
+            )?;
+
+            for result in iter {
+                let (fvalue, value) = result?;
+                let key = format!("{} {}", facet_name, fvalue);
+                heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name)));
+                if heap.len() > limit { heap.pop(); }
+            }
+        }
+
+        for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? {
+            let (id, value) = result?;
+            heap.push(Reverse((value.len(), id.to_string(), documents_name)));
+            if heap.len() > limit { heap.pop(); }
+        }
     }
 
     let stdout = io::stdout();
@@ -298,10 +399,6 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> {
 }
 
 fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> {
-    use crate::facet::FacetType;
-    use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec};
-    use heed::{BytesDecode, Error::Decoding};
-
     let fields_ids_map = index.fields_ids_map(&rtxn)?;
     let faceted_fields = index.faceted_fields(&rtxn)?;
 
@@ -310,51 +407,78 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> {
     let field_type = faceted_fields.get(&field_id)
         .with_context(|| format!("field {} is not faceted", field_name))?;
 
-    let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?;
-    let iter = match field_type {
-        FacetType::String => {
-            let iter = iter
-                .map(|result| result.and_then(|(key, value)| {
-                    let (_, key) = FacetValueStringCodec::bytes_decode(key).ok_or(Decoding)?;
-                    Ok((key.to_string(), value))
-                }));
-            Box::new(iter) as Box<dyn Iterator<Item=_>>
-        },
-        FacetType::Float => {
-            let iter = iter
-                .map(|result| result.and_then(|(key, value)| {
-                    let (_, key) = FacetValueF64Codec::bytes_decode(key).ok_or(Decoding)?;
-                    Ok((key.to_string(), value))
-                }));
-            Box::new(iter)
-        },
-        FacetType::Integer => {
-            let iter = iter
-                .map(|result| result.and_then(|(key, value)| {
-                    let (_, key) = FacetValueI64Codec::bytes_decode(key).ok_or(Decoding)?;
-                    Ok((key.to_string(), value))
-                }));
-            Box::new(iter)
-        },
-    };
-
     let stdout = io::stdout();
     let mut wtr = csv::Writer::from_writer(stdout.lock());
-    wtr.write_record(&["facet_value", "documents_ids"])?;
+    wtr.write_record(&["facet_value", "documents_count", "documents_ids"])?;
+
+    let db = index.facet_field_id_value_docids;
+    let iter = facet_values_iter(
+        rtxn,
+        db,
+        field_id,
+        *field_type,
+        |key| key.to_owned(),
+        facet_number_value_to_string,
+        facet_number_value_to_string,
+    )?;
 
     for result in iter {
         let (value, docids) = result?;
+        let count = docids.len();
         let docids = if debug {
             format!("{:?}", docids)
         } else {
            format!("{:?}", docids.iter().collect::<Vec<_>>())
         };
-        wtr.write_record(&[value, docids])?;
+        wtr.write_record(&[value, count.to_string(), docids])?;
     }
 
     Ok(wtr.flush()?)
 }
 
+fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
+    let fields_ids_map = index.fields_ids_map(&rtxn)?;
+    let faceted_fields = index.faceted_fields(&rtxn)?;
+
+    let field_id = fields_ids_map.id(&field_name)
+        .with_context(|| format!("field {} not found", field_name))?;
+    let field_type = faceted_fields.get(&field_id)
+        .with_context(|| format!("field {} is not faceted", field_name))?;
+
+    let db = index.facet_field_id_value_docids;
+    let iter = facet_values_iter(
+        rtxn,
+        db,
+        field_id,
+        *field_type,
+        |_key| 0u8,
+        |level, _left, _right| level,
+        |level, _left, _right| level,
+    )?;
+
+    println!("The database {:?} facet stats", field_name);
+
+    let mut level_size = 0;
+    let mut current_level = None;
+    for result in iter {
+        let (level, _) = result?;
+        if let Some(current) = current_level {
+            if current != level {
+                println!("\tnumber of groups at level {}: {}", current, level_size);
+                level_size = 0;
+            }
+        }
+        current_level = Some(level);
+        level_size += 1;
+    }
+
+    if let Some(current) = current_level {
+        println!("\tnumber of groups at level {}: {}", current, level_size);
+    }
+
+    Ok(())
+}
+
 fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> {
     use std::fs::File;
     use std::io::Write as _;
diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs
index 447dca8b4..5dc14f97d 100644
--- a/src/update/clear_documents.rs
+++ b/src/update/clear_documents.rs
@@ -24,12 +24,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
 
         // We retrieve the number of documents ids that we are deleting.
         let number_of_documents = self.index.number_of_documents(self.wtxn)?;
+        let faceted_fields = self.index.faceted_fields(self.wtxn)?;
 
         // We clean some of the main engine datastructures.
         self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
 
+        // We clean all the faceted documents ids.
+        for (field_id, _) in faceted_fields {
+            self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?;
+        }
+
         // Clear the other databases.
         word_docids.clear(self.wtxn)?;
         docid_word_positions.clear(self.wtxn)?;
diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs
index 1913ac033..b1db4f94c 100644
--- a/src/update/delete_documents.rs
+++ b/src/update/delete_documents.rs
@@ -1,4 +1,5 @@
 use fst::IntoStreamer;
+use heed::types::ByteSlice;
 use roaring::RoaringBitmap;
 
 use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds};
@@ -132,11 +133,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
         if let Some((key, mut docids)) = iter.next().transpose()? {
             if key == word.as_ref() {
+                let previous_len = docids.len();
                 docids.difference_with(&self.documents_ids);
                 if docids.is_empty() {
                     iter.del_current()?;
                     *must_remove = true;
-                } else {
+                } else if docids.len() != previous_len {
                     iter.put_current(key, &docids)?;
                 }
             }
@@ -168,27 +170,37 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         // We delete the documents ids that are under the pairs of words;
         // it is faster and uses no memory to iterate over all the word pairs than
         // to compute the cartesian product of all the words of the deleted documents.
-        let mut iter = word_pair_proximity_docids.iter_mut(self.wtxn)?;
+        let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
         while let Some(result) = iter.next() {
-            let ((w1, w2, prox), mut docids) = result?;
+            let (bytes, mut docids) = result?;
+            let previous_len = docids.len();
             docids.difference_with(&self.documents_ids);
             if docids.is_empty() {
                 iter.del_current()?;
-            } else {
-                iter.put_current(&(w1, w2, prox), &docids)?;
+            } else if docids.len() != previous_len {
+                iter.put_current(bytes, &docids)?;
             }
         }
 
         drop(iter);
 
+        // Remove the documents ids from the faceted documents ids.
+        let faceted_fields = self.index.faceted_fields(self.wtxn)?;
+        for (field_id, _) in faceted_fields {
+            let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?;
+            docids.difference_with(&self.documents_ids);
+            self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?;
+        }
+
         // We delete the documents ids that are under the facet field id values.
         let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?;
         while let Some(result) = iter.next() {
             let (bytes, mut docids) = result?;
+            let previous_len = docids.len();
             docids.difference_with(&self.documents_ids);
             if docids.is_empty() {
                 iter.del_current()?;
-            } else {
+            } else if docids.len() != previous_len {
                 iter.put_current(bytes, &docids)?;
             }
         }
diff --git a/src/update/facets.rs b/src/update/facets.rs
new file mode 100644
index 000000000..e26f030df
--- /dev/null
+++ b/src/update/facets.rs
@@ -0,0 +1,256 @@
+use std::cmp;
+use std::fs::File;
+use std::num::NonZeroUsize;
+
+use grenad::{CompressionType, Reader, Writer, FileFuse};
+use heed::types::{ByteSlice, DecodeIgnore};
+use heed::{BytesEncode, Error};
+use log::debug;
+use num_traits::{Bounded, Zero};
+use roaring::RoaringBitmap;
+
+use crate::facet::FacetType;
+use crate::heed_codec::CboRoaringBitmapCodec;
+use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec};
+use crate::Index;
+use crate::update::index_documents::WriteMethod;
+use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
+
+pub struct Facets<'t, 'u, 'i> {
+    wtxn: &'t mut heed::RwTxn<'i, 'u>,
+    index: &'i Index,
+    pub(crate) chunk_compression_type: CompressionType,
+    pub(crate) chunk_compression_level: Option<u32>,
+    pub(crate) chunk_fusing_shrink_size: Option<u64>,
+    level_group_size: NonZeroUsize,
+    min_level_size: NonZeroUsize,
+}
+
+impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
+    pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> {
+        Facets {
+            wtxn,
+            index,
+            chunk_compression_type: CompressionType::None,
+            chunk_compression_level: None,
+            chunk_fusing_shrink_size: None,
+            level_group_size: NonZeroUsize::new(4).unwrap(),
+            min_level_size: NonZeroUsize::new(5).unwrap(),
+        }
+    }
+
+    pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self {
+        self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap();
+        self
+    }
+
+    pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self {
+        self.min_level_size = value;
+        self
+    }
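A usage sketch of the builder above, not part of the patch; the wtxn and index bindings are assumed to be in scope and the sizes are arbitrary:

    // Hedged sketch: build the facet levels with custom sizes.
    use std::num::NonZeroUsize;

    let mut builder = Facets::new(&mut wtxn, &index);
    // a level-1 group will cover 8 level-0 entries, a level-2 group 64, and so on
    builder.level_group_size(NonZeroUsize::new(8).unwrap());
    // stop generating levels once a level would contain fewer than 10 groups
    builder.min_level_size(NonZeroUsize::new(10).unwrap());
    builder.execute()?;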
+
+    pub fn execute(self) -> anyhow::Result<()> {
+        // We get the faceted fields to be able to create the facet levels.
+        let faceted_fields = self.index.faceted_fields(self.wtxn)?;
+
+        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
+        for (field_id, facet_type) in faceted_fields {
+            let (content, documents_ids) = match facet_type {
+                FacetType::Integer => {
+                    clear_field_levels::<i64, FacetLevelValueI64Codec>(
+                        self.wtxn,
+                        self.index.facet_field_id_value_docids,
+                        field_id,
+                    )?;
+
+                    let documents_ids = compute_faceted_documents_ids(
+                        self.wtxn,
+                        self.index.facet_field_id_value_docids,
+                        field_id,
+                    )?;
+
+                    let content = compute_facet_levels::<i64, FacetLevelValueI64Codec>(
+                        self.wtxn,
+                        self.index.facet_field_id_value_docids,
+                        self.chunk_compression_type,
+                        self.chunk_compression_level,
+                        self.chunk_fusing_shrink_size,
+                        self.level_group_size,
+                        self.min_level_size,
+                        field_id,
+                    )?;
+
+                    (Some(content), documents_ids)
+                },
+                FacetType::Float => {
+                    clear_field_levels::<f64, FacetLevelValueF64Codec>(
+                        self.wtxn,
+                        self.index.facet_field_id_value_docids,
+                        field_id,
+                    )?;
+
+                    let documents_ids = compute_faceted_documents_ids(
+                        self.wtxn,
+                        self.index.facet_field_id_value_docids,
+                        field_id,
+                    )?;
+
+                    let content = compute_facet_levels::<f64, FacetLevelValueF64Codec>(
+                        self.wtxn,
+                        self.index.facet_field_id_value_docids,
+                        self.chunk_compression_type,
+                        self.chunk_compression_level,
+                        self.chunk_fusing_shrink_size,
+                        self.level_group_size,
+                        self.min_level_size,
+                        field_id,
+                    )?;
+
+                    (Some(content), documents_ids)
+                },
+                FacetType::String => {
+                    let documents_ids = compute_faceted_documents_ids(
+                        self.wtxn,
+                        self.index.facet_field_id_value_docids,
+                        field_id,
+                    )?;
+
+                    (None, documents_ids)
+                },
+            };
+
+            if let Some(content) = content {
+                write_into_lmdb_database(
+                    self.wtxn,
+                    *self.index.facet_field_id_value_docids.as_polymorph(),
+                    content,
+                    |_, _| anyhow::bail!("invalid facet level merging"),
+                    WriteMethod::GetMergePut,
+                )?;
+            }
+
+            self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?;
+        }
+
+        Ok(())
+    }
+}
+
+fn clear_field_levels<'t, T: 't, KC>(
+    wtxn: &'t mut heed::RwTxn,
+    db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
+    field_id: u8,
+) -> heed::Result<()>
+where
+    T: Copy + Bounded,
+    KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
+    KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
+{
+    let left = (field_id, 1, T::min_value(), T::min_value());
+    let right = (field_id, u8::MAX, T::max_value(), T::max_value());
+    let range = left..=right;
+    db.remap_key_type::<KC>().delete_range(wtxn, &range).map(drop)
+}
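Note that the left bound of the deleted range starts at level 1: the level 0 entries written at indexing time are kept, only the computed levels are cleared. A minimal illustration of the bounds, not part of the patch, using a hypothetical field id and assuming the key codec preserves the tuple ordering:

    fn main() {
        // Hedged illustration for T = i64 and field_id = 2.
        let field_id = 2u8;
        let left = (field_id, 1u8, i64::MIN, i64::MIN);
        let right = (field_id, u8::MAX, i64::MAX, i64::MAX);
        let range = left..=right;

        assert!(!range.contains(&(field_id, 0u8, 21i64, 21i64))); // level 0 entries survive
        assert!(range.contains(&(field_id, 1u8, 21i64, 35i64)));  // computed levels are cleared
    }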
+
+fn compute_facet_levels<'t, T: 't, KC>(
+    rtxn: &'t heed::RoTxn,
+    db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
+    compression_type: CompressionType,
+    compression_level: Option<u32>,
+    shrink_size: Option<u64>,
+    level_group_size: NonZeroUsize,
+    min_level_size: NonZeroUsize,
+    field_id: u8,
+) -> anyhow::Result<Reader<FileFuse>>
+where
+    T: Copy + PartialEq + PartialOrd + Bounded + Zero,
+    KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
+    KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
+{
+    let first_level_size = db.prefix_iter(rtxn, &[field_id])?
+        .remap_types::<DecodeIgnore, DecodeIgnore>()
+        .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
+
+    // It is forbidden to keep a cursor and write in a database at the same time with LMDB,
+    // therefore we write the facet level entries into a grenad file before transferring them.
+    let mut writer = tempfile::tempfile().and_then(|file| {
+        create_writer(compression_type, compression_level, file)
+    })?;
+
+    let level_0_range = {
+        let left = (field_id, 0, T::min_value(), T::min_value());
+        let right = (field_id, 0, T::max_value(), T::max_value());
+        left..=right
+    };
+
+    // Group sizes are always a power of the original level_group_size, therefore a group
+    // always maps whole groups of the previous level and never splits them in half.
+    let group_size_iter = (1u8..)
+        .map(|l| (l, level_group_size.get().pow(l as u32)))
+        .take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
+
+    for (level, group_size) in group_size_iter {
+        let mut left = T::zero();
+        let mut right = T::zero();
+        let mut group_docids = RoaringBitmap::new();
+
+        let db = db.remap_key_type::<KC>();
+        for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() {
+            let ((_field_id, _level, value, _right), docids) = result?;
+
+            if i == 0 {
+                left = value;
+            } else if i % group_size == 0 {
+                // We found the first bound of the next group, we must store the left
+                // and right bounds associated with the docids.
+                write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?;
+
+                // We save the left bound for the new group and also reset the docids.
+                group_docids = RoaringBitmap::new();
+                left = value;
+            }
+
+            // The right bound is always the bound we run through.
+            group_docids.union_with(&docids);
+            right = value;
+        }
+
+        if !group_docids.is_empty() {
+            write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?;
+        }
+    }
+
+    writer_into_reader(writer, shrink_size)
+}
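To make the progression of group_size_iter concrete, here is a small worked example with illustrative numbers, not part of the patch:

    fn main() {
        // Hedged sketch: level l groups level_group_size^l level-0 entries, and a
        // level is only built while it still holds at least min_level_size groups.
        let level_group_size = 4usize;
        let min_level_size = 5usize;
        let first_level_size = 1000usize; // number of level-0 entries of this field

        let levels: Vec<(u8, usize)> = (1u8..)
            .map(|l| (l, level_group_size.pow(l as u32)))
            .take_while(|(_, s)| first_level_size / *s >= min_level_size)
            .collect();

        // 1000/4 = 250 groups, 1000/16 = 62, 1000/64 = 15, but 1000/256 = 3 < 5.
        assert_eq!(levels, vec![(1, 4), (2, 16), (3, 64)]);
    }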
+
+fn compute_faceted_documents_ids(
+    rtxn: &heed::RoTxn,
+    db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
+    field_id: u8,
+) -> anyhow::Result<RoaringBitmap>
+{
+    let mut documents_ids = RoaringBitmap::new();
+    for result in db.prefix_iter(rtxn, &[field_id])? {
+        let (_key, docids) = result?;
+        documents_ids.union_with(&docids);
+    }
+    Ok(documents_ids)
+}
+
+fn write_entry<T, KC>(
+    writer: &mut Writer<File>,
+    field_id: u8,
+    level: u8,
+    left: T,
+    right: T,
+    ids: &RoaringBitmap,
+) -> anyhow::Result<()>
+where
+    KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
+{
+    let key = (field_id, level, left, right);
+    let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?;
+    let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;
+    writer.insert(&key, &data)?;
+    Ok(())
+}
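Putting the pieces together, the facet database of an integer field ends up holding entries shaped like the following; the field id, values and documents ids are invented for illustration, with a level group size of 4 and a minimum level size lowered to 1 for the sake of the example:

    (field_id, level, left, right) => documents ids
    (1, 0, 21, 21) => [4]              level 0: singleton ranges written at indexing time
    (1, 0, 23, 23) => [1, 5]
    (1, 0, 35, 35) => [2]
    (1, 0, 42, 42) => [3]
    (1, 1, 21, 42) => [1, 2, 3, 4, 5]  level 1: one group covering four level-0 entries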
diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs
index fe51c6b2b..4a3ec43f9 100644
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@@ -2,6 +2,7 @@ use std::borrow::Cow;
 use std::collections::HashSet;
 use std::fs::File;
 use std::io::{self, Seek, SeekFrom};
+use std::num::NonZeroUsize;
 use std::sync::mpsc::sync_channel;
 use std::time::Instant;
@@ -15,7 +16,7 @@ use rayon::prelude::*;
 use rayon::ThreadPool;
 
 use crate::index::Index;
-use crate::update::UpdateIndexingStep;
+use crate::update::{Facets, UpdateIndexingStep};
 use self::store::{Store, Readers};
 use self::merge_function::{
     main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
@@ -31,12 +32,12 @@ mod store;
 mod transform;
 
 #[derive(Debug, Copy, Clone)]
-enum WriteMethod {
+pub enum WriteMethod {
     Append,
     GetMergePut,
 }
 
-fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
+pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
     let mut builder = Writer::builder();
     builder.compression_type(typ);
     if let Some(level) = level {
@@ -45,7 +46,7 @@ fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
     builder.build(file)
 }
 
-fn create_sorter(
+pub fn create_sorter(
     merge: MergeFn,
     chunk_compression_type: CompressionType,
     chunk_compression_level: Option<u32>,
@@ -71,7 +72,7 @@ fn create_sorter(
     builder.build()
 }
 
-fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> {
+pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> {
     let mut file = writer.into_inner()?;
     file.seek(SeekFrom::Start(0))?;
     let file = if let Some(shrink_size) = shrink_size {
@@ -82,13 +83,13 @@ fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> {
     Reader::new(file).map_err(Into::into)
 }
 
-fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> {
+pub fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> {
     let mut builder = Merger::builder(merge);
     builder.extend(sources);
     builder.build()
 }
 
-fn merge_into_lmdb_database(
+pub fn merge_into_lmdb_database(
     wtxn: &mut heed::RwTxn,
     database: heed::PolyDatabase,
     sources: Vec<Reader<FileFuse>>,
@@ -132,7 +133,7 @@ fn merge_into_lmdb_database(
     Ok(())
 }
 
-fn write_into_lmdb_database(
+pub fn write_into_lmdb_database(
     wtxn: &mut heed::RwTxn,
     database: heed::PolyDatabase,
     mut reader: Reader<FileFuse>,
@@ -157,7 +158,7 @@ fn write_into_lmdb_database(
                 match iter.next().transpose()? {
                     Some((key, old_val)) if key == k => {
                         let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
-                        let val = merge(k, &vals).expect("merge failed");
+                        let val = merge(k, &vals)?;
                         iter.put_current(k, &val)?;
                     },
                     _ => {
@@ -207,6 +208,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) chunk_fusing_shrink_size: Option<u64>,
     pub(crate) thread_pool: Option<&'a ThreadPool>,
+    facet_level_group_size: Option<NonZeroUsize>,
+    facet_min_level_size: Option<NonZeroUsize>,
     update_method: IndexDocumentsMethod,
     update_format: UpdateFormat,
     autogenerate_docids: bool,
@@ -225,6 +228,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             chunk_compression_level: None,
             chunk_fusing_shrink_size: None,
             thread_pool: None,
+            facet_level_group_size: None,
+            facet_min_level_size: None,
             update_method: IndexDocumentsMethod::ReplaceDocuments,
             update_format: UpdateFormat::Json,
             autogenerate_docids: true,
@@ -308,8 +313,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                 thread_pool: self.thread_pool,
             };
             let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?;
+            debug!("documents to delete {:?}", replaced_documents_ids);
             deletion_builder.delete_documents(&replaced_documents_ids);
-            let _deleted_documents_count = deletion_builder.execute()?;
+            let deleted_documents_count = deletion_builder.execute()?;
+            debug!("{} documents actually deleted", deleted_documents_count);
         }
 
         let mmap;
@@ -327,7 +334,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         enum DatabaseType {
             Main,
             WordDocids,
-            FacetValuesDocids,
+            FacetLevel0ValuesDocids,
         }
 
         let faceted_fields = self.index.faceted_fields(self.wtxn)?;
@@ -427,7 +434,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             (DatabaseType::Main, main_readers, main_merge as MergeFn),
             (DatabaseType::WordDocids, word_docids_readers, word_docids_merge),
             (
-                DatabaseType::FacetValuesDocids,
+                DatabaseType::FacetLevel0ValuesDocids,
                 facet_field_value_docids_readers,
                 facet_field_value_docids_merge,
             ),
@@ -557,7 +564,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                         write_method,
                     )?;
                 },
-                DatabaseType::FacetValuesDocids => {
+                DatabaseType::FacetLevel0ValuesDocids => {
                     debug!("Writing the facet values docids into LMDB on disk...");
                     let db = *self.index.facet_field_id_value_docids.as_polymorph();
                     write_into_lmdb_database(
@@ -577,6 +584,18 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             });
         }
 
+        let mut builder = Facets::new(self.wtxn, self.index);
+        builder.chunk_compression_type = self.chunk_compression_type;
+        builder.chunk_compression_level = self.chunk_compression_level;
+        builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
+        if let Some(value) = self.facet_level_group_size {
+            builder.level_group_size(value);
+        }
+        if let Some(value) = self.facet_min_level_size {
+            builder.min_level_size(value);
+        }
+        builder.execute()?;
+
         debug_assert_eq!(database_count, total_databases);
 
         info!("Transform output indexed in {:.02?}", before_indexing.elapsed());
diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs
index 9c75f10fe..25c343910 100644
--- a/src/update/index_documents/store.rs
+++ b/src/update/index_documents/store.rs
@@ -19,7 +19,7 @@ use tempfile::tempfile;
 
 use crate::facet::FacetType;
 use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
-use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec};
+use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
 use crate::tokenizer::{simple_tokenizer, only_token};
 use crate::update::UpdateIndexingStep;
 use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId};
@@ -337,8 +337,8 @@ impl Store {
         for ((field_id, value), docids) in iter {
             let result = match value {
                 String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned),
-                Float(f) => FacetValueF64Codec::bytes_encode(&(field_id, *f)).map(Cow::into_owned),
-                Integer(i) => FacetValueI64Codec::bytes_encode(&(field_id, i)).map(Cow::into_owned),
+                Float(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned),
+                Integer(i) => FacetLevelValueI64Codec::bytes_encode(&(field_id, 0, i, i)).map(Cow::into_owned),
             };
             let key = result.context("could not serialize facet key")?;
             let bytes = CboRoaringBitmapCodec::bytes_encode(&docids)
@@ -399,7 +399,7 @@ impl Store {
             // We skip documents that must not be indexed by this thread.
             if count % num_threads == thread_index {
                 // This is a log routine that we do every `log_every_n` documents.
-                if log_every_n.map_or(false, |len| count % len == 0) {
+                if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
                     info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
                     progress_callback(UpdateIndexingStep::IndexDocuments {
                         documents_seen: count,
@@ -571,7 +571,10 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
         Value::Null => Ok(()),
         Value::Bool(b) => Ok(output.push(Integer(*b as i64))),
         Value::Number(number) => match ftype {
-            FacetType::String => bail!("invalid facet type, expecting {} found number", ftype),
+            FacetType::String => {
+                let string = SmallString32::from(number.to_string());
+                Ok(output.push(String(string)))
+            },
             FacetType::Float => match number.as_f64() {
                 Some(float) => Ok(output.push(Float(OrderedFloat(float)))),
                 None => bail!("invalid facet type, expecting {} found integer", ftype),
@@ -586,7 +589,7 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
         },
         Value::String(string) => {
-            let string = string.trim();
+            let string = string.trim().to_lowercase();
             if string.is_empty() { return Ok(()) }
             match ftype {
                 FacetType::String => {
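The parse_facet_value changes above make string facet values more forgiving: they are trimmed and lowercased, and a bare JSON number under a string facet is now stringified instead of rejected. A tiny sketch of the effect, not part of the patch and not the actual indexing path:

    fn main() {
        // "  Bordeaux " and "bordeaux" now end up under the same facet key.
        assert_eq!("  Bordeaux ".trim().to_lowercase(), "bordeaux");

        // A bare JSON number under a string facet is indexed as its string form.
        assert_eq!(serde_json::json!(35).to_string(), "35");
    }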
diff --git a/src/update/mod.rs b/src/update/mod.rs
index 75724269a..d05396f00 100644
--- a/src/update/mod.rs
+++ b/src/update/mod.rs
@@ -1,6 +1,7 @@
 mod available_documents_ids;
 mod clear_documents;
 mod delete_documents;
+mod facets;
 mod index_documents;
 mod settings;
 mod update_builder;
@@ -11,6 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
 pub use self::clear_documents::ClearDocuments;
 pub use self::delete_documents::DeleteDocuments;
 pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
+pub use self::facets::Facets;
 pub use self::settings::Settings;
 pub use self::update_builder::UpdateBuilder;
 pub use self::update_step::UpdateIndexingStep;
diff --git a/src/update/settings.rs b/src/update/settings.rs
index 03f184ef6..cddd68ca3 100644
--- a/src/update/settings.rs
+++ b/src/update/settings.rs
@@ -412,6 +412,23 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.faceted_fields(&rtxn).unwrap();
         assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer });
+        // Only count the field_id 1 and level 0 facet values.
+        let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count();
+        assert_eq!(count, 3);
+        drop(rtxn);
+
+        // Index a few more documents with new and existing facet values.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 1 and level 0 facet values.
+        let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count();
+        assert_eq!(count, 4);
         drop(rtxn);
     }
 }
diff --git a/src/update/update_builder.rs b/src/update/update_builder.rs
index 67ea04bfc..b973bd535 100644
--- a/src/update/update_builder.rs
+++ b/src/update/update_builder.rs
@@ -2,7 +2,7 @@ use grenad::CompressionType;
 use rayon::ThreadPool;
 
 use crate::Index;
-use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings};
+use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
 
 pub struct UpdateBuilder<'a> {
     pub(crate) log_every_n: Option<usize>,
@@ -118,4 +118,19 @@ impl<'a> UpdateBuilder<'a> {
         builder
     }
+
+    pub fn facets<'t, 'u, 'i>(
+        self,
+        wtxn: &'t mut heed::RwTxn<'i, 'u>,
+        index: &'i Index,
+    ) -> Facets<'t, 'u, 'i>
+    {
+        let mut builder = Facets::new(wtxn, index);
+
+        builder.chunk_compression_type = self.chunk_compression_type;
+        builder.chunk_compression_level = self.chunk_compression_level;
+        builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
+
+        builder
+    }
 }
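Finally, a sketch of how the facet levels recomputation can be triggered through the new UpdateBuilder::facets method, not part of the patch; the update_builder, wtxn and index bindings are assumed to be in scope:

    // Hedged sketch: facets consumes the UpdateBuilder and forwards its chunk
    // compression settings to the returned Facets builder.
    use std::num::NonZeroUsize;

    let mut builder = update_builder.facets(&mut wtxn, &index);
    builder.level_group_size(NonZeroUsize::new(4).unwrap());
    builder.execute()?;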