Merge pull request #38 from meilisearch/facet-queries

Introduce a facet filter system
This commit is contained in:
Clément Renault 2020-11-28 17:21:07 +01:00 committed by GitHub
commit 6120f6590b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 1705 additions and 179 deletions

143
Cargo.lock generated
View File

@ -45,6 +45,27 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "block-buffer"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b"
dependencies = [
"block-padding",
"byte-tools",
"byteorder",
"generic-array",
]
[[package]]
name = "block-padding"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5"
dependencies = [
"byte-tools",
]
[[package]]
name = "bstr"
version = "0.2.13"
@ -63,6 +84,12 @@ version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820"
[[package]]
name = "byte-tools"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7"
[[package]]
name = "byteorder"
version = "1.3.4"
@ -285,12 +312,27 @@ dependencies = [
"memchr",
]
[[package]]
name = "digest"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5"
dependencies = [
"generic-array",
]
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "fake-simd"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
[[package]]
name = "flate2"
version = "1.0.17"
@ -324,6 +366,15 @@ dependencies = [
"byteorder",
]
[[package]]
name = "generic-array"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec"
dependencies = [
"typenum",
]
[[package]]
name = "getrandom"
version = "0.1.14"
@ -378,9 +429,9 @@ dependencies = [
[[package]]
name = "heed"
version = "0.10.1"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797"
checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616"
dependencies = [
"byteorder",
"heed-traits",
@ -617,9 +668,12 @@ dependencies = [
"maplit",
"memmap",
"near-proximity",
"num-traits",
"obkv",
"once_cell",
"ordered-float",
"pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)",
"pest_derive",
"rayon",
"ringtail",
"roaring",
@ -675,9 +729,9 @@ dependencies = [
[[package]]
name = "num-traits"
version = "0.2.12"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
"autocfg",
]
@ -716,6 +770,12 @@ version = "11.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c"
[[package]]
name = "opaque-debug"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c"
[[package]]
name = "ordered-float"
version = "2.0.0"
@ -741,6 +801,57 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
[[package]]
name = "pest"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
dependencies = [
"ucd-trie",
]
[[package]]
name = "pest"
version = "2.1.3"
source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
dependencies = [
"ucd-trie",
]
[[package]]
name = "pest_derive"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0"
dependencies = [
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"pest_generator",
]
[[package]]
name = "pest_generator"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55"
dependencies = [
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"pest_meta",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "pest_meta"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d"
dependencies = [
"maplit",
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"sha-1",
]
[[package]]
name = "pkg-config"
version = "0.3.19"
@ -1025,6 +1136,18 @@ dependencies = [
"serde",
]
[[package]]
name = "sha-1"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df"
dependencies = [
"block-buffer",
"digest",
"fake-simd",
"opaque-debug",
]
[[package]]
name = "slice-group-by"
version = "0.2.6"
@ -1233,6 +1356,18 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "typenum"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33"
[[package]]
name = "ucd-trie"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
[[package]]
name = "unicode-bidi"
version = "0.3.4"

View File

@ -14,13 +14,14 @@ flate2 = "1.0.17"
fst = "0.4.4"
fxhash = "0.2.1"
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
heed = { version = "0.10.4", default-features = false, features = ["lmdb", "sync-read-txn"] }
human_format = "1.0.3"
jemallocator = "0.3.2"
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
linked-hash-map = "0.5.3"
memmap = "0.7.0"
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
num-traits = "0.2.14"
obkv = "0.1.0"
once_cell = "1.4.0"
ordered-float = "2.0.0"
@ -36,6 +37,10 @@ structopt = { version = "0.3.14", default-features = false, features = ["wrap_he
tempfile = "3.1.0"
uuid = { version = "0.8.1", features = ["v4"] }
# facet filter parser
pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" }
pest_derive = "2.1.0"
# documents words self-join
itertools = "0.9.0"

70
http-ui/Cargo.lock generated
View File

@ -654,9 +654,9 @@ dependencies = [
[[package]]
name = "heed"
version = "0.10.1"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797"
checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616"
dependencies = [
"byteorder",
"heed-traits",
@ -934,6 +934,12 @@ dependencies = [
"cfg-if 0.1.10",
]
[[package]]
name = "maplit"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "matches"
version = "0.1.8"
@ -987,9 +993,12 @@ dependencies = [
"log",
"memmap",
"near-proximity",
"num-traits",
"obkv",
"once_cell",
"ordered-float",
"pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)",
"pest_derive",
"rayon",
"ringtail",
"roaring",
@ -1231,6 +1240,57 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
[[package]]
name = "pest"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
dependencies = [
"ucd-trie",
]
[[package]]
name = "pest"
version = "2.1.3"
source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
dependencies = [
"ucd-trie",
]
[[package]]
name = "pest_derive"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0"
dependencies = [
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"pest_generator",
]
[[package]]
name = "pest_generator"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55"
dependencies = [
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"pest_meta",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "pest_meta"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d"
dependencies = [
"maplit",
"pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"sha-1 0.8.2",
]
[[package]]
name = "pin-project"
version = "0.4.27"
@ -2024,6 +2084,12 @@ version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33"
[[package]]
name = "ucd-trie"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
[[package]]
name = "unicase"
version = "2.6.0"

View File

@ -8,7 +8,7 @@ edition = "2018"
[dependencies]
anyhow = "1.0.28"
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" }
heed = "0.10.1"
heed = "0.10.4"
memmap = "0.7.0"
milli = { path = ".." }
once_cell = "1.4.1"

View File

@ -1,8 +1,9 @@
var request = null;
var timeoutID = null;
$('#search').on('input', function () {
var query = $(this).val();
$('#query, #facet').on('input', function () {
var query = $('#query').val();
var facet = $('#facet').val();
var timeoutMs = 100;
if (timeoutID !== null) {
@ -14,7 +15,7 @@ $('#search').on('input', function () {
type: "POST",
url: "query",
contentType: 'application/json',
data: JSON.stringify({ 'query': query }),
data: JSON.stringify({ 'query': query, 'facetCondition': facet }),
contentType: 'application/json',
success: function (data, textStatus, request) {
results.innerHTML = '';
@ -77,5 +78,5 @@ $('#db-size').text(function(index, text) {
// We trigger the input when we load the script, this way
// we execute a placeholder search when the input is empty.
$(window).on('load', function () {
$('#search').trigger('input');
$('#query').trigger('input');
});

View File

@ -2,6 +2,7 @@ use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::fs::{File, create_dir_all};
use std::net::SocketAddr;
use std::num::NonZeroUsize;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
@ -28,7 +29,7 @@ use warp::{Filter, http::Response};
use milli::tokenizer::{simple_tokenizer, TokenType};
use milli::update::UpdateIndexingStep::*;
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
use milli::{obkv_to_json, Index, UpdateStore, SearchResult};
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
@ -196,6 +197,7 @@ enum UpdateMeta {
DocumentsAddition { method: String, format: String },
ClearDocuments,
Settings(Settings),
Facets(Facets),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -231,6 +233,14 @@ struct Settings {
faceted_attributes: Option<HashMap<String, String>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
struct Facets {
level_group_size: Option<NonZeroUsize>,
min_level_size: Option<NonZeroUsize>,
}
// Any value that is present is considered Some value, including null.
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
where T: Deserialize<'de>,
@ -399,6 +409,21 @@ async fn main() -> anyhow::Result<()> {
Ok(_count) => wtxn.commit().map_err(Into::into),
Err(e) => Err(e.into())
}
},
UpdateMeta::Facets(levels) => {
// We must use the write transaction of the update here.
let mut wtxn = index_cloned.write_txn()?;
let mut builder = update_builder.facets(&mut wtxn, &index_cloned);
if let Some(value) = levels.level_group_size {
builder.level_group_size(value);
}
if let Some(value) = levels.min_level_size {
builder.min_level_size(value);
}
match builder.execute() {
Ok(()) => wtxn.commit().map_err(Into::into),
Err(e) => Err(e.into())
}
}
};
@ -550,9 +575,12 @@ async fn main() -> anyhow::Result<()> {
.body(include_str!("../public/logo-black.svg"))
);
#[derive(Deserialize)]
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
struct QueryBody {
query: Option<String>,
facet_condition: Option<String>,
}
let disable_highlighting = opt.disable_highlighting;
@ -569,6 +597,12 @@ async fn main() -> anyhow::Result<()> {
if let Some(query) = query.query {
search.query(query);
}
if let Some(condition) = query.facet_condition {
if !condition.trim().is_empty() {
let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap();
search.facet_condition(condition);
}
}
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
@ -751,6 +785,19 @@ async fn main() -> anyhow::Result<()> {
Ok(warp::reply())
});
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let change_facet_levels_route = warp::filters::method::post()
.and(warp::path!("facet-level-sizes"))
.and(warp::body::json())
.map(move |levels: Facets| {
let meta = UpdateMeta::Facets(levels);
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
eprintln!("update {} registered", update_id);
warp::reply()
});
let update_ws_route = warp::ws()
.and(warp::path!("updates" / "ws"))
.map(move |ws: warp::ws::Ws| {
@ -799,6 +846,7 @@ async fn main() -> anyhow::Result<()> {
.or(indexing_json_stream_route)
.or(clearing_route)
.or(change_settings_route)
.or(change_facet_levels_route)
.or(update_ws_route);
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;

View File

@ -55,7 +55,8 @@
<div class="level-left">
<div class="level-item">
<div class="field has-addons has-addons-right">
<input id="search" class="input" type="text" autofocus placeholder="e.g. George Clooney">
<input id="query" class="input" type="text" autofocus placeholder="e.g. George Clooney">
<input id="facet" class="input" type="text" placeholder="facet filter like released >= 1577836800">
</div>
</div>
<div class="level-item"></div>

View File

@ -0,0 +1,86 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::f64_into_bytes;
// TODO do not de/serialize right bound when level = 0
pub struct FacetLevelValueF64Codec;
impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
type DItem = (u8, u8, f64, f64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?;
let (level, bytes) = bytes.split_first()?;
let (left, right) = if *level != 0 {
let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?;
let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?;
(left, right)
} else {
let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?;
(left, left)
};
Some((*field_id, *level, left, right))
}
}
impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
type EItem = (u8, u8, f64, f64);
fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
let mut buffer = [0u8; 32];
let len = if *level != 0 {
// Write the globally ordered floats.
let bytes = f64_into_bytes(*left)?;
buffer[..8].copy_from_slice(&bytes[..]);
let bytes = f64_into_bytes(*right)?;
buffer[8..16].copy_from_slice(&bytes[..]);
// Then the f64 values just to be able to read them back.
let bytes = left.to_be_bytes();
buffer[16..24].copy_from_slice(&bytes[..]);
let bytes = right.to_be_bytes();
buffer[24..].copy_from_slice(&bytes[..]);
32 // length
} else {
// Write the globally ordered floats.
let bytes = f64_into_bytes(*left)?;
buffer[..8].copy_from_slice(&bytes[..]);
// Then the f64 values just to be able to read them back.
let bytes = left.to_be_bytes();
buffer[8..16].copy_from_slice(&bytes[..]);
16 // length
};
let mut bytes = Vec::with_capacity(len + 2);
bytes.push(*field_id);
bytes.push(*level);
bytes.extend_from_slice(&buffer[..len]);
Some(Cow::Owned(bytes))
}
}
#[cfg(test)]
mod tests {
use heed::{BytesEncode, BytesDecode};
use super::*;
#[test]
fn globally_ordered_f64() {
let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap();
let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap();
assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0));
let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap();
let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap();
assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0));
}
}

View File

@ -0,0 +1,43 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes};
pub struct FacetLevelValueI64Codec;
impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec {
type DItem = (u8, u8, i64, i64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, bytes) = bytes.split_first()?;
let (level, bytes) = bytes.split_first()?;
let left = bytes[..8].try_into().map(i64_from_bytes).ok()?;
let right = if *level != 0 {
bytes[8..].try_into().map(i64_from_bytes).ok()?
} else {
left
};
Some((*field_id, *level, left, right))
}
}
impl heed::BytesEncode<'_> for FacetLevelValueI64Codec {
type EItem = (u8, u8, i64, i64);
fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
let left = i64_into_bytes(*left);
let right = i64_into_bytes(*right);
let mut bytes = Vec::with_capacity(2 + left.len() + right.len());
bytes.push(*field_id);
bytes.push(*level);
bytes.extend_from_slice(&left[..]);
if *level != 0 {
bytes.extend_from_slice(&right[..]);
}
Some(Cow::Owned(bytes))
}
}

View File

@ -1,50 +0,0 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::f64_into_bytes;
pub struct FacetValueF64Codec;
impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec {
type DItem = (u8, f64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, buffer) = bytes.split_first()?;
let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?;
Some((*field_id, value))
}
}
impl heed::BytesEncode<'_> for FacetValueF64Codec {
type EItem = (u8, f64);
fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
let mut buffer = [0u8; 16];
// Write the globally ordered float.
let bytes = f64_into_bytes(*value)?;
buffer[..8].copy_from_slice(&bytes[..]);
// Then the f64 value just to be able to read it back.
let bytes = value.to_be_bytes();
buffer[8..].copy_from_slice(&bytes[..]);
let mut bytes = Vec::with_capacity(buffer.len() + 1);
bytes.push(*field_id);
bytes.extend_from_slice(&buffer[..]);
Some(Cow::Owned(bytes))
}
}
#[cfg(test)]
mod tests {
use heed::{BytesEncode, BytesDecode};
use super::*;
#[test]
fn globally_ordered_f64() {
let bytes = FacetValueF64Codec::bytes_encode(&(3, -32.0)).unwrap();
let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap();
assert_eq!((name, value), (3, -32.0));
}
}

View File

@ -1,28 +0,0 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes};
pub struct FacetValueI64Codec;
impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec {
type DItem = (u8, i64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id, buffer) = bytes.split_first()?;
let value = buffer.try_into().map(i64_from_bytes).ok()?;
Some((*field_id, value))
}
}
impl heed::BytesEncode<'_> for FacetValueI64Codec {
type EItem = (u8, i64);
fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
let value = i64_into_bytes(*value);
let mut bytes = Vec::with_capacity(value.len() + 1);
bytes.push(*field_id);
bytes.extend_from_slice(&value[..]);
Some(Cow::Owned(bytes))
}
}

View File

@ -1,7 +1,7 @@
mod facet_value_f64_codec;
mod facet_value_i64_codec;
mod facet_level_value_f64_codec;
mod facet_level_value_i64_codec;
mod facet_value_string_codec;
pub use self::facet_value_f64_codec::FacetValueF64Codec;
pub use self::facet_value_i64_codec::FacetValueI64Codec;
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec;
pub use self::facet_value_string_codec::FacetValueStringCodec;

View File

@ -18,6 +18,7 @@ use crate::{
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const PRIMARY_KEY_KEY: &str = "primary-key";
@ -224,6 +225,27 @@ impl Index {
Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
}
/* faceted documents ids */
/// Writes the documents ids that are faceted under this field id.
pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: u8, docids: &RoaringBitmap) -> heed::Result<()> {
let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
}
/// Retrieve all the documents ids that faceted under this field id.
pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: u8) -> heed::Result<RoaringBitmap> {
let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()),
}
}
/* words fst */
/// Writes the FST which is the words dictionnary of the engine.

View File

@ -1,3 +1,5 @@
#[macro_use] extern crate pest_derive;
mod criterion;
mod external_documents_ids;
mod fields_ids_map;
@ -24,7 +26,7 @@ pub use self::criterion::{Criterion, default_criteria};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap;
pub use self::index::Index;
pub use self::search::{Search, SearchResult};
pub use self::search::{Search, FacetCondition, SearchResult};
pub use self::heed_codec::{
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,

View File

@ -0,0 +1,29 @@
key = _{quoted | word}
value = _{quoted | word}
quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP }
string = {char*}
word = ${(LETTER | NUMBER | "_" | "-" | ".")+}
char = _{ !(PEEK | "\\") ~ ANY
| "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
| "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
condition = _{between | eq | greater | less | geq | leq | neq}
between = {key ~ value ~ "TO" ~ value}
geq = {key ~ ">=" ~ value}
leq = {key ~ "<=" ~ value}
neq = {key ~ "!=" ~ value}
eq = {key ~ "=" ~ value}
greater = {key ~ ">" ~ value}
less = {key ~ "<" ~ value}
prgm = {SOI ~ expr ~ EOI}
expr = _{ ( term ~ (operation ~ term)* ) }
term = { ("(" ~ expr ~ ")") | condition | not }
operation = _{ and | or }
and = {"AND"}
or = {"OR"}
not = {"NOT" ~ term}
WHITESPACE = _{ " " }

655
src/search/facet/mod.rs Normal file
View File

@ -0,0 +1,655 @@
use std::collections::HashMap;
use std::fmt::Debug;
use std::ops::Bound::{self, Unbounded, Included, Excluded};
use std::str::FromStr;
use heed::types::{ByteSlice, DecodeIgnore};
use log::debug;
use num_traits::Bounded;
use parser::{PREC_CLIMBER, FilterParser};
use pest::error::{Error as PestError, ErrorVariant};
use pest::iterators::{Pair, Pairs};
use pest::Parser;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::FacetValueStringCodec;
use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec};
use crate::{Index, FieldsIdsMap, CboRoaringBitmapCodec};
use self::FacetCondition::*;
use self::FacetNumberOperator::*;
use self::parser::Rule;
mod parser;
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum FacetNumberOperator<T> {
GreaterThan(T),
GreaterThanOrEqual(T),
Equal(T),
NotEqual(T),
LowerThan(T),
LowerThanOrEqual(T),
Between(T, T),
}
impl<T> FacetNumberOperator<T> {
/// This method can return two operations in case it must express
/// an OR operation for the between case (i.e. `TO`).
fn negate(self) -> (Self, Option<Self>) {
match self {
GreaterThan(x) => (LowerThanOrEqual(x), None),
GreaterThanOrEqual(x) => (LowerThan(x), None),
Equal(x) => (NotEqual(x), None),
NotEqual(x) => (Equal(x), None),
LowerThan(x) => (GreaterThanOrEqual(x), None),
LowerThanOrEqual(x) => (GreaterThan(x), None),
Between(x, y) => (LowerThan(x), Some(GreaterThan(y))),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum FacetStringOperator {
Equal(String),
NotEqual(String),
}
impl FacetStringOperator {
fn negate(self) -> Self {
match self {
FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x),
FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum FacetCondition {
OperatorI64(u8, FacetNumberOperator<i64>),
OperatorF64(u8, FacetNumberOperator<f64>),
OperatorString(u8, FacetStringOperator),
Or(Box<Self>, Box<Self>),
And(Box<Self>, Box<Self>),
}
fn get_field_id_facet_type<'a>(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<u8, FacetType>,
items: &mut Pairs<'a, Rule>,
) -> Result<(u8, FacetType), PestError<Rule>>
{
// lexing ensures that we at least have a key
let key = items.next().unwrap();
let field_id = fields_ids_map
.id(key.as_str())
.ok_or_else(|| {
PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` not found, available attributes are: {}",
key.as_str(),
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", ")
),
},
key.as_span(),
)
})?;
let facet_type = faceted_fields
.get(&field_id)
.copied()
.ok_or_else(|| {
PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` is not faceted, available faceted attributes are: {}",
key.as_str(),
faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::<Vec<_>>().join(", ")
),
},
key.as_span(),
)
})?;
Ok((field_id, facet_type))
}
fn pest_parse<T>(pair: Pair<Rule>) -> Result<T, pest::error::Error<Rule>>
where T: FromStr,
T::Err: ToString,
{
match pair.as_str().parse() {
Ok(value) => Ok(value),
Err(e) => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError { message: e.to_string() },
pair.as_span(),
))
}
}
}
impl FacetCondition {
pub fn from_str(
rtxn: &heed::RoTxn,
index: &Index,
expression: &str,
) -> anyhow::Result<FacetCondition>
{
let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?;
let lexed = FilterParser::parse(Rule::prgm, expression)?;
FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed)
}
fn from_pairs(
fim: &FieldsIdsMap,
ff: &HashMap<u8, FacetType>,
expression: Pairs<Rule>,
) -> anyhow::Result<Self>
{
PREC_CLIMBER.climb(
expression,
|pair: Pair<Rule>| match pair.as_rule() {
Rule::greater => Ok(Self::greater_than(fim, ff, pair)?),
Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?),
Rule::eq => Ok(Self::equal(fim, ff, pair)?),
Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()),
Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?),
Rule::less => Ok(Self::lower_than(fim, ff, pair)?),
Rule::between => Ok(Self::between(fim, ff, pair)?),
Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()),
Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()),
Rule::term => Self::from_pairs(fim, ff, pair.into_inner()),
_ => unreachable!(),
},
|lhs: anyhow::Result<Self>, op: Pair<Rule>, rhs: anyhow::Result<Self>| {
match op.as_rule() {
Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))),
Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))),
_ => unreachable!(),
}
},
)
}
fn negate(self) -> FacetCondition {
match self {
OperatorI64(fid, op) => match op.negate() {
(op, None) => OperatorI64(fid, op),
(a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))),
},
OperatorF64(fid, op) => match op.negate() {
(op, None) => OperatorF64(fid, op),
(a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))),
},
OperatorString(fid, op) => OperatorString(fid, op.negate()),
Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())),
And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())),
}
}
fn between(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<u8, FacetType>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let lvalue = items.next().unwrap();
let rvalue = items.next().unwrap();
match ftype {
FacetType::Integer => {
let lvalue = pest_parse(lvalue)?;
let rvalue = pest_parse(rvalue)?;
Ok(OperatorI64(fid, Between(lvalue, rvalue)))
},
FacetType::Float => {
let lvalue = pest_parse(lvalue)?;
let rvalue = pest_parse(rvalue)?;
Ok(OperatorF64(fid, Between(lvalue, rvalue)))
},
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: format!("invalid operator on a faceted string"),
},
item_span,
).into())
},
}
}
fn equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<u8, FacetType>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))),
FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))),
FacetType::String => {
Ok(OperatorString(fid, FacetStringOperator::Equal(value.as_str().to_string())))
},
}
}
fn greater_than(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<u8, FacetType>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))),
FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))),
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: format!("invalid operator on a faceted string"),
},
item_span,
).into())
},
}
}
fn greater_than_or_equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<u8, FacetType>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))),
FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))),
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: format!("invalid operator on a faceted string"),
},
item_span,
).into())
},
}
}
fn lower_than(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<u8, FacetType>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))),
FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))),
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: format!("invalid operator on a faceted string"),
},
item_span,
).into())
},
}
}
fn lower_than_or_equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<u8, FacetType>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(pest_parse(value)?))),
FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(pest_parse(value)?))),
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: format!("invalid operator on a faceted string"),
},
item_span,
).into())
},
}
}
}
impl FacetCondition {
/// Aggregates the documents ids that are part of the specified range automatically
/// going deeper through the levels.
fn explore_facet_levels<'t, T: 't, KC>(
rtxn: &'t heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8,
level: u8,
left: Bound<T>,
right: Bound<T>,
output: &mut RoaringBitmap,
) -> anyhow::Result<()>
where
T: Copy + PartialEq + PartialOrd + Bounded + Debug,
KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
{
match (left, right) {
// If the request is an exact value we must go directly to the deepest level.
(Included(l), Included(r)) if l == r && level > 0 => {
return Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, 0, left, right, output);
},
// lower TO upper when lower > upper must return no result
(Included(l), Included(r)) if l > r => return Ok(()),
(Included(l), Excluded(r)) if l >= r => return Ok(()),
(Excluded(l), Excluded(r)) if l >= r => return Ok(()),
(Excluded(l), Included(r)) if l >= r => return Ok(()),
(_, _) => (),
}
let mut left_found = None;
let mut right_found = None;
// We must create a custom iterator to be able to iterate over the
// requested range as the range iterator cannot express some conditions.
let left_bound = match left {
Included(left) => Included((field_id, level, left, T::min_value())),
Excluded(left) => Excluded((field_id, level, left, T::min_value())),
Unbounded => Unbounded,
};
let right_bound = Included((field_id, level, T::max_value(), T::max_value()));
// We also make sure that we don't decode the data before we are sure we must return it.
let iter = db
.remap_key_type::<KC>()
.lazily_decode_data()
.range(rtxn, &(left_bound, right_bound))?
.take_while(|r| r.as_ref().map_or(true, |((.., r), _)| {
match right {
Included(right) => *r <= right,
Excluded(right) => *r < right,
Unbounded => true,
}
}))
.map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data))));
debug!("Iterating between {:?} and {:?} (level {})", left, right, level);
for (i, result) in iter.enumerate() {
let ((_fid, level, l, r), docids) = result?;
debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len());
output.union_with(&docids);
// We save the leftest and rightest bounds we actually found at this level.
if i == 0 { left_found = Some(l); }
right_found = Some(r);
}
// Can we go deeper?
let deeper_level = match level.checked_sub(1) {
Some(level) => level,
None => return Ok(()),
};
// We must refine the left and right bounds of this range by retrieving the
// missing part in a deeper level.
match left_found.zip(right_found) {
Some((left_found, right_found)) => {
// If the bound is satisfied we avoid calling this function again.
if !matches!(left, Included(l) if l == left_found) {
let sub_right = Excluded(left_found);
debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level);
Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, sub_right, output)?;
}
if !matches!(right, Included(r) if r == right_found) {
let sub_left = Excluded(right_found);
debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level);
Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, sub_left, right, output)?;
}
},
None => {
// If we found nothing at this level it means that we must find
// the same bounds but at a deeper, more precise level.
Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, right, output)?;
},
}
Ok(())
}
fn evaluate_number_operator<'t, T: 't, KC>(
rtxn: &'t heed::RoTxn,
index: &Index,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8,
operator: FacetNumberOperator<T>,
) -> anyhow::Result<RoaringBitmap>
where
T: Copy + PartialEq + PartialOrd + Bounded + Debug,
KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
{
// Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the
// field id and the level.
let (left, right) = match operator {
GreaterThan(val) => (Excluded(val), Included(T::max_value())),
GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())),
Equal(val) => (Included(val), Included(val)),
NotEqual(val) => {
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
let docids = Self::evaluate_number_operator::<T, KC>(rtxn, index, db, field_id, Equal(val))?;
return Ok(all_documents_ids - docids);
},
LowerThan(val) => (Included(T::min_value()), Excluded(val)),
LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)),
Between(left, right) => (Included(left), Included(right)),
};
// Ask for the biggest value that can exist for this specific field, if it exists
// that's fine if it don't, the value just before will be returned instead.
let biggest_level = db
.remap_types::<KC, DecodeIgnore>()
.get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))?
.and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None });
match biggest_level {
Some(level) => {
let mut output = RoaringBitmap::new();
Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, level, left, right, &mut output)?;
Ok(output)
},
None => Ok(RoaringBitmap::new()),
}
}
fn evaluate_string_operator(
rtxn: &heed::RoTxn,
index: &Index,
db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: u8,
operator: &FacetStringOperator,
) -> anyhow::Result<RoaringBitmap>
{
match operator {
FacetStringOperator::Equal(string) => {
match db.get(rtxn, &(field_id, string))? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new())
}
},
FacetStringOperator::NotEqual(string) => {
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
let op = FacetStringOperator::Equal(string.clone());
let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?;
return Ok(all_documents_ids - docids);
},
}
}
pub fn evaluate(
&self,
rtxn: &heed::RoTxn,
index: &Index,
) -> anyhow::Result<RoaringBitmap>
{
let db = index.facet_field_id_value_docids;
match self {
OperatorI64(fid, op) => {
Self::evaluate_number_operator::<i64, FacetLevelValueI64Codec>(rtxn, index, db, *fid, *op)
},
OperatorF64(fid, op) => {
Self::evaluate_number_operator::<f64, FacetLevelValueF64Codec>(rtxn, index, db, *fid, *op)
},
OperatorString(fid, op) => {
let db = db.remap_key_type::<FacetValueStringCodec>();
Self::evaluate_string_operator(rtxn, index, db, *fid, op)
},
Or(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?;
let rhs = rhs.evaluate(rtxn, index)?;
Ok(lhs | rhs)
},
And(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?;
let rhs = rhs.evaluate(rtxn, index)?;
Ok(lhs & rhs)
},
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::update::Settings;
use heed::EnvOpenOptions;
use maplit::hashmap;
#[test]
fn string() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index);
builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() });
builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap();
let expected = OperatorString(1, FacetStringOperator::Equal("ponce".into()));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap();
let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into()));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into()));
assert_eq!(condition, expected);
}
#[test]
fn i64() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index);
builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() });
builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap();
let expected = OperatorI64(1, Between(22, 44));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
let expected = Or(
Box::new(OperatorI64(1, LowerThan(22))),
Box::new(OperatorI64(1, GreaterThan(44))),
);
assert_eq!(condition, expected);
}
#[test]
fn parentheses() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index);
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order
builder.set_faceted_fields(hashmap!{
"channel".into() => "string".into(),
"timestamp".into() => "integer".into(),
});
builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(
&rtxn, &index,
"channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)",
).unwrap();
let expected = Or(
Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))),
Box::new(And(
Box::new(OperatorI64(1, Between(22, 44))),
Box::new(OperatorString(0, FacetStringOperator::NotEqual("ponce".into()))),
))
);
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(
&rtxn, &index,
"channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)",
).unwrap();
let expected = Or(
Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))),
Box::new(Or(
Box::new(Or(
Box::new(OperatorI64(1, LowerThan(22))),
Box::new(OperatorI64(1, GreaterThan(44))),
)),
Box::new(OperatorString(0, FacetStringOperator::Equal("ponce".into()))),
)),
);
assert_eq!(condition, expected);
}
}

View File

@ -0,0 +1,12 @@
use once_cell::sync::Lazy;
use pest::prec_climber::{Operator, Assoc, PrecClimber};
pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| {
use Assoc::*;
use Rule::*;
pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)])
});
#[derive(Parser)]
#[grammar = "search/facet/grammar.pest"]
pub struct FilterParser;

View File

@ -1,5 +1,6 @@
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::fmt;
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
@ -8,17 +9,22 @@ use log::debug;
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
use crate::query_tokens::{QueryTokens, QueryToken};
use crate::mdfs::Mdfs;
use crate::query_tokens::{QueryTokens, QueryToken};
use crate::{Index, DocumentId};
pub use self::facet::FacetCondition;
// Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
mod facet;
pub struct Search<'a> {
query: Option<String>,
facet_condition: Option<FacetCondition>,
offset: usize,
limit: usize,
rtxn: &'a heed::RoTxn<'a>,
@ -27,7 +33,7 @@ pub struct Search<'a> {
impl<'a> Search<'a> {
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> {
Search { query: None, offset: 0, limit: 20, rtxn, index }
Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index }
}
pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> {
@ -45,6 +51,11 @@ impl<'a> Search<'a> {
self
}
pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> {
self.facet_condition = Some(condition);
self
}
/// Extracts the query words from the query string and returns the DFAs accordingly.
/// TODO introduce settings for the number of typos regarding the words lengths.
fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> {
@ -135,22 +146,44 @@ impl<'a> Search<'a> {
pub fn execute(&self) -> anyhow::Result<SearchResult> {
let limit = self.limit;
let fst = self.index.words_fst(self.rtxn)?;
// Construct the DFAs related to the query words.
let dfas = match self.query.as_deref().map(Self::generate_query_dfas) {
Some(dfas) if !dfas.is_empty() => dfas,
_ => {
let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) {
Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?),
_otherwise => None,
};
// We create the original candidates with the facet conditions results.
let facet_candidates = match &self.facet_condition {
Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?),
None => None,
};
debug!("facet candidates: {:?}", facet_candidates);
let (candidates, derived_words) = match (facet_candidates, derived_words) {
(Some(mut facet_candidates), Some(derived_words)) => {
let words_candidates = Self::compute_candidates(&derived_words);
facet_candidates.intersect_with(&words_candidates);
(facet_candidates, derived_words)
},
(None, Some(derived_words)) => {
(Self::compute_candidates(&derived_words), derived_words)
},
(Some(facet_candidates), None) => {
// If the query is not set or results in no DFAs but
// there is some facet conditions we return a placeholder.
let documents_ids = facet_candidates.iter().take(limit).collect();
return Ok(SearchResult { documents_ids, ..Default::default() })
},
(None, None) => {
// If the query is not set or results in no DFAs we return a placeholder.
let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect();
return Ok(SearchResult { documents_ids, ..Default::default() })
},
};
let derived_words = self.fetch_words_docids(&fst, dfas)?;
let candidates = Self::compute_candidates(&derived_words);
debug!("candidates: {:?}", candidates);
// The mana depth first search is a revised DFS that explore
@ -175,6 +208,18 @@ impl<'a> Search<'a> {
}
}
impl fmt::Debug for Search<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let Search { query, facet_condition, offset, limit, rtxn: _, index: _ } = self;
f.debug_struct("Search")
.field("query", query)
.field("facet_condition", facet_condition)
.field("offset", offset)
.field("limit", limit)
.finish()
}
}
#[derive(Default)]
pub struct SearchResult {
pub found_words: HashSet<String>,

View File

@ -1,10 +1,10 @@
use std::path::PathBuf;
use std::{str, io};
use std::{str, io, fmt};
use anyhow::Context;
use crate::Index;
use heed::EnvOpenOptions;
use structopt::StructOpt;
use crate::Index;
use Command::*;
@ -89,6 +89,12 @@ enum Command {
field_name: String,
},
/// Outputs some facets statistics for the given facet name.
FacetStats {
/// The field name in the document.
field_name: String,
},
/// Outputs the total size of all the docid-word-positions keys and values.
TotalDocidWordPositionsSize,
@ -165,6 +171,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
FacetValuesDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, field_name)
},
FacetStats { field_name } => facet_stats(&index, &rtxn, field_name),
TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn),
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
AverageNumberOfPositionsByWord => {
@ -225,46 +232,140 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow:
Ok(wtr.flush()?)
}
/// Helper function that converts the facet value key to a unique type
/// that can be used to log or display purposes.
fn facet_values_iter<'txn, DC: 'txn, T>(
rtxn: &'txn heed::RoTxn,
db: heed::Database<heed::types::ByteSlice, DC>,
field_id: u8,
facet_type: crate::facet::FacetType,
string_fn: impl Fn(&str) -> T + 'txn,
float_fn: impl Fn(u8, f64, f64) -> T + 'txn,
integer_fn: impl Fn(u8, i64, i64) -> T + 'txn,
) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(T, DC::DItem)>> + 'txn>>
where
DC: heed::BytesDecode<'txn>,
{
use crate::facet::FacetType;
use crate::heed_codec::facet::{
FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec,
};
let iter = db.prefix_iter(&rtxn, &[field_id])?;
match facet_type {
FacetType::String => {
let iter = iter.remap_key_type::<FacetValueStringCodec>()
.map(move |r| r.map(|((_, key), value)| (string_fn(key), value)));
Ok(Box::new(iter) as Box<dyn Iterator<Item=_>>)
},
FacetType::Float => {
let iter = iter.remap_key_type::<FacetLevelValueF64Codec>()
.map(move |r| r.map(|((_, level, left, right), value)| {
(float_fn(level, left, right), value)
}));
Ok(Box::new(iter))
},
FacetType::Integer => {
let iter = iter.remap_key_type::<FacetLevelValueI64Codec>()
.map(move |r| r.map(|((_, level, left, right), value)| {
(integer_fn(level, left, right), value)
}));
Ok(Box::new(iter))
},
}
}
fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> String {
if level == 0 {
format!("{:?} (level {})", left, level)
} else {
format!("{:?} to {:?} (level {})", left, right, level)
}
}
fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use heed::types::{Str, ByteSlice};
use crate::heed_codec::BEU32StrCodec;
let Index {
env: _env,
main,
word_docids,
docid_word_positions,
word_pair_proximity_docids,
facet_field_id_value_docids,
documents,
} = index;
let main_name = "main";
let word_docids_name = "word_docids";
let docid_word_positions_name = "docid_word_positions";
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
let documents_name = "documents";
let mut heap = BinaryHeap::with_capacity(limit + 1);
if limit > 0 {
let words_fst = index.words_fst(rtxn)?;
heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name)));
if heap.len() > limit { heap.pop(); }
if let Some(documents) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents")? {
heap.push(Reverse((documents.len(), format!("documents"), main_name)));
if heap.len() > limit { heap.pop(); }
}
if let Some(documents_ids) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name)));
if heap.len() > limit { heap.pop(); }
}
for result in index.word_docids.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? {
for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (word, value) = result?;
heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
if heap.len() > limit { heap.pop(); }
}
for result in index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, ByteSlice>(rtxn)? {
for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((docid, word), value) = result?;
let key = format!("{} {}", docid, word);
heap.push(Reverse((value.len(), key, docid_word_positions_name)));
if heap.len() > limit { heap.pop(); }
}
for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word1, word2, prox), value) = result?;
let key = format!("{} {} {}", word1, word2, prox);
heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name)));
if heap.len() > limit { heap.pop(); }
}
let faceted_fields = index.faceted_fields(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?;
for (field_id, field_type) in faceted_fields {
let facet_name = fields_ids_map.name(field_id).unwrap();
let db = facet_field_id_value_docids.remap_data_type::<ByteSlice>();
let iter = facet_values_iter(
rtxn,
db,
field_id,
field_type,
|key| key.to_owned(),
facet_number_value_to_string,
facet_number_value_to_string,
)?;
for result in iter {
let (fvalue, value) = result?;
let key = format!("{} {}", facet_name, fvalue);
heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name)));
if heap.len() > limit { heap.pop(); }
}
}
for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (id, value) = result?;
heap.push(Reverse((value.len(), id.to_string(), documents_name)));
if heap.len() > limit { heap.pop(); }
}
}
let stdout = io::stdout();
@ -298,10 +399,6 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<Strin
}
fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> {
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec};
use heed::{BytesDecode, Error::Decoding};
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields(&rtxn)?;
@ -310,51 +407,78 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam
let field_type = faceted_fields.get(&field_id)
.with_context(|| format!("field {} is not faceted", field_name))?;
let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?;
let iter = match field_type {
FacetType::String => {
let iter = iter
.map(|result| result.and_then(|(key, value)| {
let (_, key) = FacetValueStringCodec::bytes_decode(key).ok_or(Decoding)?;
Ok((key.to_string(), value))
}));
Box::new(iter) as Box<dyn Iterator<Item=_>>
},
FacetType::Float => {
let iter = iter
.map(|result| result.and_then(|(key, value)| {
let (_, key) = FacetValueF64Codec::bytes_decode(key).ok_or(Decoding)?;
Ok((key.to_string(), value))
}));
Box::new(iter)
},
FacetType::Integer => {
let iter = iter
.map(|result| result.and_then(|(key, value)| {
let (_, key) = FacetValueI64Codec::bytes_decode(key).ok_or(Decoding)?;
Ok((key.to_string(), value))
}));
Box::new(iter)
},
};
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["facet_value", "documents_ids"])?;
wtr.write_record(&["facet_value", "documents_count", "documents_ids"])?;
let db = index.facet_field_id_value_docids;
let iter = facet_values_iter(
rtxn,
db,
field_id,
*field_type,
|key| key.to_owned(),
facet_number_value_to_string,
facet_number_value_to_string,
)?;
for result in iter {
let (value, docids) = result?;
let count = docids.len();
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
wtr.write_record(&[value, docids])?;
wtr.write_record(&[value, count.to_string(), docids])?;
}
Ok(wtr.flush()?)
}
fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields(&rtxn)?;
let field_id = fields_ids_map.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?;
let field_type = faceted_fields.get(&field_id)
.with_context(|| format!("field {} is not faceted", field_name))?;
let db = index.facet_field_id_value_docids;
let iter = facet_values_iter(
rtxn,
db,
field_id,
*field_type,
|_key| 0u8,
|level, _left, _right| level,
|level, _left, _right| level,
)?;
println!("The database {:?} facet stats", field_name);
let mut level_size = 0;
let mut current_level = None;
for result in iter {
let (level, _) = result?;
if let Some(current) = current_level {
if current != level {
println!("\tnumber of groups at level {}: {}", current, level_size);
level_size = 0;
}
}
current_level = Some(level);
level_size += 1;
}
if let Some(current) = current_level {
println!("\tnumber of groups at level {}: {}", current, level_size);
}
Ok(())
}
fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> {
use std::fs::File;
use std::io::Write as _;

View File

@ -24,12 +24,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
// We retrieve the number of documents ids that we are deleting.
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
// We clean some of the main engine datastructures.
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
// We clean all the faceted documents ids.
for (field_id, _) in faceted_fields {
self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?;
}
// Clear the other databases.
word_docids.clear(self.wtxn)?;
docid_word_positions.clear(self.wtxn)?;

View File

@ -1,4 +1,5 @@
use fst::IntoStreamer;
use heed::types::ByteSlice;
use roaring::RoaringBitmap;
use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds};
@ -132,11 +133,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
if let Some((key, mut docids)) = iter.next().transpose()? {
if key == word.as_ref() {
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
*must_remove = true;
} else {
} else if docids.len() != previous_len {
iter.put_current(key, &docids)?;
}
}
@ -168,27 +170,37 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// We delete the documents ids that are under the pairs of words,
// it is faster and use no memory to iterate over all the words pairs than
// to compute the cartesian product of every words of the deleted documents.
let mut iter = word_pair_proximity_docids.iter_mut(self.wtxn)?;
let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
while let Some(result) = iter.next() {
let ((w1, w2, prox), mut docids) = result?;
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
} else {
iter.put_current(&(w1, w2, prox), &docids)?;
} else if docids.len() != previous_len {
iter.put_current(bytes, &docids)?;
}
}
drop(iter);
// Remove the documents ids from the faceted documents ids.
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
for (field_id, _) in faceted_fields {
let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?;
docids.difference_with(&self.documents_ids);
self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?;
}
// We delete the documents ids that are under the facet field id values.
let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?;
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
} else {
} else if docids.len() != previous_len {
iter.put_current(bytes, &docids)?;
}
}

256
src/update/facets.rs Normal file
View File

@ -0,0 +1,256 @@
use std::cmp;
use std::fs::File;
use std::num::NonZeroUsize;
use grenad::{CompressionType, Reader, Writer, FileFuse};
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesEncode, Error};
use log::debug;
use num_traits::{Bounded, Zero};
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec};
use crate::Index;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
pub struct Facets<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
level_group_size: NonZeroUsize,
min_level_size: NonZeroUsize,
}
impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> {
Facets {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
level_group_size: NonZeroUsize::new(4).unwrap(),
min_level_size: NonZeroUsize::new(5).unwrap(),
}
}
pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self {
self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap();
self
}
pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self {
self.min_level_size = value;
self
}
pub fn execute(self) -> anyhow::Result<()> {
// We get the faceted fields to be able to create the facet levels.
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
for (field_id, facet_type) in faceted_fields {
let (content, documents_ids) = match facet_type {
FacetType::Integer => {
clear_field_levels::<i64, FacetLevelValueI64Codec>(
self.wtxn,
self.index.facet_field_id_value_docids,
field_id,
)?;
let documents_ids = compute_faceted_documents_ids(
self.wtxn,
self.index.facet_field_id_value_docids,
field_id,
)?;
let content = compute_facet_levels::<i64, FacetLevelValueI64Codec>(
self.wtxn,
self.index.facet_field_id_value_docids,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.level_group_size,
self.min_level_size,
field_id,
)?;
(Some(content), documents_ids)
},
FacetType::Float => {
clear_field_levels::<f64, FacetLevelValueF64Codec>(
self.wtxn,
self.index.facet_field_id_value_docids,
field_id,
)?;
let documents_ids = compute_faceted_documents_ids(
self.wtxn,
self.index.facet_field_id_value_docids,
field_id,
)?;
let content = compute_facet_levels::<f64, FacetLevelValueF64Codec>(
self.wtxn,
self.index.facet_field_id_value_docids,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.level_group_size,
self.min_level_size,
field_id,
)?;
(Some(content), documents_ids)
},
FacetType::String => {
let documents_ids = compute_faceted_documents_ids(
self.wtxn,
self.index.facet_field_id_value_docids,
field_id,
)?;
(None, documents_ids)
},
};
if let Some(content) = content {
write_into_lmdb_database(
self.wtxn,
*self.index.facet_field_id_value_docids.as_polymorph(),
content,
|_, _| anyhow::bail!("invalid facet level merging"),
WriteMethod::GetMergePut,
)?;
}
self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?;
}
Ok(())
}
}
fn clear_field_levels<'t, T: 't, KC>(
wtxn: &'t mut heed::RwTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8,
) -> heed::Result<()>
where
T: Copy + Bounded,
KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
{
let left = (field_id, 1, T::min_value(), T::min_value());
let right = (field_id, u8::MAX, T::max_value(), T::max_value());
let range = left..=right;
db.remap_key_type::<KC>().delete_range(wtxn, &range).map(drop)
}
fn compute_facet_levels<'t, T: 't, KC>(
rtxn: &'t heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
compression_type: CompressionType,
compression_level: Option<u32>,
shrink_size: Option<u64>,
level_group_size: NonZeroUsize,
min_level_size: NonZeroUsize,
field_id: u8,
) -> anyhow::Result<Reader<FileFuse>>
where
T: Copy + PartialEq + PartialOrd + Bounded + Zero,
KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>,
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
{
let first_level_size = db.prefix_iter(rtxn, &[field_id])?
.remap_types::<DecodeIgnore, DecodeIgnore>()
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
// therefore we write the facet levels entries into a grenad file before transfering them.
let mut writer = tempfile::tempfile().and_then(|file| {
create_writer(compression_type, compression_level, file)
})?;
let level_0_range = {
let left = (field_id, 0, T::min_value(), T::min_value());
let right = (field_id, 0, T::max_value(), T::max_value());
left..=right
};
// Groups sizes are always a power of the original level_group_size and therefore a group
// always maps groups of the previous level and never splits previous levels groups in half.
let group_size_iter = (1u8..)
.map(|l| (l, level_group_size.get().pow(l as u32)))
.take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
for (level, group_size) in group_size_iter {
let mut left = T::zero();
let mut right = T::zero();
let mut group_docids = RoaringBitmap::new();
let db = db.remap_key_type::<KC>();
for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() {
let ((_field_id, _level, value, _right), docids) = result?;
if i == 0 {
left = value;
} else if i % group_size == 0 {
// we found the first bound of the next group, we must store the left
// and right bounds associated with the docids.
write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?;
// We save the left bound for the new group and also reset the docids.
group_docids = RoaringBitmap::new();
left = value;
}
// The right bound is always the bound we run through.
group_docids.union_with(&docids);
right = value;
}
if !group_docids.is_empty() {
write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?;
}
}
writer_into_reader(writer, shrink_size)
}
fn compute_faceted_documents_ids(
rtxn: &heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8,
) -> anyhow::Result<RoaringBitmap>
{
let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &[field_id])? {
let (_key, docids) = result?;
documents_ids.union_with(&docids);
}
Ok(documents_ids)
}
fn write_entry<T, KC>(
writer: &mut Writer<File>,
field_id: u8,
level: u8,
left: T,
right: T,
ids: &RoaringBitmap,
) -> anyhow::Result<()>
where
KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>,
{
let key = (field_id, level, left, right);
let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?;
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;
writer.insert(&key, &data)?;
Ok(())
}

View File

@ -2,6 +2,7 @@ use std::borrow::Cow;
use std::collections::HashSet;
use std::fs::File;
use std::io::{self, Seek, SeekFrom};
use std::num::NonZeroUsize;
use std::sync::mpsc::sync_channel;
use std::time::Instant;
@ -15,7 +16,7 @@ use rayon::prelude::*;
use rayon::ThreadPool;
use crate::index::Index;
use crate::update::UpdateIndexingStep;
use crate::update::{Facets, UpdateIndexingStep};
use self::store::{Store, Readers};
use self::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
@ -31,12 +32,12 @@ mod store;
mod transform;
#[derive(Debug, Copy, Clone)]
enum WriteMethod {
pub enum WriteMethod {
Append,
GetMergePut,
}
fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
let mut builder = Writer::builder();
builder.compression_type(typ);
if let Some(level) = level {
@ -45,7 +46,7 @@ fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Re
builder.build(file)
}
fn create_sorter(
pub fn create_sorter(
merge: MergeFn,
chunk_compression_type: CompressionType,
chunk_compression_level: Option<u32>,
@ -71,7 +72,7 @@ fn create_sorter(
builder.build()
}
fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> {
pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> {
let mut file = writer.into_inner()?;
file.seek(SeekFrom::Start(0))?;
let file = if let Some(shrink_size) = shrink_size {
@ -82,13 +83,13 @@ fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow:
Reader::new(file).map_err(Into::into)
}
fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> {
pub fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> {
let mut builder = Merger::builder(merge);
builder.extend(sources);
builder.build()
}
fn merge_into_lmdb_database(
pub fn merge_into_lmdb_database(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
sources: Vec<Reader<FileFuse>>,
@ -132,7 +133,7 @@ fn merge_into_lmdb_database(
Ok(())
}
fn write_into_lmdb_database(
pub fn write_into_lmdb_database(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
mut reader: Reader<FileFuse>,
@ -157,7 +158,7 @@ fn write_into_lmdb_database(
match iter.next().transpose()? {
Some((key, old_val)) if key == k => {
let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
let val = merge(k, &vals).expect("merge failed");
let val = merge(k, &vals)?;
iter.put_current(k, &val)?;
},
_ => {
@ -207,6 +208,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) thread_pool: Option<&'a ThreadPool>,
facet_level_group_size: Option<NonZeroUsize>,
facet_min_level_size: Option<NonZeroUsize>,
update_method: IndexDocumentsMethod,
update_format: UpdateFormat,
autogenerate_docids: bool,
@ -225,6 +228,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
thread_pool: None,
facet_level_group_size: None,
facet_min_level_size: None,
update_method: IndexDocumentsMethod::ReplaceDocuments,
update_format: UpdateFormat::Json,
autogenerate_docids: true,
@ -308,8 +313,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
thread_pool: self.thread_pool,
};
let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?;
debug!("documents to delete {:?}", replaced_documents_ids);
deletion_builder.delete_documents(&replaced_documents_ids);
let _deleted_documents_count = deletion_builder.execute()?;
let deleted_documents_count = deletion_builder.execute()?;
debug!("{} documents actually deleted", deleted_documents_count);
}
let mmap;
@ -327,7 +334,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
enum DatabaseType {
Main,
WordDocids,
FacetValuesDocids,
FacetLevel0ValuesDocids,
}
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
@ -427,7 +434,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
(DatabaseType::Main, main_readers, main_merge as MergeFn),
(DatabaseType::WordDocids, word_docids_readers, word_docids_merge),
(
DatabaseType::FacetValuesDocids,
DatabaseType::FacetLevel0ValuesDocids,
facet_field_value_docids_readers,
facet_field_value_docids_merge,
),
@ -557,7 +564,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method,
)?;
},
DatabaseType::FacetValuesDocids => {
DatabaseType::FacetLevel0ValuesDocids => {
debug!("Writing the facet values docids into LMDB on disk...");
let db = *self.index.facet_field_id_value_docids.as_polymorph();
write_into_lmdb_database(
@ -577,6 +584,18 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
});
}
let mut builder = Facets::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
if let Some(value) = self.facet_level_group_size {
builder.level_group_size(value);
}
if let Some(value) = self.facet_min_level_size {
builder.min_level_size(value);
}
builder.execute()?;
debug_assert_eq!(database_count, total_databases);
info!("Transform output indexed in {:.02?}", before_indexing.elapsed());

View File

@ -19,7 +19,7 @@ use tempfile::tempfile;
use crate::facet::FacetType;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
use crate::tokenizer::{simple_tokenizer, only_token};
use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId};
@ -337,8 +337,8 @@ impl Store {
for ((field_id, value), docids) in iter {
let result = match value {
String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned),
Float(f) => FacetValueF64Codec::bytes_encode(&(field_id, *f)).map(Cow::into_owned),
Integer(i) => FacetValueI64Codec::bytes_encode(&(field_id, i)).map(Cow::into_owned),
Float(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned),
Integer(i) => FacetLevelValueI64Codec::bytes_encode(&(field_id, 0, i, i)).map(Cow::into_owned),
};
let key = result.context("could not serialize facet key")?;
let bytes = CboRoaringBitmapCodec::bytes_encode(&docids)
@ -399,7 +399,7 @@ impl Store {
// We skip documents that must not be indexed by this thread.
if count % num_threads == thread_index {
// This is a log routine that we do every `log_every_n` documents.
if log_every_n.map_or(false, |len| count % len == 0) {
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
progress_callback(UpdateIndexingStep::IndexDocuments {
documents_seen: count,
@ -571,7 +571,10 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec
Value::Null => Ok(()),
Value::Bool(b) => Ok(output.push(Integer(*b as i64))),
Value::Number(number) => match ftype {
FacetType::String => bail!("invalid facet type, expecting {} found number", ftype),
FacetType::String => {
let string = SmallString32::from(number.to_string());
Ok(output.push(String(string)))
},
FacetType::Float => match number.as_f64() {
Some(float) => Ok(output.push(Float(OrderedFloat(float)))),
None => bail!("invalid facet type, expecting {} found integer", ftype),
@ -586,7 +589,7 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec
},
},
Value::String(string) => {
let string = string.trim();
let string = string.trim().to_lowercase();
if string.is_empty() { return Ok(()) }
match ftype {
FacetType::String => {

View File

@ -1,6 +1,7 @@
mod available_documents_ids;
mod clear_documents;
mod delete_documents;
mod facets;
mod index_documents;
mod settings;
mod update_builder;
@ -11,6 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
pub use self::clear_documents::ClearDocuments;
pub use self::delete_documents::DeleteDocuments;
pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
pub use self::facets::Facets;
pub use self::settings::Settings;
pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep;

View File

@ -412,6 +412,23 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let fields_ids = index.faceted_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer });
// Only count the field_id 0 and level 0 facet values.
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count();
assert_eq!(count, 3);
drop(rtxn);
// Index a little more documents with new and current facets values.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count();
assert_eq!(count, 4);
drop(rtxn);
}
}

View File

@ -2,7 +2,7 @@ use grenad::CompressionType;
use rayon::ThreadPool;
use crate::Index;
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings};
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
pub struct UpdateBuilder<'a> {
pub(crate) log_every_n: Option<usize>,
@ -118,4 +118,19 @@ impl<'a> UpdateBuilder<'a> {
builder
}
pub fn facets<'t, 'u, 'i>(
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> Facets<'t, 'u, 'i>
{
let mut builder = Facets::new(wtxn, index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
builder
}
}