mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-25 06:00:08 +01:00
Merge pull request #66 from meilisearch/show-available-facets
Expose an API to compute facets distribution
This commit is contained in:
commit
fa0cc2dc13
4
Cargo.lock
generated
4
Cargo.lock
generated
@ -1211,9 +1211,9 @@ checksum = "21215c1b9d8f7832b433255bd9eea3e2779aa55b21b2f8e13aad62c74749b237"
|
||||
|
||||
[[package]]
|
||||
name = "roaring"
|
||||
version = "0.6.3"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f12bdbc3b9b2fd12148ee9f97f9e36438f1e84d3ce47fec0ad6b4bfbb62b3a35"
|
||||
checksum = "4d60b41c8f25d07cecab125cb46ebbf234fc055effc61ca2392a3ef4f9422304"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
@ -31,7 +31,7 @@ ordered-float = "2.0.0"
|
||||
rayon = "1.3.1"
|
||||
regex = "1.4.2"
|
||||
ringtail = "0.3.0"
|
||||
roaring = "0.6.1"
|
||||
roaring = "0.6.4"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = { version = "1.0.59", features = ["preserve_order"] }
|
||||
slice-group-by = "0.2.6"
|
||||
|
627
http-ui/Cargo.lock
generated
627
http-ui/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -22,6 +22,7 @@ tempfile = "3.1.0"
|
||||
askama = "0.10.1"
|
||||
askama_warp = "0.10.0"
|
||||
bytes = "0.5.6"
|
||||
either = "1.6.1"
|
||||
flate2 = "1.0.19"
|
||||
futures = "0.3.6"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
|
@ -1,9 +1,9 @@
|
||||
var request = null;
|
||||
var timeoutID = null;
|
||||
|
||||
$('#query, #facet').on('input', function () {
|
||||
$('#query, #filters').on('input', function () {
|
||||
var query = $('#query').val();
|
||||
var facet = $('#facet').val();
|
||||
var filters = $('#filters').val();
|
||||
var timeoutMs = 100;
|
||||
|
||||
if (timeoutID !== null) {
|
||||
@ -15,18 +15,35 @@ $('#query, #facet').on('input', function () {
|
||||
type: "POST",
|
||||
url: "query",
|
||||
contentType: 'application/json',
|
||||
data: JSON.stringify({ 'query': query, 'facetCondition': facet }),
|
||||
data: JSON.stringify({
|
||||
'query': query,
|
||||
'filters': filters,
|
||||
"facetDistribution": true,
|
||||
}),
|
||||
contentType: 'application/json',
|
||||
success: function (data, textStatus, request) {
|
||||
results.innerHTML = '';
|
||||
facets.innerHTML = '';
|
||||
|
||||
let timeSpent = request.getResponseHeader('Time-Ms');
|
||||
let numberOfDocuments = data.length;
|
||||
count.innerHTML = `${numberOfDocuments}`;
|
||||
let numberOfDocuments = data.documents.length;
|
||||
count.innerHTML = data.numberOfCandidates.toLocaleString();
|
||||
time.innerHTML = `${timeSpent}ms`;
|
||||
time.classList.remove('fade-in-out');
|
||||
|
||||
for (element of data) {
|
||||
for (facet_name in data.facets) {
|
||||
for (value in data.facets[facet_name]) {
|
||||
const elem = document.createElement('span');
|
||||
const count = data.facets[facet_name][value];
|
||||
elem.classList.add("tag");
|
||||
elem.setAttribute('data-name', facet_name);
|
||||
elem.setAttribute('data-value', value);
|
||||
elem.innerHTML = `${facet_name}:${value} (${count})`;
|
||||
facets.appendChild(elem);
|
||||
}
|
||||
}
|
||||
|
||||
for (element of data.documents) {
|
||||
const elem = document.createElement('li');
|
||||
elem.classList.add("document");
|
||||
|
||||
@ -54,6 +71,19 @@ $('#query, #facet').on('input', function () {
|
||||
results.appendChild(elem);
|
||||
}
|
||||
|
||||
// When we click on a tag we append the facet value
|
||||
// at the end of the facet query.
|
||||
$('#facets .tag').on('click', function () {
|
||||
let name = $(this).attr("data-name");
|
||||
let value = $(this).attr("data-value");
|
||||
|
||||
let facet_query = $('#filters').val().trim();
|
||||
if (facet_query === "") {
|
||||
$('#filters').val(`${name} = "${value}"`).trigger('input');
|
||||
} else {
|
||||
$('#filters').val(`${facet_query} AND ${name} = "${value}"`).trigger('input');
|
||||
}
|
||||
});
|
||||
},
|
||||
beforeSend: function () {
|
||||
if (request !== null) {
|
||||
@ -65,6 +95,25 @@ $('#query, #facet').on('input', function () {
|
||||
}, timeoutMs);
|
||||
});
|
||||
|
||||
// Returns the symmetric difference of two arrays: every element that is
// present in exactly one of `arr1` and `arr2`, in concatenation order.
//
// The predicate must return a boolean. Returning the element itself (as the
// previous version did) makes `filter` drop falsy members such as `0`, `""`,
// `null` or `false` even when they belong to the difference.
function diffArray(arr1, arr2) {
  return arr1.concat(arr2).filter(function (val) {
    return !(arr1.includes(val) && arr2.includes(val));
  });
}
|
||||
|
||||
// Converts an object mapping facet names to iterables of selected values
// into an array of arrays of "name:value" strings, one inner array per
// facet name.
function selectedFacetsToArray(facets_obj) {
  const groups = [];
  for (const name in facets_obj) {
    const rules = [...facets_obj[name]].map((value) => `${name}:${value}`);
    groups.push(rules);
  }
  return groups;
}
|
||||
|
||||
// Make the number of documents a little bit prettier
|
||||
$('#docs-count').text(function(index, text) {
|
||||
return parseInt(text).toLocaleString()
|
||||
@ -75,8 +124,8 @@ $('#db-size').text(function(index, text) {
|
||||
return filesize(parseInt(text))
|
||||
});
|
||||
|
||||
// We trigger the input when we load the script, this way
|
||||
// we execute a placeholder search when the input is empty.
|
||||
// We trigger the input when we load the script.
|
||||
$(window).on('load', function () {
|
||||
// We execute a placeholder search when the input is empty.
|
||||
$('#query').trigger('input');
|
||||
});
|
||||
|
@ -4,6 +4,23 @@
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
#facets .tag {
|
||||
margin-right: 1em;
|
||||
margin-bottom: 1em;
|
||||
}
|
||||
|
||||
#facets {
|
||||
max-width: 900px;
|
||||
margin: 20px auto 0 auto;
|
||||
padding: 0;
|
||||
max-height: 16em;
|
||||
overflow: scroll;
|
||||
}
|
||||
|
||||
#facets .tag:hover {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
#logo-white {
|
||||
display: none;
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::fmt::Display;
|
||||
use std::fs::{File, create_dir_all};
|
||||
use std::net::SocketAddr;
|
||||
@ -11,6 +11,7 @@ use std::{mem, io};
|
||||
|
||||
use askama_warp::Template;
|
||||
use byte_unit::Byte;
|
||||
use either::Either;
|
||||
use flate2::read::GzDecoder;
|
||||
use futures::stream;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
@ -28,6 +29,7 @@ use warp::filters::ws::Message;
|
||||
use warp::{Filter, http::Response};
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||
|
||||
use milli::facet::FacetValue;
|
||||
use milli::update::UpdateIndexingStep::*;
|
||||
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
|
||||
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
|
||||
@ -620,12 +622,38 @@ async fn main() -> anyhow::Result<()> {
|
||||
.body(include_str!("../public/logo-black.svg"))
|
||||
);
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum UntaggedEither<L, R> {
|
||||
Left(L),
|
||||
Right(R),
|
||||
}
|
||||
|
||||
impl<L, R> From<UntaggedEither<L, R>> for Either<L, R> {
|
||||
fn from(value: UntaggedEither<L, R>) -> Either<L, R> {
|
||||
match value {
|
||||
UntaggedEither::Left(left) => Either::Left(left),
|
||||
UntaggedEither::Right(right) => Either::Right(right),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct QueryBody {
|
||||
query: Option<String>,
|
||||
facet_condition: Option<String>,
|
||||
filters: Option<String>,
|
||||
facet_filters: Option<Vec<UntaggedEither<Vec<String>, String>>>,
|
||||
facet_distribution: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct Answer {
|
||||
documents: Vec<Map<String, Value>>,
|
||||
number_of_candidates: u64,
|
||||
facets: BTreeMap<String, BTreeMap<FacetValue, u64>>,
|
||||
}
|
||||
|
||||
let disable_highlighting = opt.disable_highlighting;
|
||||
@ -642,14 +670,42 @@ async fn main() -> anyhow::Result<()> {
|
||||
if let Some(query) = query.query {
|
||||
search.query(query);
|
||||
}
|
||||
if let Some(condition) = query.facet_condition {
|
||||
if !condition.trim().is_empty() {
|
||||
let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap();
|
||||
search.facet_condition(condition);
|
||||
}
|
||||
|
||||
let filters = match query.filters {
|
||||
Some(condition) if !condition.trim().is_empty() => {
|
||||
Some(FacetCondition::from_str(&rtxn, &index, &condition).unwrap())
|
||||
},
|
||||
_otherwise => None,
|
||||
};
|
||||
|
||||
let facet_filters = match query.facet_filters {
|
||||
Some(array) => {
|
||||
let eithers = array.into_iter().map(Into::into);
|
||||
FacetCondition::from_array(&rtxn, &index, eithers).unwrap()
|
||||
},
|
||||
_otherwise => None,
|
||||
};
|
||||
|
||||
let condition = match (filters, facet_filters) {
|
||||
(Some(filters), Some(facet_filters)) => {
|
||||
Some(FacetCondition::And(Box::new(filters), Box::new(facet_filters)))
|
||||
},
|
||||
(Some(condition), None) | (None, Some(condition)) => Some(condition),
|
||||
_otherwise => None,
|
||||
};
|
||||
|
||||
if let Some(condition) = condition {
|
||||
search.facet_condition(condition);
|
||||
}
|
||||
|
||||
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
|
||||
let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap();
|
||||
|
||||
let number_of_candidates = candidates.len();
|
||||
let facets = if query.facet_distribution == Some(true) {
|
||||
Some(index.facets_distribution(&rtxn).candidates(candidates).execute().unwrap())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let mut documents = Vec::new();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
@ -674,10 +730,16 @@ async fn main() -> anyhow::Result<()> {
|
||||
documents.push(object);
|
||||
}
|
||||
|
||||
let answer = Answer {
|
||||
documents,
|
||||
number_of_candidates,
|
||||
facets: facets.unwrap_or_default(),
|
||||
};
|
||||
|
||||
Response::builder()
|
||||
.header("Content-Type", "application/json")
|
||||
.header("Time-Ms", before_search.elapsed().as_millis().to_string())
|
||||
.body(serde_json::to_string(&documents).unwrap())
|
||||
.body(serde_json::to_string(&answer).unwrap())
|
||||
});
|
||||
|
||||
let index_cloned = index.clone();
|
||||
|
@ -56,7 +56,7 @@
|
||||
<div class="level-item">
|
||||
<div class="field has-addons has-addons-right">
|
||||
<input id="query" class="input" type="text" autofocus placeholder="e.g. George Clooney">
|
||||
<input id="facet" class="input" type="text" placeholder="facet filter like released >= 1577836800">
|
||||
<input id="filters" class="input" type="text" placeholder="filters like released >= 1577836800">
|
||||
</div>
|
||||
</div>
|
||||
<div class="level-item"></div>
|
||||
@ -66,7 +66,7 @@
|
||||
<nav class="level-right">
|
||||
<div class="level-item has-text-centered">
|
||||
<div>
|
||||
<p class="heading">Documents</p>
|
||||
<p class="heading">Candidates</p>
|
||||
<p id="count" class="title">0</p>
|
||||
</div>
|
||||
</div>
|
||||
@ -84,6 +84,10 @@
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="facets">
|
||||
<!-- facet values -->
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<ol id="results" class="content">
|
||||
<!-- documents matching requests -->
|
||||
|
60
src/facet/facet_value.rs
Normal file
60
src/facet/facet_value.rs
Normal file
@ -0,0 +1,60 @@
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde::{Serialize, Serializer};
|
||||
|
||||
#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub enum FacetValue {
|
||||
String(String),
|
||||
Float(OrderedFloat<f64>),
|
||||
Integer(i64),
|
||||
}
|
||||
|
||||
impl From<String> for FacetValue {
|
||||
fn from(string: String) -> FacetValue {
|
||||
FacetValue::String(string)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&str> for FacetValue {
|
||||
fn from(string: &str) -> FacetValue {
|
||||
FacetValue::String(string.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<f64> for FacetValue {
|
||||
fn from(float: f64) -> FacetValue {
|
||||
FacetValue::Float(OrderedFloat(float))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OrderedFloat<f64>> for FacetValue {
    /// Stores an already wrapped float in the `Float` variant.
    fn from(value: OrderedFloat<f64>) -> Self {
        Self::Float(value)
    }
}
|
||||
|
||||
impl From<i64> for FacetValue {
|
||||
fn from(integer: i64) -> FacetValue {
|
||||
FacetValue::Integer(integer)
|
||||
}
|
||||
}
|
||||
|
||||
/// We implement Serialize ourselves because we need to always serialize it as a string,
|
||||
/// JSON object keys must be strings not numbers.
|
||||
impl Serialize for FacetValue {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
match self {
|
||||
FacetValue::String(string) => serializer.serialize_str(string),
|
||||
FacetValue::Float(float) => {
|
||||
let string = float.to_string();
|
||||
serializer.serialize_str(&string)
|
||||
},
|
||||
FacetValue::Integer(integer) => {
|
||||
let string = integer.to_string();
|
||||
serializer.serialize_str(&string)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
@ -1,4 +1,6 @@
|
||||
mod facet_type;
|
||||
mod facet_value;
|
||||
pub mod value_encoding;
|
||||
|
||||
pub use self::facet_type::FacetType;
|
||||
pub use self::facet_value::FacetValue;
|
||||
|
@ -9,7 +9,7 @@ use roaring::RoaringBitmap;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
use crate::{default_criteria, Criterion, Search};
|
||||
use crate::{default_criteria, Criterion, Search, FacetDistribution};
|
||||
use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
|
||||
use crate::{
|
||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
|
||||
@ -351,6 +351,10 @@ impl Index {
|
||||
Ok(self.documents_ids(rtxn).map(|docids| docids.len() as usize)?)
|
||||
}
|
||||
|
||||
pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
|
||||
FacetDistribution::new(rtxn, self)
|
||||
}
|
||||
|
||||
pub fn search<'a>(&'a self, rtxn: &'a RoTxn) -> Search<'a> {
|
||||
Search::new(rtxn, self)
|
||||
}
|
||||
|
@ -28,7 +28,7 @@ pub use self::fields_ids_map::FieldsIdsMap;
|
||||
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{Search, FacetCondition, SearchResult};
|
||||
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
|
||||
pub use self::update_store::UpdateStore;
|
||||
|
||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||
|
@ -3,6 +3,8 @@ use std::fmt::Debug;
|
||||
use std::ops::Bound::{self, Included, Excluded};
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Context;
|
||||
use either::Either;
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use log::debug;
|
||||
use num_traits::Bounded;
|
||||
@ -141,6 +143,85 @@ where T: FromStr,
|
||||
}
|
||||
|
||||
impl FacetCondition {
|
||||
pub fn from_array<I, J, A, B>(
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
array: I,
|
||||
) -> anyhow::Result<Option<FacetCondition>>
|
||||
where I: IntoIterator<Item=Either<J, B>>,
|
||||
J: IntoIterator<Item=A>,
|
||||
A: AsRef<str>,
|
||||
B: AsRef<str>,
|
||||
{
|
||||
fn facet_condition(
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
faceted_fields: &HashMap<String, FacetType>,
|
||||
key: &str,
|
||||
value: &str,
|
||||
) -> anyhow::Result<FacetCondition>
|
||||
{
|
||||
let fid = fields_ids_map.id(key).with_context(|| {
|
||||
format!("{:?} isn't present in the fields ids map", key)
|
||||
})?;
|
||||
let ftype = faceted_fields.get(key).copied().with_context(|| {
|
||||
format!("{:?} isn't a faceted field", key)
|
||||
})?;
|
||||
let (neg, value) = match value.trim().strip_prefix('-') {
|
||||
Some(value) => (true, value.trim()),
|
||||
None => (false, value.trim()),
|
||||
};
|
||||
|
||||
let operator = match ftype {
|
||||
FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)),
|
||||
FacetType::Float => OperatorF64(fid, FacetNumberOperator::Equal(value.parse()?)),
|
||||
FacetType::Integer => OperatorI64(fid, FacetNumberOperator::Equal(value.parse()?)),
|
||||
};
|
||||
|
||||
if neg { Ok(operator.negate()) } else { Ok(operator) }
|
||||
}
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let faceted_fields = index.faceted_fields(rtxn)?;
|
||||
let mut ands = None;
|
||||
|
||||
for either in array {
|
||||
match either {
|
||||
Either::Left(array) => {
|
||||
let mut ors = None;
|
||||
for rule in array {
|
||||
let mut iter = rule.as_ref().splitn(2, ':');
|
||||
let key = iter.next().context("missing facet condition key")?;
|
||||
let value = iter.next().context("missing facet condition value")?;
|
||||
let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
|
||||
ors = match ors.take() {
|
||||
Some(ors) => Some(Or(Box::new(ors), Box::new(condition))),
|
||||
None => Some(condition),
|
||||
};
|
||||
}
|
||||
|
||||
if let Some(rule) = ors {
|
||||
ands = match ands.take() {
|
||||
Some(ands) => Some(And(Box::new(ands), Box::new(rule))),
|
||||
None => Some(rule),
|
||||
};
|
||||
}
|
||||
},
|
||||
Either::Right(rule) => {
|
||||
let mut iter = rule.as_ref().splitn(2, ':');
|
||||
let key = iter.next().context("missing facet condition key")?;
|
||||
let value = iter.next().context("missing facet condition value")?;
|
||||
let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
|
||||
ands = match ands.take() {
|
||||
Some(ands) => Some(And(Box::new(ands), Box::new(condition))),
|
||||
None => Some(condition),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ands)
|
||||
}
|
||||
|
||||
pub fn from_str(
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
@ -641,4 +722,35 @@ mod tests {
|
||||
);
|
||||
assert_eq!(condition, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn from_array() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
// Set the faceted fields to be the channel.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index);
|
||||
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order
|
||||
builder.set_faceted_fields(hashmap!{
|
||||
"channel".into() => "string".into(),
|
||||
"timestamp".into() => "integer".into(),
|
||||
});
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Test that the facet condition is correctly generated.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let condition = FacetCondition::from_array(
|
||||
&rtxn, &index,
|
||||
vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])],
|
||||
).unwrap().unwrap();
|
||||
let expected = FacetCondition::from_str(
|
||||
&rtxn, &index,
|
||||
"channel = gotaga AND (timestamp = 44 OR channel != ponce)",
|
||||
).unwrap();
|
||||
assert_eq!(condition, expected);
|
||||
}
|
||||
}
|
||||
|
260
src/search/facet/facet_distribution.rs
Normal file
260
src/search/facet/facet_distribution.rs
Normal file
@ -0,0 +1,260 @@
|
||||
use std::collections::{HashSet, BTreeMap};
|
||||
use std::ops::Bound::Unbounded;
|
||||
use std::{cmp, fmt};
|
||||
|
||||
use anyhow::Context;
|
||||
use heed::BytesDecode;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::facet::{FacetType, FacetValue};
|
||||
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
|
||||
use crate::search::facet::{FacetIter, FacetRange};
|
||||
use crate::{Index, FieldId, DocumentId};
|
||||
|
||||
/// The default number of values by facets that will
|
||||
/// be fetched from the key-value store.
|
||||
const DEFAULT_VALUES_BY_FACET: usize = 100;
|
||||
|
||||
/// The hard limit in the number of values by facets that will be fetched from
|
||||
/// the key-value store. Searching for more values could slow down the engine.
|
||||
const MAX_VALUES_BY_FACET: usize = 1000;
|
||||
|
||||
/// Threshold on the number of candidates that will make
|
||||
/// the system choose between one algorithm or another.
|
||||
const CANDIDATES_THRESHOLD: u64 = 1000;
|
||||
|
||||
pub struct FacetDistribution<'a> {
|
||||
facets: Option<HashSet<String>>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
max_values_by_facet: usize,
|
||||
rtxn: &'a heed::RoTxn<'a>,
|
||||
index: &'a Index,
|
||||
}
|
||||
|
||||
impl<'a> FacetDistribution<'a> {
|
||||
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> {
|
||||
FacetDistribution {
|
||||
facets: None,
|
||||
candidates: None,
|
||||
max_values_by_facet: DEFAULT_VALUES_BY_FACET,
|
||||
rtxn,
|
||||
index,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn facets<I: IntoIterator<Item=A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self {
|
||||
self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
|
||||
self.candidates = Some(candidates);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self {
|
||||
self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET);
|
||||
self
|
||||
}
|
||||
|
||||
/// There is a small amount of candidates OR we ask for facet string values so we
|
||||
/// decide to iterate over the facet values of each one of them, one by one.
|
||||
fn facet_values_from_documents(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
candidates: &RoaringBitmap,
|
||||
) -> heed::Result<BTreeMap<FacetValue, u64>>
|
||||
{
|
||||
fn fetch_facet_values<'t, KC, K: 't>(
|
||||
index: &Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
candidates: &RoaringBitmap,
|
||||
) -> heed::Result<BTreeMap<FacetValue, u64>>
|
||||
where
|
||||
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
|
||||
K: Into<FacetValue>,
|
||||
{
|
||||
let mut facet_values = BTreeMap::new();
|
||||
let mut key_buffer = vec![field_id];
|
||||
|
||||
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
|
||||
key_buffer.truncate(1);
|
||||
key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
let iter = index.field_id_docid_facet_values
|
||||
.prefix_iter(rtxn, &key_buffer)?
|
||||
.remap_key_type::<KC>();
|
||||
|
||||
for result in iter {
|
||||
let ((_, _, value), ()) = result?;
|
||||
*facet_values.entry(value.into()).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(facet_values)
|
||||
}
|
||||
|
||||
let index = self.index;
|
||||
let rtxn = self.rtxn;
|
||||
match facet_type {
|
||||
FacetType::String => {
|
||||
fetch_facet_values::<FieldDocIdFacetStringCodec, _>(index, rtxn, field_id, candidates)
|
||||
},
|
||||
FacetType::Float => {
|
||||
fetch_facet_values::<FieldDocIdFacetF64Codec, _>(index, rtxn, field_id, candidates)
|
||||
},
|
||||
FacetType::Integer => {
|
||||
fetch_facet_values::<FieldDocIdFacetI64Codec, _>(index, rtxn, field_id, candidates)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// There is too much documents, we use the facet levels to move throught
|
||||
/// the facet values, to find the candidates and values associated.
|
||||
fn facet_values_from_facet_levels(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
candidates: &RoaringBitmap,
|
||||
) -> heed::Result<BTreeMap<FacetValue, u64>>
|
||||
{
|
||||
let iter = match facet_type {
|
||||
FacetType::String => unreachable!(),
|
||||
FacetType::Float => {
|
||||
let iter = FacetIter::<f64, FacetLevelValueF64Codec>::new_non_reducing(
|
||||
self.rtxn, self.index, field_id, candidates.clone(),
|
||||
)?;
|
||||
let iter = iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)));
|
||||
Box::new(iter) as Box::<dyn Iterator<Item=_>>
|
||||
},
|
||||
FacetType::Integer => {
|
||||
let iter = FacetIter::<i64, FacetLevelValueI64Codec>::new_non_reducing(
|
||||
self.rtxn, self.index, field_id, candidates.clone(),
|
||||
)?;
|
||||
Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))))
|
||||
},
|
||||
};
|
||||
|
||||
let mut facet_values = BTreeMap::new();
|
||||
for result in iter {
|
||||
let (value, mut docids) = result?;
|
||||
docids.intersect_with(candidates);
|
||||
if !docids.is_empty() {
|
||||
facet_values.insert(value, docids.len());
|
||||
}
|
||||
if facet_values.len() == self.max_values_by_facet {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(facet_values)
|
||||
}
|
||||
|
||||
/// Placeholder search, a.k.a. no candidates were specified. We iterate throught the
|
||||
/// facet values one by one and iterate on the facet level 0 for numbers.
|
||||
fn facet_values_from_raw_facet_database(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
) -> heed::Result<BTreeMap<FacetValue, u64>>
|
||||
{
|
||||
let db = self.index.facet_field_id_value_docids;
|
||||
let level = 0;
|
||||
let iter = match facet_type {
|
||||
FacetType::String => {
|
||||
let iter = db
|
||||
.prefix_iter(self.rtxn, &[field_id])?
|
||||
.remap_key_type::<FacetValueStringCodec>()
|
||||
.map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids)));
|
||||
Box::new(iter) as Box::<dyn Iterator<Item=_>>
|
||||
},
|
||||
FacetType::Float => {
|
||||
let db = db.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let range = FacetRange::<f64, _>::new(
|
||||
self.rtxn, db, field_id, level, Unbounded, Unbounded,
|
||||
)?;
|
||||
Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids))))
|
||||
},
|
||||
FacetType::Integer => {
|
||||
let db = db.remap_key_type::<FacetLevelValueI64Codec>();
|
||||
let range = FacetRange::<i64, _>::new(
|
||||
self.rtxn, db, field_id, level, Unbounded, Unbounded,
|
||||
)?;
|
||||
Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids))))
|
||||
},
|
||||
};
|
||||
|
||||
let mut facet_values = BTreeMap::new();
|
||||
for result in iter {
|
||||
let (value, docids) = result?;
|
||||
facet_values.insert(value, docids.len());
|
||||
if facet_values.len() == self.max_values_by_facet {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(facet_values)
|
||||
}
|
||||
|
||||
fn facet_values(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
facet_type: FacetType,
|
||||
) -> heed::Result<BTreeMap<FacetValue, u64>>
|
||||
{
|
||||
if let Some(candidates) = self.candidates.as_ref() {
|
||||
// Classic search, candidates were specified, we must return facet values only related
|
||||
// to those candidates. We also enter here for facet strings for performance reasons.
|
||||
if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String {
|
||||
self.facet_values_from_documents(field_id, facet_type, candidates)
|
||||
} else {
|
||||
self.facet_values_from_facet_levels(field_id, facet_type, candidates)
|
||||
}
|
||||
} else {
|
||||
self.facet_values_from_raw_facet_database(field_id, facet_type)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(&self) -> anyhow::Result<BTreeMap<String, BTreeMap<FacetValue, u64>>> {
|
||||
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let faceted_fields = self.index.faceted_fields(self.rtxn)?;
|
||||
let fields_ids: Vec<_> = match &self.facets {
|
||||
Some(names) => names
|
||||
.iter()
|
||||
.filter_map(|n| faceted_fields.get(n).map(|t| (n.to_string(), *t)))
|
||||
.collect(),
|
||||
None => faceted_fields.into_iter().collect(),
|
||||
};
|
||||
|
||||
let mut facets_values = BTreeMap::new();
|
||||
for (name, ftype) in fields_ids {
|
||||
let fid = fields_ids_map.id(&name).with_context(|| {
|
||||
format!("missing field name {:?} from the fields id map", name)
|
||||
})?;
|
||||
let values = self.facet_values(fid, ftype)?;
|
||||
facets_values.insert(name, values);
|
||||
}
|
||||
|
||||
Ok(facets_values)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for FacetDistribution<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let FacetDistribution {
|
||||
facets,
|
||||
candidates,
|
||||
max_values_by_facet,
|
||||
rtxn: _,
|
||||
index: _,
|
||||
} = self;
|
||||
|
||||
f.debug_struct("FacetDistribution")
|
||||
.field("facets", facets)
|
||||
.field("candidates", candidates)
|
||||
.field("max_values_by_facet", max_values_by_facet)
|
||||
.finish()
|
||||
}
|
||||
}
|
@ -13,11 +13,13 @@ use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::{Index, FieldId};
|
||||
|
||||
pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator};
|
||||
pub use self::facet_distribution::FacetDistribution;
|
||||
|
||||
mod facet_condition;
|
||||
mod facet_distribution;
|
||||
mod parser;
|
||||
|
||||
struct FacetRange<'t, T: 't, KC> {
|
||||
pub struct FacetRange<'t, T: 't, KC> {
|
||||
iter: RoRange<'t, KC, LazyDecode<CboRoaringBitmapCodec>>,
|
||||
end: Bound<T>,
|
||||
}
|
||||
@ -27,7 +29,7 @@ where
|
||||
KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>,
|
||||
T: PartialOrd + Copy + Bounded,
|
||||
{
|
||||
fn new(
|
||||
pub fn new(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<KC, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
@ -78,7 +80,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
struct FacetRevRange<'t, T: 't, KC> {
|
||||
pub struct FacetRevRange<'t, T: 't, KC> {
|
||||
iter: RoRevRange<'t, KC, LazyDecode<CboRoaringBitmapCodec>>,
|
||||
end: Bound<T>,
|
||||
}
|
||||
@ -88,7 +90,7 @@ where
|
||||
KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>,
|
||||
T: PartialOrd + Copy + Bounded,
|
||||
{
|
||||
fn new(
|
||||
pub fn new(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<KC, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
@ -145,6 +147,7 @@ pub struct FacetIter<'t, T: 't, KC> {
|
||||
db: Database<KC, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
level_iters: Vec<(RoaringBitmap, Either<FacetRange<'t, T, KC>, FacetRevRange<'t, T, KC>>)>,
|
||||
must_reduce: bool,
|
||||
}
|
||||
|
||||
impl<'t, T, KC> FacetIter<'t, T, KC>
|
||||
@ -153,7 +156,10 @@ where
|
||||
KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>,
|
||||
T: PartialOrd + Copy + Bounded,
|
||||
{
|
||||
pub fn new(
|
||||
/// Create a `FacetIter` that will iterate on the different facet entries
|
||||
/// (facet value + documents ids) and that will reduce the given documents ids
|
||||
/// while iterating on the different facet levels.
|
||||
pub fn new_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
@ -163,10 +169,14 @@ where
|
||||
let db = index.facet_field_id_value_docids.remap_key_type::<KC>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Left(highest_iter))] })
|
||||
let level_iters = vec![(documents_ids, Left(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
||||
}
|
||||
|
||||
pub fn new_reverse(
|
||||
/// Create a `FacetIter` that will iterate on the different facet entries in reverse
|
||||
/// (facet value + documents ids) and that will reduce the given documents ids
|
||||
/// while iterating on the different facet levels.
|
||||
pub fn new_reverse_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
@ -176,7 +186,26 @@ where
|
||||
let db = index.facet_field_id_value_docids.remap_key_type::<KC>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Right(highest_iter))] })
|
||||
let level_iters = vec![(documents_ids, Right(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
||||
}
|
||||
|
||||
/// Create a `FacetIter` that will iterate on the different facet entries
|
||||
/// (facet value + documents ids) and that will not reduce the given documents ids
|
||||
/// while iterating on the different facet levels, possibly returning multiple times
|
||||
/// a document id associated with multiple facet values.
|
||||
pub fn new_non_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetIter<'t, T, KC>>
|
||||
{
|
||||
let db = index.facet_field_id_value_docids.remap_key_type::<KC>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Left(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false })
|
||||
}
|
||||
|
||||
fn highest_level<X>(rtxn: &'t heed::RoTxn, db: Database<KC, X>, fid: FieldId) -> heed::Result<Option<u8>> {
|
||||
@ -214,7 +243,9 @@ where
|
||||
|
||||
docids.intersect_with(&documents_ids);
|
||||
if !docids.is_empty() {
|
||||
documents_ids.difference_with(&docids);
|
||||
if self.must_reduce {
|
||||
documents_ids.difference_with(&docids);
|
||||
}
|
||||
|
||||
if level == 0 {
|
||||
debug!("found {:?} at {:?}", docids, left);
|
||||
|
@ -20,7 +20,7 @@ use crate::mdfs::Mdfs;
|
||||
use crate::query_tokens::{query_tokens, QueryToken};
|
||||
use crate::{Index, FieldId, DocumentId, Criterion};
|
||||
|
||||
pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator};
|
||||
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
|
||||
pub use self::facet::{FacetIter};
|
||||
|
||||
// Building these factories is not free.
|
||||
@ -189,9 +189,9 @@ impl<'a> Search<'a> {
|
||||
}
|
||||
} else {
|
||||
let facet_fn = if ascending {
|
||||
FacetIter::<f64, FacetLevelValueF64Codec>::new
|
||||
FacetIter::<f64, FacetLevelValueF64Codec>::new_reducing
|
||||
} else {
|
||||
FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse
|
||||
FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse_reducing
|
||||
};
|
||||
let mut limit_tmp = limit;
|
||||
let mut output = Vec::new();
|
||||
@ -226,9 +226,9 @@ impl<'a> Search<'a> {
|
||||
}
|
||||
} else {
|
||||
let facet_fn = if ascending {
|
||||
FacetIter::<i64, FacetLevelValueI64Codec>::new
|
||||
FacetIter::<i64, FacetLevelValueI64Codec>::new_reducing
|
||||
} else {
|
||||
FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse
|
||||
FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse_reducing
|
||||
};
|
||||
let mut limit_tmp = limit;
|
||||
let mut output = Vec::new();
|
||||
@ -313,22 +313,26 @@ impl<'a> Search<'a> {
|
||||
// there are some facet conditions we return a placeholder.
|
||||
let documents_ids = match order_by_facet {
|
||||
Some((fid, ftype, is_ascending)) => {
|
||||
self.facet_ordered(fid, ftype, is_ascending, facet_candidates, limit)?
|
||||
self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
|
||||
},
|
||||
None => facet_candidates.iter().take(limit).collect(),
|
||||
};
|
||||
return Ok(SearchResult { documents_ids, ..Default::default() })
|
||||
return Ok(SearchResult {
|
||||
documents_ids,
|
||||
candidates: facet_candidates,
|
||||
..Default::default()
|
||||
})
|
||||
},
|
||||
(None, None) => {
|
||||
// If the query is not set or results in no DFAs we return a placeholder.
|
||||
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
||||
let all_docids = self.index.documents_ids(self.rtxn)?;
|
||||
let documents_ids = match order_by_facet {
|
||||
Some((fid, ftype, is_ascending)) => {
|
||||
self.facet_ordered(fid, ftype, is_ascending, documents_ids, limit)?
|
||||
self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)?
|
||||
},
|
||||
None => documents_ids.iter().take(limit).collect(),
|
||||
None => all_docids.iter().take(limit).collect(),
|
||||
};
|
||||
return Ok(SearchResult { documents_ids, ..Default::default() })
|
||||
return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() })
|
||||
},
|
||||
};
|
||||
|
||||
@ -336,7 +340,7 @@ impl<'a> Search<'a> {
|
||||
|
||||
// The mana depth first search is a revised DFS that explores
|
||||
// solutions in the order of their proximities.
|
||||
let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates);
|
||||
let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone());
|
||||
let mut documents = Vec::new();
|
||||
|
||||
// We execute the Mdfs iterator until we find enough documents.
|
||||
@ -364,7 +368,7 @@ impl<'a> Search<'a> {
|
||||
None => documents.into_iter().flatten().take(limit).collect(),
|
||||
};
|
||||
|
||||
Ok(SearchResult { found_words, documents_ids })
|
||||
Ok(SearchResult { found_words, candidates, documents_ids })
|
||||
}
|
||||
}
|
||||
|
||||
@ -383,6 +387,7 @@ impl fmt::Debug for Search<'_> {
|
||||
#[derive(Default)]
|
||||
pub struct SearchResult {
|
||||
pub found_words: HashSet<String>,
|
||||
pub candidates: RoaringBitmap,
|
||||
// TODO those documents ids should be associated with their criteria scores.
|
||||
pub documents_ids: Vec<DocumentId>,
|
||||
}
|
||||
|
@ -29,6 +29,10 @@ pub struct Opt {
|
||||
|
||||
/// The query string to search for (doesn't support prefix search yet).
|
||||
query: Option<String>,
|
||||
|
||||
/// Compute and print the facet distribution of all the faceted fields.
|
||||
#[structopt(long)]
|
||||
print_facet_distribution: bool,
|
||||
}
|
||||
|
||||
pub fn run(opt: Opt) -> anyhow::Result<()> {
|
||||
@ -71,6 +75,12 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
|
||||
let _ = writeln!(&mut stdout);
|
||||
}
|
||||
|
||||
if opt.print_facet_distribution {
|
||||
let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?;
|
||||
serde_json::to_writer(&mut stdout, &facets)?;
|
||||
let _ = writeln!(&mut stdout);
|
||||
}
|
||||
|
||||
debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user