Merge pull request #66 from meilisearch/show-available-facets

Expose an API to compute facets distribution
This commit is contained in:
Clément Renault 2021-02-01 18:39:45 +01:00 committed by GitHub
commit fa0cc2dc13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 982 additions and 354 deletions

4
Cargo.lock generated
View File

@ -1211,9 +1211,9 @@ checksum = "21215c1b9d8f7832b433255bd9eea3e2779aa55b21b2f8e13aad62c74749b237"
[[package]] [[package]]
name = "roaring" name = "roaring"
version = "0.6.3" version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f12bdbc3b9b2fd12148ee9f97f9e36438f1e84d3ce47fec0ad6b4bfbb62b3a35" checksum = "4d60b41c8f25d07cecab125cb46ebbf234fc055effc61ca2392a3ef4f9422304"
dependencies = [ dependencies = [
"byteorder", "byteorder",
] ]

View File

@ -31,7 +31,7 @@ ordered-float = "2.0.0"
rayon = "1.3.1" rayon = "1.3.1"
regex = "1.4.2" regex = "1.4.2"
ringtail = "0.3.0" ringtail = "0.3.0"
roaring = "0.6.1" roaring = "0.6.4"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = { version = "1.0.59", features = ["preserve_order"] } serde_json = { version = "1.0.59", features = ["preserve_order"] }
slice-group-by = "0.2.6" slice-group-by = "0.2.6"

627
http-ui/Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -22,6 +22,7 @@ tempfile = "3.1.0"
askama = "0.10.1" askama = "0.10.1"
askama_warp = "0.10.0" askama_warp = "0.10.0"
bytes = "0.5.6" bytes = "0.5.6"
either = "1.6.1"
flate2 = "1.0.19" flate2 = "1.0.19"
futures = "0.3.6" futures = "0.3.6"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }

View File

@ -1,9 +1,9 @@
var request = null; var request = null;
var timeoutID = null; var timeoutID = null;
$('#query, #facet').on('input', function () { $('#query, #filters').on('input', function () {
var query = $('#query').val(); var query = $('#query').val();
var facet = $('#facet').val(); var filters = $('#filters').val();
var timeoutMs = 100; var timeoutMs = 100;
if (timeoutID !== null) { if (timeoutID !== null) {
@ -15,18 +15,35 @@ $('#query, #facet').on('input', function () {
type: "POST", type: "POST",
url: "query", url: "query",
contentType: 'application/json', contentType: 'application/json',
data: JSON.stringify({ 'query': query, 'facetCondition': facet }), data: JSON.stringify({
'query': query,
'filters': filters,
"facetDistribution": true,
}),
contentType: 'application/json', contentType: 'application/json',
success: function (data, textStatus, request) { success: function (data, textStatus, request) {
results.innerHTML = ''; results.innerHTML = '';
facets.innerHTML = '';
let timeSpent = request.getResponseHeader('Time-Ms'); let timeSpent = request.getResponseHeader('Time-Ms');
let numberOfDocuments = data.length; let numberOfDocuments = data.documents.length;
count.innerHTML = `${numberOfDocuments}`; count.innerHTML = data.numberOfCandidates.toLocaleString();
time.innerHTML = `${timeSpent}ms`; time.innerHTML = `${timeSpent}ms`;
time.classList.remove('fade-in-out'); time.classList.remove('fade-in-out');
for (element of data) { for (facet_name in data.facets) {
for (value in data.facets[facet_name]) {
const elem = document.createElement('span');
const count = data.facets[facet_name][value];
elem.classList.add("tag");
elem.setAttribute('data-name', facet_name);
elem.setAttribute('data-value', value);
elem.innerHTML = `${facet_name}:${value} (${count})`;
facets.appendChild(elem);
}
}
for (element of data.documents) {
const elem = document.createElement('li'); const elem = document.createElement('li');
elem.classList.add("document"); elem.classList.add("document");
@ -54,6 +71,19 @@ $('#query, #facet').on('input', function () {
results.appendChild(elem); results.appendChild(elem);
} }
// When we click on a tag we append the facet value
// at the end of the facet query.
$('#facets .tag').on('click', function () {
let name = $(this).attr("data-name");
let value = $(this).attr("data-value");
let facet_query = $('#filters').val().trim();
if (facet_query === "") {
$('#filters').val(`${name} = "${value}"`).trigger('input');
} else {
$('#filters').val(`${facet_query} AND ${name} = "${value}"`).trigger('input');
}
});
}, },
beforeSend: function () { beforeSend: function () {
if (request !== null) { if (request !== null) {
@ -65,6 +95,25 @@ $('#query, #facet').on('input', function () {
}, timeoutMs); }, timeoutMs);
}); });
// Computes the symmetric difference of two arrays: the elements that
// are present in exactly one of `arr1` and `arr2`.
//
// Fix: the previous implementation returned `val` from the filter
// callback, so falsy elements (0, '', false, null) belonging to the
// symmetric difference were silently dropped — `filter` keeps an
// element only when the callback returns a truthy value. Returning an
// explicit boolean keeps falsy elements too.
function diffArray(arr1, arr2) {
    return arr1.concat(arr2).filter(function (val) {
        return !(arr1.includes(val) && arr2.includes(val));
    });
}
// Flattens a `{ facetName: [values...] }` object into an array of
// arrays of `"facetName:value"` strings, one inner array per facet name.
function selectedFacetsToArray(facets_obj) {
    var result = [];
    for (const name in facets_obj) {
        const entries = [];
        for (const value of facets_obj[name]) {
            entries.push(`${name}:${value}`);
        }
        result.push(entries);
    }
    return result;
}
// Make the number of document a little bit prettier // Make the number of document a little bit prettier
$('#docs-count').text(function(index, text) { $('#docs-count').text(function(index, text) {
return parseInt(text).toLocaleString() return parseInt(text).toLocaleString()
@ -75,8 +124,8 @@ $('#db-size').text(function(index, text) {
return filesize(parseInt(text)) return filesize(parseInt(text))
}); });
// We trigger the input when we load the script, this way // We trigger the input when we load the script.
// we execute a placeholder search when the input is empty.
$(window).on('load', function () { $(window).on('load', function () {
// We execute a placeholder search when the input is empty.
$('#query').trigger('input'); $('#query').trigger('input');
}); });

View File

@ -4,6 +4,23 @@
padding: 0; padding: 0;
} }
#facets .tag {
margin-right: 1em;
margin-bottom: 1em;
}
#facets {
max-width: 900px;
margin: 20px auto 0 auto;
padding: 0;
max-height: 16em;
overflow: scroll;
}
#facets .tag:hover {
cursor: pointer;
}
#logo-white { #logo-white {
display: none; display: none;
} }

View File

@ -1,4 +1,4 @@
use std::collections::{HashMap, HashSet}; use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::Display; use std::fmt::Display;
use std::fs::{File, create_dir_all}; use std::fs::{File, create_dir_all};
use std::net::SocketAddr; use std::net::SocketAddr;
@ -11,6 +11,7 @@ use std::{mem, io};
use askama_warp::Template; use askama_warp::Template;
use byte_unit::Byte; use byte_unit::Byte;
use either::Either;
use flate2::read::GzDecoder; use flate2::read::GzDecoder;
use futures::stream; use futures::stream;
use futures::{FutureExt, StreamExt}; use futures::{FutureExt, StreamExt};
@ -28,6 +29,7 @@ use warp::filters::ws::Message;
use warp::{Filter, http::Response}; use warp::{Filter, http::Response};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use milli::facet::FacetValue;
use milli::update::UpdateIndexingStep::*; use milli::update::UpdateIndexingStep::*;
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
@ -620,12 +622,38 @@ async fn main() -> anyhow::Result<()> {
.body(include_str!("../public/logo-black.svg")) .body(include_str!("../public/logo-black.svg"))
); );
// A local mirror of `either::Either` that deserializes untagged, so a JSON
// facet-filter entry may be either a string or an array without a tag.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum UntaggedEither<L, R> {
    Left(L),
    Right(R),
}

// Convert into the real `either::Either`, which is what
// `FacetCondition::from_array` consumes.
impl<L, R> From<UntaggedEither<L, R>> for Either<L, R> {
    fn from(value: UntaggedEither<L, R>) -> Either<L, R> {
        match value {
            UntaggedEither::Left(left) => Either::Left(left),
            UntaggedEither::Right(right) => Either::Right(right),
        }
    }
}
#[derive(Debug, Deserialize)] #[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
struct QueryBody { struct QueryBody {
query: Option<String>, query: Option<String>,
facet_condition: Option<String>, filters: Option<String>,
facet_filters: Option<Vec<UntaggedEither<Vec<String>, String>>>,
facet_distribution: Option<bool>,
}
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
struct Answer {
documents: Vec<Map<String, Value>>,
number_of_candidates: u64,
facets: BTreeMap<String, BTreeMap<FacetValue, u64>>,
} }
let disable_highlighting = opt.disable_highlighting; let disable_highlighting = opt.disable_highlighting;
@ -642,14 +670,42 @@ async fn main() -> anyhow::Result<()> {
if let Some(query) = query.query { if let Some(query) = query.query {
search.query(query); search.query(query);
} }
if let Some(condition) = query.facet_condition {
if !condition.trim().is_empty() { let filters = match query.filters {
let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap(); Some(condition) if !condition.trim().is_empty() => {
Some(FacetCondition::from_str(&rtxn, &index, &condition).unwrap())
},
_otherwise => None,
};
let facet_filters = match query.facet_filters {
Some(array) => {
let eithers = array.into_iter().map(Into::into);
FacetCondition::from_array(&rtxn, &index, eithers).unwrap()
},
_otherwise => None,
};
let condition = match (filters, facet_filters) {
(Some(filters), Some(facet_filters)) => {
Some(FacetCondition::And(Box::new(filters), Box::new(facet_filters)))
},
(Some(condition), None) | (None, Some(condition)) => Some(condition),
_otherwise => None,
};
if let Some(condition) = condition {
search.facet_condition(condition); search.facet_condition(condition);
} }
}
let SearchResult { found_words, documents_ids } = search.execute().unwrap(); let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap();
let number_of_candidates = candidates.len();
let facets = if query.facet_distribution == Some(true) {
Some(index.facets_distribution(&rtxn).candidates(candidates).execute().unwrap())
} else {
None
};
let mut documents = Vec::new(); let mut documents = Vec::new();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
@ -674,10 +730,16 @@ async fn main() -> anyhow::Result<()> {
documents.push(object); documents.push(object);
} }
let answer = Answer {
documents,
number_of_candidates,
facets: facets.unwrap_or_default(),
};
Response::builder() Response::builder()
.header("Content-Type", "application/json") .header("Content-Type", "application/json")
.header("Time-Ms", before_search.elapsed().as_millis().to_string()) .header("Time-Ms", before_search.elapsed().as_millis().to_string())
.body(serde_json::to_string(&documents).unwrap()) .body(serde_json::to_string(&answer).unwrap())
}); });
let index_cloned = index.clone(); let index_cloned = index.clone();

View File

@ -56,7 +56,7 @@
<div class="level-item"> <div class="level-item">
<div class="field has-addons has-addons-right"> <div class="field has-addons has-addons-right">
<input id="query" class="input" type="text" autofocus placeholder="e.g. George Clooney"> <input id="query" class="input" type="text" autofocus placeholder="e.g. George Clooney">
<input id="facet" class="input" type="text" placeholder="facet filter like released >= 1577836800"> <input id="filters" class="input" type="text" placeholder="filters like released >= 1577836800">
</div> </div>
</div> </div>
<div class="level-item"></div> <div class="level-item"></div>
@ -66,7 +66,7 @@
<nav class="level-right"> <nav class="level-right">
<div class="level-item has-text-centered"> <div class="level-item has-text-centered">
<div> <div>
<p class="heading">Documents</p> <p class="heading">Candidates</p>
<p id="count" class="title">0</p> <p id="count" class="title">0</p>
</div> </div>
</div> </div>
@ -84,6 +84,10 @@
</div> </div>
</section> </section>
<section id="facets">
<!-- facet values -->
</section>
<section> <section>
<ol id="results" class="content"> <ol id="results" class="content">
<!-- documents matching requests --> <!-- documents matching requests -->

60
src/facet/facet_value.rs Normal file
View File

@ -0,0 +1,60 @@
use ordered_float::OrderedFloat;
use serde::{Serialize, Serializer};
/// A single facet value: either a string, a float (wrapped in `OrderedFloat`
/// so the type can derive `Ord`, `Eq` and `Hash`) or a signed integer.
#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub enum FacetValue {
    String(String),
    Float(OrderedFloat<f64>),
    Integer(i64),
}
// Conversions from the raw value types found in the index into `FacetValue`.
impl From<String> for FacetValue {
    fn from(s: String) -> FacetValue {
        Self::String(s)
    }
}

impl From<&str> for FacetValue {
    fn from(s: &str) -> FacetValue {
        Self::String(s.to_owned())
    }
}

impl From<f64> for FacetValue {
    fn from(f: f64) -> FacetValue {
        Self::Float(OrderedFloat(f))
    }
}

impl From<OrderedFloat<f64>> for FacetValue {
    fn from(f: OrderedFloat<f64>) -> FacetValue {
        Self::Float(f)
    }
}

impl From<i64> for FacetValue {
    fn from(i: i64) -> FacetValue {
        Self::Integer(i)
    }
}
/// `Serialize` is implemented by hand because every variant must be emitted
/// as a string: JSON object keys can only be strings, never numbers.
impl Serialize for FacetValue {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        match self {
            FacetValue::String(string) => serializer.serialize_str(string),
            FacetValue::Float(float) => serializer.serialize_str(&float.to_string()),
            FacetValue::Integer(integer) => serializer.serialize_str(&integer.to_string()),
        }
    }
}

View File

@ -1,4 +1,6 @@
mod facet_type; mod facet_type;
mod facet_value;
pub mod value_encoding; pub mod value_encoding;
pub use self::facet_type::FacetType; pub use self::facet_type::FacetType;
pub use self::facet_value::FacetValue;

View File

@ -9,7 +9,7 @@ use roaring::RoaringBitmap;
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap; use crate::fields_ids_map::FieldsIdsMap;
use crate::{default_criteria, Criterion, Search}; use crate::{default_criteria, Criterion, Search, FacetDistribution};
use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds}; use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
use crate::{ use crate::{
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
@ -351,6 +351,10 @@ impl Index {
Ok(self.documents_ids(rtxn).map(|docids| docids.len() as usize)?) Ok(self.documents_ids(rtxn).map(|docids| docids.len() as usize)?)
} }
/// Returns a `FacetDistribution` builder used to compute, for each facet,
/// how many documents carry each facet value.
pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
    FacetDistribution::new(rtxn, self)
}
pub fn search<'a>(&'a self, rtxn: &'a RoTxn) -> Search<'a> { pub fn search<'a>(&'a self, rtxn: &'a RoTxn) -> Search<'a> {
Search::new(rtxn, self) Search::new(rtxn, self)
} }

View File

@ -28,7 +28,7 @@ pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::index::Index; pub use self::index::Index;
pub use self::search::{Search, FacetCondition, SearchResult}; pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
pub use self::update_store::UpdateStore; pub use self::update_store::UpdateStore;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;

View File

@ -3,6 +3,8 @@ use std::fmt::Debug;
use std::ops::Bound::{self, Included, Excluded}; use std::ops::Bound::{self, Included, Excluded};
use std::str::FromStr; use std::str::FromStr;
use anyhow::Context;
use either::Either;
use heed::types::{ByteSlice, DecodeIgnore}; use heed::types::{ByteSlice, DecodeIgnore};
use log::debug; use log::debug;
use num_traits::Bounded; use num_traits::Bounded;
@ -141,6 +143,85 @@ where T: FromStr,
} }
impl FacetCondition { impl FacetCondition {
/// Builds a `FacetCondition` from an array of `key:value` facet-filter rules.
///
/// Top-level entries are combined with `And`; an `Either::Left` entry is a
/// group of rules combined with `Or`, an `Either::Right` entry is a single
/// rule. Returns `Ok(None)` when the array is empty.
///
/// # Errors
/// Fails when a rule is missing its `:` separator, references an unknown or
/// non-faceted field, or its value cannot be parsed as the field's type.
pub fn from_array<I, J, A, B>(
    rtxn: &heed::RoTxn,
    index: &Index,
    array: I,
) -> anyhow::Result<Option<FacetCondition>>
where I: IntoIterator<Item=Either<J, B>>,
      J: IntoIterator<Item=A>,
      A: AsRef<str>,
      B: AsRef<str>,
{
    // Turns one `key`/`value` pair into an equality operator on the field,
    // negated when the value starts with `-`.
    fn facet_condition(
        fields_ids_map: &FieldsIdsMap,
        faceted_fields: &HashMap<String, FacetType>,
        key: &str,
        value: &str,
    ) -> anyhow::Result<FacetCondition>
    {
        let fid = fields_ids_map.id(key).with_context(|| {
            format!("{:?} isn't present in the fields ids map", key)
        })?;
        let ftype = faceted_fields.get(key).copied().with_context(|| {
            format!("{:?} isn't a faceted field", key)
        })?;
        // A leading `-` on the (trimmed) value negates the condition.
        let (neg, value) = match value.trim().strip_prefix('-') {
            Some(value) => (true, value.trim()),
            None => (false, value.trim()),
        };
        let operator = match ftype {
            FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)),
            FacetType::Float => OperatorF64(fid, FacetNumberOperator::Equal(value.parse()?)),
            FacetType::Integer => OperatorI64(fid, FacetNumberOperator::Equal(value.parse()?)),
        };
        if neg { Ok(operator.negate()) } else { Ok(operator) }
    }

    let fields_ids_map = index.fields_ids_map(rtxn)?;
    let faceted_fields = index.faceted_fields(rtxn)?;
    // Left-fold of the conditions seen so far, `None` until the first rule.
    let mut ands = None;

    for either in array {
        match either {
            Either::Left(array) => {
                // Inner group: OR the rules together, then AND the group
                // with the accumulated condition.
                let mut ors = None;
                for rule in array {
                    // `splitn(2, ':')` keeps any further `:` inside the value.
                    let mut iter = rule.as_ref().splitn(2, ':');
                    let key = iter.next().context("missing facet condition key")?;
                    let value = iter.next().context("missing facet condition value")?;
                    let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
                    ors = match ors.take() {
                        Some(ors) => Some(Or(Box::new(ors), Box::new(condition))),
                        None => Some(condition),
                    };
                }
                if let Some(rule) = ors {
                    ands = match ands.take() {
                        Some(ands) => Some(And(Box::new(ands), Box::new(rule))),
                        None => Some(rule),
                    };
                }
            },
            Either::Right(rule) => {
                let mut iter = rule.as_ref().splitn(2, ':');
                let key = iter.next().context("missing facet condition key")?;
                let value = iter.next().context("missing facet condition value")?;
                let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
                ands = match ands.take() {
                    Some(ands) => Some(And(Box::new(ands), Box::new(condition))),
                    None => Some(condition),
                };
            }
        }
    }

    Ok(ands)
}
pub fn from_str( pub fn from_str(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
@ -641,4 +722,35 @@ mod tests {
); );
assert_eq!(condition, expected); assert_eq!(condition, expected);
} }
// Checks that `from_array` builds the same condition tree as the equivalent
// textual filter parsed by `from_str`.
#[test]
fn from_array() {
    let path = tempfile::tempdir().unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(options, &path).unwrap();

    // Declare two faceted fields: channel (string) and timestamp (integer).
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = Settings::new(&mut wtxn, &index);
    builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order
    builder.set_faceted_fields(hashmap!{
        "channel".into() => "string".into(),
        "timestamp".into() => "integer".into(),
    });
    builder.execute(|_| ()).unwrap();
    wtxn.commit().unwrap();

    // Test that the facet condition is correctly generated: a single Right
    // rule ANDed with a Left group of OR-ed rules ("-" negates a value).
    let rtxn = index.read_txn().unwrap();
    let condition = FacetCondition::from_array(
        &rtxn, &index,
        vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])],
    ).unwrap().unwrap();
    let expected = FacetCondition::from_str(
        &rtxn, &index,
        "channel = gotaga AND (timestamp = 44 OR channel != ponce)",
    ).unwrap();
    assert_eq!(condition, expected);
}
} }

View File

@ -0,0 +1,260 @@
use std::collections::{HashSet, BTreeMap};
use std::ops::Bound::Unbounded;
use std::{cmp, fmt};
use anyhow::Context;
use heed::BytesDecode;
use roaring::RoaringBitmap;
use crate::facet::{FacetType, FacetValue};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
use crate::search::facet::{FacetIter, FacetRange};
use crate::{Index, FieldId, DocumentId};
/// The default number of values by facet that will
/// be fetched from the key-value store.
const DEFAULT_VALUES_BY_FACET: usize = 100;

/// The hard limit on the number of values by facet that will be fetched from
/// the key-value store. Searching for more values could slow down the engine.
const MAX_VALUES_BY_FACET: usize = 1000;

/// Threshold on the number of candidates that makes the system
/// choose between one counting algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 1000;
/// A builder that computes, for a set of documents, how many documents
/// carry each facet value of each faceted field.
pub struct FacetDistribution<'a> {
    // Restrict the distribution to these field names; all faceted fields when `None`.
    facets: Option<HashSet<String>>,
    // Only count the facet values of these documents; the whole database when `None`.
    candidates: Option<RoaringBitmap>,
    // Cap on the number of distinct values returned per facet.
    max_values_by_facet: usize,
    rtxn: &'a heed::RoTxn<'a>,
    index: &'a Index,
}
impl<'a> FacetDistribution<'a> {
    /// Creates a builder with no facet restriction, no candidates restriction
    /// and the default number of values returned per facet.
    pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> {
        FacetDistribution {
            facets: None,
            candidates: None,
            max_values_by_facet: DEFAULT_VALUES_BY_FACET,
            rtxn,
            index,
        }
    }

    /// Restricts the distribution to the given facet names.
    pub fn facets<I: IntoIterator<Item=A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self {
        self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect());
        self
    }

    /// Restricts the counting to the given candidate documents.
    pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
        self.candidates = Some(candidates);
        self
    }

    /// Sets the maximum number of values returned per facet,
    /// capped at `MAX_VALUES_BY_FACET`.
    pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self {
        self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET);
        self
    }

    /// There is a small amount of candidates OR we ask for facet string values so we
    /// decide to iterate over the facet values of each one of them, one by one.
    fn facet_values_from_documents(
        &self,
        field_id: FieldId,
        facet_type: FacetType,
        candidates: &RoaringBitmap,
    ) -> heed::Result<BTreeMap<FacetValue, u64>>
    {
        // Counts, for at most `CANDIDATES_THRESHOLD` candidates, every facet
        // value stored for this field, by scanning the per-document facet
        // values database with a `field_id ++ docid` key prefix.
        fn fetch_facet_values<'t, KC, K: 't>(
            index: &Index,
            rtxn: &'t heed::RoTxn,
            field_id: FieldId,
            candidates: &RoaringBitmap,
        ) -> heed::Result<BTreeMap<FacetValue, u64>>
        where
            KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
            K: Into<FacetValue>,
        {
            let mut facet_values = BTreeMap::new();
            // The key buffer always starts with the field id; the docid part
            // is rewritten (after `truncate(1)`) at each iteration.
            let mut key_buffer = vec![field_id];
            for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
                key_buffer.truncate(1);
                key_buffer.extend_from_slice(&docid.to_be_bytes());
                let iter = index.field_id_docid_facet_values
                    .prefix_iter(rtxn, &key_buffer)?
                    .remap_key_type::<KC>();
                for result in iter {
                    let ((_, _, value), ()) = result?;
                    *facet_values.entry(value.into()).or_insert(0) += 1;
                }
            }
            Ok(facet_values)
        }

        let index = self.index;
        let rtxn = self.rtxn;
        // Pick the codec matching the facet type; the counting logic is shared.
        match facet_type {
            FacetType::String => {
                fetch_facet_values::<FieldDocIdFacetStringCodec, _>(index, rtxn, field_id, candidates)
            },
            FacetType::Float => {
                fetch_facet_values::<FieldDocIdFacetF64Codec, _>(index, rtxn, field_id, candidates)
            },
            FacetType::Integer => {
                fetch_facet_values::<FieldDocIdFacetI64Codec, _>(index, rtxn, field_id, candidates)
            },
        }
    }

    /// There are too many documents, we use the facet levels to move through
    /// the facet values, to find the candidates and values associated.
    fn facet_values_from_facet_levels(
        &self,
        field_id: FieldId,
        facet_type: FacetType,
        candidates: &RoaringBitmap,
    ) -> heed::Result<BTreeMap<FacetValue, u64>>
    {
        let iter = match facet_type {
            // Strings are always handled by `facet_values_from_documents`
            // (see the dispatch in `facet_values`).
            FacetType::String => unreachable!(),
            FacetType::Float => {
                let iter = FacetIter::<f64, FacetLevelValueF64Codec>::new_non_reducing(
                    self.rtxn, self.index, field_id, candidates.clone(),
                )?;
                let iter = iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)));
                Box::new(iter) as Box::<dyn Iterator<Item=_>>
            },
            FacetType::Integer => {
                let iter = FacetIter::<i64, FacetLevelValueI64Codec>::new_non_reducing(
                    self.rtxn, self.index, field_id, candidates.clone(),
                )?;
                Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))))
            },
        };

        let mut facet_values = BTreeMap::new();
        for result in iter {
            let (value, mut docids) = result?;
            // Only count the documents that are part of the candidates.
            docids.intersect_with(candidates);
            if !docids.is_empty() {
                facet_values.insert(value, docids.len());
            }
            // Stop as soon as the per-facet value limit is reached.
            if facet_values.len() == self.max_values_by_facet {
                break;
            }
        }

        Ok(facet_values)
    }

    /// Placeholder search, a.k.a. no candidates were specified. We iterate through the
    /// facet values one by one and iterate on the facet level 0 for numbers.
    fn facet_values_from_raw_facet_database(
        &self,
        field_id: FieldId,
        facet_type: FacetType,
    ) -> heed::Result<BTreeMap<FacetValue, u64>>
    {
        let db = self.index.facet_field_id_value_docids;
        // Numbers are read from level 0 only, which holds the raw values.
        let level = 0;
        let iter = match facet_type {
            FacetType::String => {
                let iter = db
                    .prefix_iter(self.rtxn, &[field_id])?
                    .remap_key_type::<FacetValueStringCodec>()
                    .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids)));
                Box::new(iter) as Box::<dyn Iterator<Item=_>>
            },
            FacetType::Float => {
                let db = db.remap_key_type::<FacetLevelValueF64Codec>();
                let range = FacetRange::<f64, _>::new(
                    self.rtxn, db, field_id, level, Unbounded, Unbounded,
                )?;
                Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids))))
            },
            FacetType::Integer => {
                let db = db.remap_key_type::<FacetLevelValueI64Codec>();
                let range = FacetRange::<i64, _>::new(
                    self.rtxn, db, field_id, level, Unbounded, Unbounded,
                )?;
                Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids))))
            },
        };

        let mut facet_values = BTreeMap::new();
        for result in iter {
            let (value, docids) = result?;
            facet_values.insert(value, docids.len());
            // Stop as soon as the per-facet value limit is reached.
            if facet_values.len() == self.max_values_by_facet {
                break;
            }
        }

        Ok(facet_values)
    }

    // Dispatches to the counting strategy best suited to the number of
    // candidates and the facet type.
    fn facet_values(
        &self,
        field_id: FieldId,
        facet_type: FacetType,
    ) -> heed::Result<BTreeMap<FacetValue, u64>>
    {
        if let Some(candidates) = self.candidates.as_ref() {
            // Classic search, candidates were specified, we must return facet values only related
            // to those candidates. We also enter here for facet strings for performance reasons.
            if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String {
                self.facet_values_from_documents(field_id, facet_type, candidates)
            } else {
                self.facet_values_from_facet_levels(field_id, facet_type, candidates)
            }
        } else {
            self.facet_values_from_raw_facet_database(field_id, facet_type)
        }
    }

    /// Computes the facet distribution: for each requested (or every) faceted
    /// field, the number of documents associated with each facet value.
    pub fn execute(&self) -> anyhow::Result<BTreeMap<String, BTreeMap<FacetValue, u64>>> {
        let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
        let faceted_fields = self.index.faceted_fields(self.rtxn)?;
        // Keep only the requested facet names that are actually faceted;
        // unknown requested names are silently ignored.
        let fields_ids: Vec<_> = match &self.facets {
            Some(names) => names
                .iter()
                .filter_map(|n| faceted_fields.get(n).map(|t| (n.to_string(), *t)))
                .collect(),
            None => faceted_fields.into_iter().collect(),
        };

        let mut facets_values = BTreeMap::new();
        for (name, ftype) in fields_ids {
            let fid = fields_ids_map.id(&name).with_context(|| {
                format!("missing field name {:?} from the fields id map", name)
            })?;
            let values = self.facet_values(fid, ftype)?;
            facets_values.insert(name, values);
        }

        Ok(facets_values)
    }
}
// Hand-written because `rtxn` and `index` do not implement `Debug`.
impl fmt::Debug for FacetDistribution<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Exhaustive destructuring: adding a field to the struct without
        // updating this impl becomes a compile error.
        let FacetDistribution {
            facets,
            candidates,
            max_values_by_facet,
            rtxn: _,
            index: _,
        } = self;

        f.debug_struct("FacetDistribution")
            .field("facets", facets)
            .field("candidates", candidates)
            .field("max_values_by_facet", max_values_by_facet)
            .finish()
    }
}

View File

@ -13,11 +13,13 @@ use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{Index, FieldId}; use crate::{Index, FieldId};
pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator}; pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator};
pub use self::facet_distribution::FacetDistribution;
mod facet_condition; mod facet_condition;
mod facet_distribution;
mod parser; mod parser;
struct FacetRange<'t, T: 't, KC> { pub struct FacetRange<'t, T: 't, KC> {
iter: RoRange<'t, KC, LazyDecode<CboRoaringBitmapCodec>>, iter: RoRange<'t, KC, LazyDecode<CboRoaringBitmapCodec>>,
end: Bound<T>, end: Bound<T>,
} }
@ -27,7 +29,7 @@ where
KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>,
T: PartialOrd + Copy + Bounded, T: PartialOrd + Copy + Bounded,
{ {
fn new( pub fn new(
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
db: Database<KC, CboRoaringBitmapCodec>, db: Database<KC, CboRoaringBitmapCodec>,
field_id: FieldId, field_id: FieldId,
@ -78,7 +80,7 @@ where
} }
} }
struct FacetRevRange<'t, T: 't, KC> { pub struct FacetRevRange<'t, T: 't, KC> {
iter: RoRevRange<'t, KC, LazyDecode<CboRoaringBitmapCodec>>, iter: RoRevRange<'t, KC, LazyDecode<CboRoaringBitmapCodec>>,
end: Bound<T>, end: Bound<T>,
} }
@ -88,7 +90,7 @@ where
KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>,
T: PartialOrd + Copy + Bounded, T: PartialOrd + Copy + Bounded,
{ {
fn new( pub fn new(
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
db: Database<KC, CboRoaringBitmapCodec>, db: Database<KC, CboRoaringBitmapCodec>,
field_id: FieldId, field_id: FieldId,
@ -145,6 +147,7 @@ pub struct FacetIter<'t, T: 't, KC> {
db: Database<KC, CboRoaringBitmapCodec>, db: Database<KC, CboRoaringBitmapCodec>,
field_id: FieldId, field_id: FieldId,
level_iters: Vec<(RoaringBitmap, Either<FacetRange<'t, T, KC>, FacetRevRange<'t, T, KC>>)>, level_iters: Vec<(RoaringBitmap, Either<FacetRange<'t, T, KC>, FacetRevRange<'t, T, KC>>)>,
must_reduce: bool,
} }
impl<'t, T, KC> FacetIter<'t, T, KC> impl<'t, T, KC> FacetIter<'t, T, KC>
@ -153,7 +156,10 @@ where
KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>,
T: PartialOrd + Copy + Bounded, T: PartialOrd + Copy + Bounded,
{ {
pub fn new( /// Create a `FacetIter` that will iterate on the different facet entries
/// (facet value + documents ids) and that will reduce the given documents ids
/// while iterating on the different facet levels.
pub fn new_reducing(
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
index: &'t Index, index: &'t Index,
field_id: FieldId, field_id: FieldId,
@ -163,10 +169,14 @@ where
let db = index.facet_field_id_value_docids.remap_key_type::<KC>(); let db = index.facet_field_id_value_docids.remap_key_type::<KC>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Left(highest_iter))] }) let level_iters = vec![(documents_ids, Left(highest_iter))];
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
} }
pub fn new_reverse( /// Create a `FacetIter` that will iterate on the different facet entries in reverse
/// (facet value + documents ids) and that will reduce the given documents ids
/// while iterating on the different facet levels.
pub fn new_reverse_reducing(
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
index: &'t Index, index: &'t Index,
field_id: FieldId, field_id: FieldId,
@ -176,7 +186,26 @@ where
let db = index.facet_field_id_value_docids.remap_key_type::<KC>(); let db = index.facet_field_id_value_docids.remap_key_type::<KC>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Right(highest_iter))] }) let level_iters = vec![(documents_ids, Right(highest_iter))];
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
}
/// Create a `FacetIter` that will iterate on the different facet entries
/// (facet value + documents ids) and that will not reduce the given documents ids
/// while iterating on the different facet levels, possibly returning multiple times
/// a document id associated with multiple facet values.
pub fn new_non_reducing(
    rtxn: &'t heed::RoTxn,
    index: &'t Index,
    field_id: FieldId,
    documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t, T, KC>>
{
    let db = index.facet_field_id_value_docids.remap_key_type::<KC>();
    // Start from the highest facet level available, level 0 when the
    // field has no grouped levels.
    let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
    let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
    let level_iters = vec![(documents_ids, Left(highest_iter))];
    // `must_reduce: false` is what distinguishes this constructor from
    // `new_reducing`: found documents ids are kept for the next levels.
    Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false })
}
fn highest_level<X>(rtxn: &'t heed::RoTxn, db: Database<KC, X>, fid: FieldId) -> heed::Result<Option<u8>> { fn highest_level<X>(rtxn: &'t heed::RoTxn, db: Database<KC, X>, fid: FieldId) -> heed::Result<Option<u8>> {
@ -214,7 +243,9 @@ where
docids.intersect_with(&documents_ids); docids.intersect_with(&documents_ids);
if !docids.is_empty() { if !docids.is_empty() {
if self.must_reduce {
documents_ids.difference_with(&docids); documents_ids.difference_with(&docids);
}
if level == 0 { if level == 0 {
debug!("found {:?} at {:?}", docids, left); debug!("found {:?} at {:?}", docids, left);

View File

@ -20,7 +20,7 @@ use crate::mdfs::Mdfs;
use crate::query_tokens::{query_tokens, QueryToken}; use crate::query_tokens::{query_tokens, QueryToken};
use crate::{Index, FieldId, DocumentId, Criterion}; use crate::{Index, FieldId, DocumentId, Criterion};
pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
pub use self::facet::{FacetIter}; pub use self::facet::{FacetIter};
// Building these factories is not free. // Building these factories is not free.
@ -189,9 +189,9 @@ impl<'a> Search<'a> {
} }
} else { } else {
let facet_fn = if ascending { let facet_fn = if ascending {
FacetIter::<f64, FacetLevelValueF64Codec>::new FacetIter::<f64, FacetLevelValueF64Codec>::new_reducing
} else { } else {
FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse_reducing
}; };
let mut limit_tmp = limit; let mut limit_tmp = limit;
let mut output = Vec::new(); let mut output = Vec::new();
@ -226,9 +226,9 @@ impl<'a> Search<'a> {
} }
} else { } else {
let facet_fn = if ascending { let facet_fn = if ascending {
FacetIter::<i64, FacetLevelValueI64Codec>::new FacetIter::<i64, FacetLevelValueI64Codec>::new_reducing
} else { } else {
FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse_reducing
}; };
let mut limit_tmp = limit; let mut limit_tmp = limit;
let mut output = Vec::new(); let mut output = Vec::new();
@ -313,22 +313,26 @@ impl<'a> Search<'a> {
// there is some facet conditions we return a placeholder. // there is some facet conditions we return a placeholder.
let documents_ids = match order_by_facet { let documents_ids = match order_by_facet {
Some((fid, ftype, is_ascending)) => { Some((fid, ftype, is_ascending)) => {
self.facet_ordered(fid, ftype, is_ascending, facet_candidates, limit)? self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
}, },
None => facet_candidates.iter().take(limit).collect(), None => facet_candidates.iter().take(limit).collect(),
}; };
return Ok(SearchResult { documents_ids, ..Default::default() }) return Ok(SearchResult {
documents_ids,
candidates: facet_candidates,
..Default::default()
})
}, },
(None, None) => { (None, None) => {
// If the query is not set or results in no DFAs we return a placeholder. // If the query is not set or results in no DFAs we return a placeholder.
let documents_ids = self.index.documents_ids(self.rtxn)?; let all_docids = self.index.documents_ids(self.rtxn)?;
let documents_ids = match order_by_facet { let documents_ids = match order_by_facet {
Some((fid, ftype, is_ascending)) => { Some((fid, ftype, is_ascending)) => {
self.facet_ordered(fid, ftype, is_ascending, documents_ids, limit)? self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)?
}, },
None => documents_ids.iter().take(limit).collect(), None => all_docids.iter().take(limit).collect(),
}; };
return Ok(SearchResult { documents_ids, ..Default::default() }) return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() })
}, },
}; };
@ -336,7 +340,7 @@ impl<'a> Search<'a> {
// The mana depth first search is a revised DFS that explore // The mana depth first search is a revised DFS that explore
// solutions in the order of their proximities. // solutions in the order of their proximities.
let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates); let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone());
let mut documents = Vec::new(); let mut documents = Vec::new();
// We execute the Mdfs iterator until we find enough documents. // We execute the Mdfs iterator until we find enough documents.
@ -364,7 +368,7 @@ impl<'a> Search<'a> {
None => documents.into_iter().flatten().take(limit).collect(), None => documents.into_iter().flatten().take(limit).collect(),
}; };
Ok(SearchResult { found_words, documents_ids }) Ok(SearchResult { found_words, candidates, documents_ids })
} }
} }
@ -383,6 +387,7 @@ impl fmt::Debug for Search<'_> {
#[derive(Default)] #[derive(Default)]
pub struct SearchResult { pub struct SearchResult {
pub found_words: HashSet<String>, pub found_words: HashSet<String>,
pub candidates: RoaringBitmap,
// TODO those documents ids should be associated with their criteria scores. // TODO those documents ids should be associated with their criteria scores.
pub documents_ids: Vec<DocumentId>, pub documents_ids: Vec<DocumentId>,
} }

View File

@ -29,6 +29,10 @@ pub struct Opt {
/// The query string to search for (doesn't support prefix search yet). /// The query string to search for (doesn't support prefix search yet).
query: Option<String>, query: Option<String>,
/// Compute and print the facet distribution of all the faceted fields.
#[structopt(long)]
print_facet_distribution: bool,
} }
pub fn run(opt: Opt) -> anyhow::Result<()> { pub fn run(opt: Opt) -> anyhow::Result<()> {
@ -71,6 +75,12 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
let _ = writeln!(&mut stdout); let _ = writeln!(&mut stdout);
} }
if opt.print_facet_distribution {
let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?;
serde_json::to_writer(&mut stdout, &facets)?;
let _ = writeln!(&mut stdout);
}
debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len()); debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
} }