184: Transfer numbers and strings facets into the appropriate facet databases r=Kerollmops a=Kerollmops

This pull request is related to https://github.com/meilisearch/milli/issues/152 and changes the layout of the facet values: numbers and strings are now stored in dedicated databases, and the user no longer needs to define the type of the fields. No conversion between the two types is done anymore: numbers (floats and integers, converted to f64) go to the facet float database and strings go to the facet string database.

There is one related issue that I found regarding CSVs: the values in a CSV are always considered to be strings. [meilisearch/specifications#28](d916b57d74/text/0028-indexing-csv.md) fixes this issue by allowing the user to define the field types using `:` (see its "CSV Formatting Rules" section).

All previous tests on facets have been modified to pass again and I have also done hand-driven tests with the 115m songs dataset. Everything seems to be good!

Fixes #192.

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2021-05-31 13:32:58 +00:00 committed by GitHub
commit 2f5e61bacb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 1046 additions and 963 deletions

View file

@ -1,18 +1,15 @@
use std::collections::HashMap;
use std::mem::take;
use anyhow::{bail, Context as _};
use anyhow::Context;
use itertools::Itertools;
use log::debug;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetF64Codec;
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
use crate::search::facet::FacetIter;
use crate::search::query_tree::Operation;
use crate::{FieldsIdsMap, FieldId, Index};
use crate::{FieldId, Index};
use super::{Criterion, CriterionParameters, CriterionResult};
/// Threshold on the number of candidates that will make
@ -24,7 +21,6 @@ pub struct AscDesc<'t> {
rtxn: &'t heed::RoTxn<'t>,
field_name: String,
field_id: FieldId,
facet_type: FacetType,
ascending: bool,
query_tree: Option<Operation>,
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
@ -39,8 +35,7 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_name: String,
) -> anyhow::Result<Self>
{
) -> anyhow::Result<Self> {
Self::new(index, rtxn, parent, field_name, true)
}
@ -49,8 +44,7 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_name: String,
) -> anyhow::Result<Self>
{
) -> anyhow::Result<Self> {
Self::new(index, rtxn, parent, field_name, false)
}
@ -60,22 +54,21 @@ impl<'t> AscDesc<'t> {
parent: Box<dyn Criterion + 't>,
field_name: String,
ascending: bool,
) -> anyhow::Result<Self>
{
) -> anyhow::Result<Self> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?;
let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?;
let field_id = fields_ids_map
.id(&field_name)
.with_context(|| format!("field {:?} isn't registered", field_name))?;
Ok(AscDesc {
index,
rtxn,
field_name,
field_id,
facet_type,
ascending,
query_tree: None,
candidates: Box::new(std::iter::empty()),
faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?,
faceted_candidates: index.number_faceted_documents_ids(rtxn, field_id)?,
bucket_candidates: RoaringBitmap::new(),
parent,
})
@ -86,8 +79,10 @@ impl<'t> Criterion for AscDesc<'t> {
#[logging_timer::time("AscDesc::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
loop {
debug!("Facet {}({}) iteration",
if self.ascending { "Asc" } else { "Desc" }, self.field_name
debug!(
"Facet {}({}) iteration",
if self.ascending { "Asc" } else { "Desc" },
self.field_name
);
match self.candidates.next().transpose()? {
@ -122,7 +117,6 @@ impl<'t> Criterion for AscDesc<'t> {
self.index,
self.rtxn,
self.field_id,
self.facet_type,
self.ascending,
candidates,
)?;
@ -138,27 +132,12 @@ impl<'t> Criterion for AscDesc<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
}
}
}
}
}
fn field_id_facet_type(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<String, FacetType>,
field: &str,
) -> anyhow::Result<(FieldId, FacetType)>
{
let id = fields_ids_map.id(field).with_context(|| {
format!("field {:?} isn't registered", field)
})?;
let facet_type = faceted_fields.get(field).with_context(|| {
format!("field {:?} isn't faceted", field)
})?;
Ok((id, *facet_type))
}
/// Returns an iterator over groups of the given candidates in ascending or descending order.
///
/// It will either use an iterative or a recursive method on the whole facet database depending
@ -167,29 +146,20 @@ fn facet_ordered<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
facet_type: FacetType,
ascending: bool,
candidates: RoaringBitmap,
) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>>
{
match facet_type {
FacetType::Number => {
if candidates.len() <= CANDIDATES_THRESHOLD {
let iter = iterative_facet_ordered_iter(
index, rtxn, field_id, ascending, candidates,
)?;
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
} else {
let facet_fn = if ascending {
FacetIter::new_reducing
} else {
FacetIter::new_reverse_reducing
};
let iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
}
},
FacetType::String => bail!("criteria facet type must be a number"),
) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
if candidates.len() <= CANDIDATES_THRESHOLD {
let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
} else {
let facet_fn = if ascending {
FacetIter::new_reducing
} else {
FacetIter::new_reverse_reducing
};
let iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
}
}
@ -202,14 +172,14 @@ fn iterative_facet_ordered_iter<'t>(
field_id: FieldId,
ascending: bool,
candidates: RoaringBitmap,
) -> anyhow::Result<impl Iterator<Item = RoaringBitmap> + 't>
{
let db = index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
) -> anyhow::Result<impl Iterator<Item = RoaringBitmap> + 't> {
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() {
let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX);
let mut iter = db.range(rtxn, &(left..=right))?;
let mut iter = index
.field_id_docid_facet_f64s
.range(rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value)));
@ -226,7 +196,8 @@ fn iterative_facet_ordered_iter<'t>(
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore
// required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499
let vec: Vec<_> = iter.group_by(|(_, v)| *v)
let vec: Vec<_> = iter
.group_by(|(_, v)| v.clone())
.into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect();

View file

@ -1,10 +1,14 @@
use std::mem::size_of;
use heed::types::ByteSlice;
use roaring::RoaringBitmap;
use crate::heed_codec::facet::*;
use crate::{facet::FacetType, DocumentId, FieldId, Index};
use super::{Distinct, DocIter};
use crate::heed_codec::facet::*;
use crate::{DocumentId, FieldId, Index};
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
/// A distinct implementer that is backed by facets.
///
@ -18,21 +22,14 @@ pub struct FacetDistinct<'a> {
distinct: FieldId,
index: &'a Index,
txn: &'a heed::RoTxn<'a>,
facet_type: FacetType,
}
impl<'a> FacetDistinct<'a> {
pub fn new(
distinct: FieldId,
index: &'a Index,
txn: &'a heed::RoTxn<'a>,
facet_type: FacetType,
) -> Self {
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
Self {
distinct,
index,
txn,
facet_type,
}
}
}
@ -41,38 +38,33 @@ pub struct FacetDistinctIter<'a> {
candidates: RoaringBitmap,
distinct: FieldId,
excluded: RoaringBitmap,
facet_type: FacetType,
index: &'a Index,
iter_offset: usize,
txn: &'a heed::RoTxn<'a>,
}
impl<'a> FacetDistinctIter<'a> {
fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result<RoaringBitmap>
where
KC: heed::BytesEncode<'c>,
{
let facet_docids = self
.index
.facet_field_id_value_docids
.remap_key_type::<KC>()
.get(self.txn, key)?
.expect("Corrupted data: Facet values must exist");
Ok(facet_docids)
/// Fetches the document ids stored in `facet_id_string_docids` under the
/// `(distinct field id, key)` pair.
///
/// Returns `Ok(None)` when the database holds no entry for that pair.
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
    let db_key = (self.distinct, key);
    self.index.facet_id_string_docids.get(self.txn, &db_key)
}
/// Fetches the document ids stored in `facet_id_f64_docids` for the exact
/// number `key` of the distinct field.
///
/// Returns `Ok(None)` when the database holds no entry for that key.
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
    // Level 0 is queried with both bounds set to the looked-up number.
    let level_zero_key = (self.distinct, 0, key, key);
    self.index.facet_id_f64_docids.get(self.txn, &level_zero_key)
}
fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> {
let iter = get_facet_values::<FieldDocIdFacetStringCodec>(
id,
self.distinct,
self.index,
self.txn,
)?;
let iter = facet_string_values(id, self.distinct, self.index, self.txn)?;
for item in iter {
let ((_, _, value), _) = item?;
let key = (self.distinct, value);
let facet_docids = self.get_facet_docids::<FacetValueStringCodec>(&key)?;
let facet_docids = self
.facet_string_docids(value)?
.expect("Corrupted data: Facet values must exist");
self.excluded.union_with(&facet_docids);
}
@ -82,17 +74,13 @@ impl<'a> FacetDistinctIter<'a> {
}
fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> {
let iter = get_facet_values::<FieldDocIdFacetF64Codec>(id,
self.distinct,
self.index,
self.txn,
)?;
let iter = facet_number_values(id, self.distinct, self.index, self.txn)?;
for item in iter {
let ((_, _, value), _) = item?;
// get facet docids on level 0
let key = (self.distinct, 0, value, value);
let facet_docids = self.get_facet_docids::<FacetLevelValueF64Codec>(&key)?;
let facet_docids = self
.facet_number_docids(value)?
.expect("Corrupted data: Facet values must exist");
self.excluded.union_with(&facet_docids);
}
@ -111,16 +99,16 @@ impl<'a> FacetDistinctIter<'a> {
let mut candidates_iter = self.candidates.iter().skip(self.iter_offset);
match candidates_iter.next() {
Some(id) => {
match self.facet_type {
FacetType::String => self.distinct_string(id)?,
FacetType::Number => self.distinct_number(id)?,
};
// We distinct the document id on its facet strings and facet numbers.
self.distinct_string(id)?;
self.distinct_number(id)?;
// The first document of each iteration is kept, since the next call to
// `difference_with` will filter out all the documents for that facet value. By
// increasing the offset we make sure to get the first valid value for the next
// distinct document to keep.
self.iter_offset += 1;
Ok(Some(id))
}
// no more candidate at this offset, return.
@ -129,26 +117,44 @@ impl<'a> FacetDistinctIter<'a> {
}
}
fn get_facet_values<'a, KC>(
/// Builds the byte prefix shared by every facet value entry of one document
/// for one field: the big-endian bytes of `distinct` immediately followed by
/// the big-endian bytes of `id`.
fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
    let mut prefix = [0u8; FID_SIZE + DOCID_SIZE];
    let (fid_bytes, docid_bytes) = prefix.split_at_mut(FID_SIZE);
    fid_bytes.copy_from_slice(&distinct.to_be_bytes());
    docid_bytes.copy_from_slice(&id.to_be_bytes());
    prefix
}
fn facet_number_values<'a>(
id: DocumentId,
distinct: FieldId,
index: &Index,
txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, KC, heed::types::Unit>>
where
KC: heed::BytesDecode<'a>,
{
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
let mut key = [0; FID_SIZE + DOCID_SIZE];
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, heed::types::Unit>> {
let key = facet_values_prefix_key(distinct, id);
let iter = index
.field_id_docid_facet_values
.field_id_docid_facet_f64s
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type::<KC>();
.remap_key_type::<FieldDocIdFacetF64Codec>();
Ok(iter)
}
/// Returns a prefix iterator over the `field_id_docid_facet_strings` entries
/// whose key starts with the `(distinct, id)` prefix, decoding keys with
/// `FieldDocIdFacetStringCodec`.
fn facet_string_values<'a>(
    id: DocumentId,
    distinct: FieldId,
    index: &Index,
    txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, heed::types::Unit>> {
    let prefix = facet_values_prefix_key(distinct, id);
    // The prefix is raw bytes, so the lookup goes through a byte-slice keyed
    // view of the database; the resulting iterator is then switched back to
    // the typed key codec.
    let raw_db = index
        .field_id_docid_facet_strings
        .remap_key_type::<ByteSlice>();
    let raw_iter = raw_db.prefix_iter(txn, &prefix)?;
    Ok(raw_iter.remap_key_type::<FieldDocIdFacetStringCodec>())
}
@ -174,7 +180,6 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> {
candidates,
distinct: self.distinct,
excluded,
facet_type: self.facet_type,
index: self.index,
iter_offset: 0,
txn: self.txn,
@ -184,22 +189,21 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> {
#[cfg(test)]
mod test {
use std::collections::HashMap;
use std::collections::HashSet;
use super::*;
use super::super::test::{generate_index, validate_distinct_candidates};
use crate::facet::FacetType;
use super::*;
macro_rules! test_facet_distinct {
($name:ident, $distinct:literal, $facet_type:expr) => {
($name:ident, $distinct:literal) => {
#[test]
fn $name() {
use std::iter::FromIterator;
let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string())));
let facets = HashSet::from_iter(Some(($distinct.to_string())));
let (index, fid, candidates) = generate_index($distinct, facets);
let txn = index.read_txn().unwrap();
let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type);
let mut map_distinct = FacetDistinct::new(fid, &index, &txn);
let excluded = RoaringBitmap::new();
let mut iter = map_distinct.distinct(candidates.clone(), excluded);
let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
@ -209,7 +213,7 @@ mod test {
};
}
test_facet_distinct!(test_string, "txt", FacetType::String);
test_facet_distinct!(test_strings, "txts", FacetType::String);
test_facet_distinct!(test_number, "cat-int", FacetType::Number);
test_facet_distinct!(test_string, "txt");
test_facet_distinct!(test_strings, "txts");
test_facet_distinct!(test_number, "cat-int");
}

View file

@ -110,7 +110,7 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> {
#[cfg(test)]
mod test {
use std::collections::HashMap;
use std::collections::HashSet;
use super::*;
use super::super::test::{generate_index, validate_distinct_candidates};
@ -119,7 +119,7 @@ mod test {
($name:ident, $distinct:literal) => {
#[test]
fn $name() {
let (index, fid, candidates) = generate_index($distinct, HashMap::new());
let (index, fid, candidates) = generate_index($distinct, HashSet::new());
let txn = index.read_txn().unwrap();
let mut map_distinct = MapDistinct::new(fid, &index, &txn);
let excluded = RoaringBitmap::new();

View file

@ -28,7 +28,7 @@ pub trait Distinct<'a> {
#[cfg(test)]
mod test {
use std::collections::{HashMap, HashSet};
use std::collections::HashSet;
use once_cell::sync::Lazy;
use rand::{seq::SliceRandom, Rng};
@ -74,7 +74,7 @@ mod test {
/// Returns a temporary index populated with random test documents, the FieldId for the
/// distinct attribute, and the RoaringBitmap with the document ids.
pub(crate) fn generate_index(distinct: &str, facets: HashMap<String, String>) -> (TempIndex, FieldId, RoaringBitmap) {
pub(crate) fn generate_index(distinct: &str, facets: HashSet<String>) -> (TempIndex, FieldId, RoaringBitmap) {
let index = TempIndex::new();
let mut txn = index.write_txn().unwrap();

View file

@ -1,9 +1,8 @@
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Debug;
use std::ops::Bound::{self, Included, Excluded};
use std::str::FromStr;
use anyhow::Context;
use either::Either;
use heed::types::DecodeIgnore;
use log::debug;
@ -12,7 +11,6 @@ use pest::iterators::{Pair, Pairs};
use pest::Parser;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec};
@ -21,122 +19,96 @@ use super::parser::Rule;
use super::parser::{PREC_CLIMBER, FilterParser};
use self::FacetCondition::*;
use self::FacetNumberOperator::*;
use self::Operator::*;
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum FacetNumberOperator {
#[derive(Debug, Clone, PartialEq)]
pub enum Operator {
GreaterThan(f64),
GreaterThanOrEqual(f64),
Equal(f64),
NotEqual(f64),
Equal(Option<f64>, String),
NotEqual(Option<f64>, String),
LowerThan(f64),
LowerThanOrEqual(f64),
Between(f64, f64),
}
impl FacetNumberOperator {
impl Operator {
/// This method can return two operations in case it must express
/// an OR operation for the between case (i.e. `TO`).
fn negate(self) -> (Self, Option<Self>) {
match self {
GreaterThan(x) => (LowerThanOrEqual(x), None),
GreaterThanOrEqual(x) => (LowerThan(x), None),
Equal(x) => (NotEqual(x), None),
NotEqual(x) => (Equal(x), None),
LowerThan(x) => (GreaterThanOrEqual(x), None),
LowerThanOrEqual(x) => (GreaterThan(x), None),
Between(x, y) => (LowerThan(x), Some(GreaterThan(y))),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum FacetStringOperator {
Equal(String),
NotEqual(String),
}
impl FacetStringOperator {
fn equal(s: &str) -> Self {
FacetStringOperator::Equal(s.to_lowercase())
}
#[allow(dead_code)]
fn not_equal(s: &str) -> Self {
FacetStringOperator::equal(s).negate()
}
fn negate(self) -> Self {
match self {
FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x),
FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x),
GreaterThan(n) => (LowerThanOrEqual(n), None),
GreaterThanOrEqual(n) => (LowerThan(n), None),
Equal(n, s) => (NotEqual(n, s), None),
NotEqual(n, s) => (Equal(n, s), None),
LowerThan(n) => (GreaterThanOrEqual(n), None),
LowerThanOrEqual(n) => (GreaterThan(n), None),
Between(n, m) => (LowerThan(n), Some(GreaterThan(m))),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum FacetCondition {
OperatorString(FieldId, FacetStringOperator),
OperatorNumber(FieldId, FacetNumberOperator),
Operator(FieldId, Operator),
Or(Box<Self>, Box<Self>),
And(Box<Self>, Box<Self>),
}
fn get_field_id_facet_type<'a>(
fn field_id(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
items: &mut Pairs<'a, Rule>,
) -> Result<(FieldId, FacetType), PestError<Rule>>
faceted_fields: &HashSet<FieldId>,
items: &mut Pairs<Rule>,
) -> Result<FieldId, PestError<Rule>>
{
// lexing ensures that we at least have a key
let key = items.next().unwrap();
let field_id = fields_ids_map
.id(key.as_str())
.ok_or_else(|| {
PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` not found, available attributes are: {}",
key.as_str(),
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", ")
),
},
key.as_span(),
)
})?;
let facet_type = faceted_fields
.get(&field_id)
.copied()
.ok_or_else(|| {
PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` is not faceted, available faceted attributes are: {}",
key.as_str(),
faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::<Vec<_>>().join(", ")
),
},
key.as_span(),
)
})?;
let field_id = match fields_ids_map.id(key.as_str()) {
Some(field_id) => field_id,
None => return Err(PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` not found, available attributes are: {}",
key.as_str(),
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "),
),
},
key.as_span(),
)),
};
Ok((field_id, facet_type))
if !faceted_fields.contains(&field_id) {
return Err(PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` is not faceted, available faceted attributes are: {}",
key.as_str(),
faceted_fields.iter().flat_map(|id| {
fields_ids_map.name(*id)
}).collect::<Vec<_>>().join(", "),
),
},
key.as_span(),
));
}
Ok(field_id)
}
fn pest_parse<T>(pair: Pair<Rule>) -> Result<T, pest::error::Error<Rule>>
fn pest_parse<T>(pair: Pair<Rule>) -> (Result<T, pest::error::Error<Rule>>, String)
where T: FromStr,
T::Err: ToString,
{
match pair.as_str().parse() {
let result = match pair.as_str().parse::<T>() {
Ok(value) => Ok(value),
Err(e) => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError { message: e.to_string() },
pair.as_span(),
))
}
}
Err(e) => Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError { message: e.to_string() },
pair.as_span(),
)),
};
(result, pair.as_str().to_string())
}
impl FacetCondition {
@ -150,34 +122,6 @@ impl FacetCondition {
A: AsRef<str>,
B: AsRef<str>,
{
fn facet_condition(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<String, FacetType>,
key: &str,
value: &str,
) -> anyhow::Result<FacetCondition>
{
let fid = fields_ids_map.id(key).with_context(|| {
format!("{:?} isn't present in the fields ids map", key)
})?;
let ftype = faceted_fields.get(key).copied().with_context(|| {
format!("{:?} isn't a faceted field", key)
})?;
let (neg, value) = match value.trim().strip_prefix('-') {
Some(value) => (true, value.trim()),
None => (false, value.trim()),
};
let operator = match ftype {
FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)),
FacetType::Number => OperatorNumber(fid, FacetNumberOperator::Equal(value.parse()?)),
};
if neg { Ok(operator.negate()) } else { Ok(operator) }
}
let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?;
let mut ands = None;
for either in array {
@ -185,10 +129,7 @@ impl FacetCondition {
Either::Left(array) => {
let mut ors = None;
for rule in array {
let mut iter = rule.as_ref().splitn(2, ':');
let key = iter.next().context("missing facet condition key")?;
let value = iter.next().context("missing facet condition value")?;
let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?;
ors = match ors.take() {
Some(ors) => Some(Or(Box::new(ors), Box::new(condition))),
None => Some(condition),
@ -203,10 +144,7 @@ impl FacetCondition {
}
},
Either::Right(rule) => {
let mut iter = rule.as_ref().splitn(2, ':');
let key = iter.next().context("missing facet condition key")?;
let value = iter.next().context("missing facet condition value")?;
let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?;
ands = match ands.take() {
Some(ands) => Some(And(Box::new(ands), Box::new(condition))),
None => Some(condition),
@ -232,7 +170,7 @@ impl FacetCondition {
fn from_pairs(
fim: &FieldsIdsMap,
ff: &HashMap<FieldId, FacetType>,
ff: &HashSet<FieldId>,
expression: Pairs<Rule>,
) -> anyhow::Result<Self>
{
@ -263,10 +201,9 @@ impl FacetCondition {
fn negate(self) -> FacetCondition {
match self {
OperatorString(fid, op) => OperatorString(fid, op.negate()),
OperatorNumber(fid, op) => match op.negate() {
(op, None) => OperatorNumber(fid, op),
(a, Some(b)) => Or(Box::new(OperatorNumber(fid, a)), Box::new(OperatorNumber(fid, b))),
Operator(fid, op) => match op.negate() {
(op, None) => Operator(fid, op),
(a, Some(b)) => Or(Box::new(Operator(fid, a)), Box::new(Operator(fid, b))),
},
Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())),
And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())),
@ -275,137 +212,96 @@ impl FacetCondition {
fn between(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let lvalue = items.next().unwrap();
let rvalue = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => {
let lvalue = pest_parse(lvalue)?;
let rvalue = pest_parse(rvalue)?;
Ok(OperatorNumber(fid, Between(lvalue, rvalue)))
},
}
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let (lresult, _) = pest_parse(items.next().unwrap());
let (rresult, _) = pest_parse(items.next().unwrap());
let lvalue = lresult?;
let rvalue = rresult?;
Ok(Operator(fid, Between(lvalue, rvalue)))
}
fn equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => Ok(OperatorString(fid, FacetStringOperator::equal(value.as_str()))),
FacetType::Number => Ok(OperatorNumber(fid, Equal(pest_parse(value)?))),
}
let (result, svalue) = pest_parse(value);
let svalue = svalue.to_lowercase();
Ok(Operator(fid, Equal(result.ok(), svalue)))
}
fn greater_than(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, GreaterThan(pest_parse(value)?))),
}
let (result, _svalue) = pest_parse(value);
Ok(Operator(fid, GreaterThan(result?)))
}
fn greater_than_or_equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, GreaterThanOrEqual(pest_parse(value)?))),
}
let (result, _svalue) = pest_parse(value);
Ok(Operator(fid, GreaterThanOrEqual(result?)))
}
fn lower_than(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, LowerThan(pest_parse(value)?))),
}
let (result, _svalue) = pest_parse(value);
Ok(Operator(fid, LowerThan(result?)))
}
fn lower_than_or_equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, LowerThanOrEqual(pest_parse(value)?))),
}
let (result, _svalue) = pest_parse(value);
Ok(Operator(fid, LowerThanOrEqual(result?)))
}
}
@ -485,34 +381,53 @@ impl FacetCondition {
Ok(())
}
fn evaluate_number_operator<>(
fn evaluate_operator(
rtxn: &heed::RoTxn,
index: &Index,
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: FieldId,
operator: FacetNumberOperator,
operator: &Operator,
) -> anyhow::Result<RoaringBitmap>
{
// Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the
// field id and the level.
let (left, right) = match operator {
GreaterThan(val) => (Excluded(val), Included(f64::MAX)),
GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)),
Equal(val) => (Included(val), Included(val)),
NotEqual(val) => {
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
let docids = Self::evaluate_number_operator(rtxn, index, db, field_id, Equal(val))?;
return Ok(all_documents_ids - docids);
GreaterThan(val) => (Excluded(*val), Included(f64::MAX)),
GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)),
Equal(number, string) => {
let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
let number_docids = match number {
Some(n) => {
let n = Included(*n);
let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?;
output
},
None => RoaringBitmap::new(),
};
return Ok(string_docids | number_docids);
},
LowerThan(val) => (Included(f64::MIN), Excluded(val)),
LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)),
Between(left, right) => (Included(left), Included(right)),
NotEqual(number, string) => {
let all_numbers_ids = if number.is_some() {
index.number_faceted_documents_ids(rtxn, field_id)?
} else {
RoaringBitmap::new()
};
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Equal(*number, string.clone());
let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?;
return Ok((all_numbers_ids | all_strings_ids) - docids);
},
LowerThan(val) => (Included(f64::MIN), Excluded(*val)),
LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)),
Between(left, right) => (Included(*left), Included(*right)),
};
// Ask for the biggest value that can exist for this specific field: if it exists
// that's fine, and if it doesn't, the value just before will be returned instead.
let biggest_level = db
let biggest_level = numbers_db
.remap_data_type::<DecodeIgnore>()
.get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))?
.and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None });
@ -520,52 +435,25 @@ impl FacetCondition {
match biggest_level {
Some(level) => {
let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, db, field_id, level, left, right, &mut output)?;
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?;
Ok(output)
},
None => Ok(RoaringBitmap::new()),
}
}
/// Evaluates a string facet operator for `field_id` against `db` and returns
/// the ids of the matching documents.
///
/// - `Equal(string)`: returns the bitmap stored under the `(field_id, string)` key,
///   or an empty bitmap when `db` holds no entry for that key.
/// - `NotEqual(string)`: returns the ids given by `index.faceted_documents_ids`
///   for `field_id`, minus the ids that match `Equal(string)`.
///
/// Errors raised by the lookups are propagated to the caller.
fn evaluate_string_operator(
rtxn: &heed::RoTxn,
index: &Index,
db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: FieldId,
operator: &FacetStringOperator,
) -> anyhow::Result<RoaringBitmap>
{
match operator {
FacetStringOperator::Equal(string) => {
// A missing key means no document holds this value: answer with an empty set.
match db.get(rtxn, &(field_id, string))? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new())
}
},
FacetStringOperator::NotEqual(string) => {
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
// Reuse the `Equal` branch to find the documents that must be excluded.
let op = FacetStringOperator::Equal(string.clone());
let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?;
Ok(all_documents_ids - docids)
},
}
}
pub fn evaluate(
&self,
rtxn: &heed::RoTxn,
index: &Index,
) -> anyhow::Result<RoaringBitmap>
{
let db = index.facet_field_id_value_docids;
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
match self {
OperatorString(fid, op) => {
let db = db.remap_key_type::<FacetValueStringCodec>();
Self::evaluate_string_operator(rtxn, index, db, *fid, op)
},
OperatorNumber(fid, op) => {
let db = db.remap_key_type::<FacetLevelValueF64Codec>();
Self::evaluate_number_operator(rtxn, index, db, *fid, *op)
Operator(fid, op) => {
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op)
},
Or(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?;
@ -586,7 +474,8 @@ mod tests {
use super::*;
use crate::update::Settings;
use heed::EnvOpenOptions;
use maplit::hashmap;
use maplit::hashset;
use big_s::S;
#[test]
fn string() {
@ -598,22 +487,22 @@ mod tests {
// Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() });
builder.set_faceted_fields(hashset!{ S("channel") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::equal("Ponce"));
let condition = FacetCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap();
let expected = Operator(0, Operator::Equal(None, S("ponce")));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::not_equal("ponce"));
let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::not_equal("ponce"));
let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
assert_eq!(condition, expected);
}
@ -627,20 +516,20 @@ mod tests {
// Set the faceted fields to be the timestamp.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_faceted_fields(hashmap!{ "timestamp".into() => "number".into() });
builder.set_faceted_fields(hashset!{ "timestamp".into() });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap();
let expected = OperatorNumber(0, Between(22.0, 44.0));
let expected = Operator(0, Between(22.0, 44.0));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
let expected = Or(
Box::new(OperatorNumber(0, LowerThan(22.0))),
Box::new(OperatorNumber(0, GreaterThan(44.0))),
Box::new(Operator(0, LowerThan(22.0))),
Box::new(Operator(0, GreaterThan(44.0))),
);
assert_eq!(condition, expected);
}
@ -655,11 +544,8 @@ mod tests {
// Set the faceted fields to be the channel and the timestamp.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order
builder.set_faceted_fields(hashmap!{
"channel".into() => "string".into(),
"timestamp".into() => "number".into(),
});
builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -670,10 +556,10 @@ mod tests {
"channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)",
).unwrap();
let expected = Or(
Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))),
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(And(
Box::new(OperatorNumber(1, Between(22.0, 44.0))),
Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))),
Box::new(Operator(1, Between(22.0, 44.0))),
Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))),
))
);
assert_eq!(condition, expected);
@ -683,13 +569,13 @@ mod tests {
"channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)",
).unwrap();
let expected = Or(
Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))),
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(Or(
Box::new(Or(
Box::new(OperatorNumber(1, LowerThan(22.0))),
Box::new(OperatorNumber(1, GreaterThan(44.0))),
Box::new(Operator(1, LowerThan(22.0))),
Box::new(Operator(1, GreaterThan(44.0))),
)),
Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))),
Box::new(Operator(0, Operator::Equal(None, S("ponce")))),
)),
);
assert_eq!(condition, expected);
@ -705,11 +591,8 @@ mod tests {
// Set the faceted fields to be the channel and the timestamp.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order
builder.set_faceted_fields(hashmap!{
"channel".into() => "string".into(),
"timestamp".into() => "number".into(),
});
builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -717,7 +600,7 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_array(
&rtxn, &index,
vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])],
vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])],
).unwrap().unwrap();
let expected = FacetCondition::from_str(
&rtxn, &index,

View file

@ -3,12 +3,12 @@ use std::ops::Bound::Unbounded;
use std::{cmp, fmt};
use anyhow::Context;
use heed::BytesDecode;
use heed::{Database, BytesDecode};
use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap;
use crate::facet::{FacetType, FacetValue};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::facet::FacetType;
use crate::heed_codec::facet::FacetValueStringCodec;
use crate::search::facet::{FacetIter, FacetRange};
use crate::{Index, FieldId, DocumentId};
@ -60,86 +60,81 @@ impl<'a> FacetDistribution<'a> {
/// There is a small number of candidates OR we ask for facet string values, so we
/// decide to iterate over the facet values of each one of them, one by one.
fn facet_values_from_documents(
fn facet_distribution_from_documents(
&self,
field_id: FieldId,
facet_type: FacetType,
candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>>
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
{
fn fetch_facet_values<'t, KC, K: 't>(
index: &Index,
rtxn: &'t heed::RoTxn,
db: Database<KC, Unit>,
field_id: FieldId,
candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>>
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
where
K: fmt::Display,
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
K: Into<FacetValue>,
{
let mut facet_values = BTreeMap::new();
let mut key_buffer = vec![field_id];
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
key_buffer.truncate(1);
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = index.field_id_docid_facet_values
let iter = db
.remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &key_buffer)?
.remap_key_type::<KC>();
for result in iter {
let ((_, _, value), ()) = result?;
*facet_values.entry(value.into()).or_insert(0) += 1;
*distribution.entry(value.to_string()).or_insert(0) += 1;
}
}
Ok(facet_values)
Ok(())
}
let index = self.index;
let rtxn = self.rtxn;
match facet_type {
FacetType::String => {
fetch_facet_values::<FieldDocIdFacetStringCodec, _>(index, rtxn, field_id, candidates)
},
FacetType::Number => {
fetch_facet_values::<FieldDocIdFacetF64Codec, _>(index, rtxn, field_id, candidates)
let db = self.index.field_id_docid_facet_f64s;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
},
FacetType::String => {
let db = self.index.field_id_docid_facet_strings;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
}
}
}
/// There are too many documents, so we use the facet levels to move through
/// the facet values to find the associated candidates and values.
fn facet_values_from_facet_levels(
fn facet_numbers_distribution_from_facet_levels(
&self,
field_id: FieldId,
facet_type: FacetType,
candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>>
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
{
let iter = match facet_type {
FacetType::String => unreachable!(),
FacetType::Number => {
let iter = FacetIter::new_non_reducing(
self.rtxn, self.index, field_id, candidates.clone(),
)?;
iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))
},
};
let iter = FacetIter::new_non_reducing(
self.rtxn, self.index, field_id, candidates.clone(),
)?;
let mut facet_values = BTreeMap::new();
for result in iter {
let (value, mut docids) = result?;
docids.intersect_with(candidates);
if !docids.is_empty() {
facet_values.insert(value, docids.len());
distribution.insert(value.to_string(), docids.len());
}
if facet_values.len() == self.max_values_by_facet {
if distribution.len() == self.max_values_by_facet {
break;
}
}
Ok(facet_values)
Ok(())
}
/// Placeholder search, a.k.a. no candidates were specified. We iterate through the
@ -147,80 +142,73 @@ impl<'a> FacetDistribution<'a> {
fn facet_values_from_raw_facet_database(
&self,
field_id: FieldId,
facet_type: FacetType,
) -> heed::Result<BTreeMap<FacetValue, u64>>
) -> heed::Result<BTreeMap<String, u64>>
{
let db = self.index.facet_field_id_value_docids;
let level = 0;
let iter = match facet_type {
FacetType::String => {
let iter = db
.prefix_iter(self.rtxn, &[field_id])?
.remap_key_type::<FacetValueStringCodec>()
.map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids)));
Box::new(iter) as Box::<dyn Iterator<Item=_>>
},
FacetType::Number => {
let db = db.remap_key_type::<FacetLevelValueF64Codec>();
let range = FacetRange::new(
self.rtxn, db, field_id, level, Unbounded, Unbounded,
)?;
Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids))))
},
};
let mut distribution = BTreeMap::new();
let mut facet_values = BTreeMap::new();
for result in iter {
let (value, docids) = result?;
facet_values.insert(value, docids.len());
if facet_values.len() == self.max_values_by_facet {
let db = self.index.facet_id_f64_docids;
let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?;
for result in range {
let ((_, _, value, _), docids) = result?;
distribution.insert(value.to_string(), docids.len());
if distribution.len() == self.max_values_by_facet {
break;
}
}
Ok(facet_values)
let iter = self.index
.facet_id_string_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &[field_id])?
.remap_key_type::<FacetValueStringCodec>();
for result in iter {
let ((_, value), docids) = result?;
distribution.insert(value.to_string(), docids.len());
if distribution.len() == self.max_values_by_facet {
break;
}
}
Ok(distribution)
}
fn facet_values(
&self,
field_id: FieldId,
facet_type: FacetType,
) -> heed::Result<BTreeMap<FacetValue, u64>>
{
fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
use FacetType::{Number, String};
if let Some(candidates) = self.candidates.as_ref() {
// Classic search, candidates were specified, we must return facet values only related
// to those candidates. We also enter here for facet strings for performance reasons.
if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String {
self.facet_values_from_documents(field_id, facet_type, candidates)
let mut distribution = BTreeMap::new();
if candidates.len() <= CANDIDATES_THRESHOLD {
self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
} else {
self.facet_values_from_facet_levels(field_id, facet_type, candidates)
self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
}
Ok(distribution)
} else {
self.facet_values_from_raw_facet_database(field_id, facet_type)
self.facet_values_from_raw_facet_database(field_id)
}
}
pub fn execute(&self) -> anyhow::Result<BTreeMap<String, BTreeMap<FacetValue, u64>>> {
pub fn execute(&self) -> anyhow::Result<BTreeMap<String, BTreeMap<String, u64>>> {
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let faceted_fields = self.index.faceted_fields(self.rtxn)?;
let fields_ids: Vec<_> = match &self.facets {
Some(names) => names
.iter()
.filter_map(|n| faceted_fields.get(n).map(|t| (n.to_string(), *t)))
.collect(),
None => faceted_fields.into_iter().collect(),
};
let mut facets_values = BTreeMap::new();
for (name, ftype) in fields_ids {
let mut distribution = BTreeMap::new();
for name in faceted_fields {
let fid = fields_ids_map.id(&name).with_context(|| {
format!("missing field name {:?} from the fields id map", name)
})?;
let values = self.facet_values(fid, ftype)?;
facets_values.insert(name, values);
let values = self.facet_values(fid)?;
distribution.insert(name, values);
}
Ok(facets_values)
Ok(distribution)
}
}

View file

@ -9,7 +9,7 @@ use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::{Index, FieldId};
pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator};
pub use self::facet_condition::{FacetCondition, Operator};
pub use self::facet_distribution::FacetDistribution;
mod facet_condition;
@ -140,7 +140,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>();
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))];
@ -157,7 +157,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>();
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Right(highest_iter))];
@ -175,7 +175,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>();
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))];

View file

@ -16,9 +16,7 @@ use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct};
use crate::search::criteria::r#final::{Final, FinalResult};
use crate::{Index, DocumentId};
pub use self::facet::{
FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator,
};
pub use self::facet::{FacetCondition, FacetDistribution, FacetIter, Operator};
pub use self::query_tree::MatchingWords;
use self::query_tree::QueryTreeBuilder;
@ -143,15 +141,12 @@ impl<'a> Search<'a> {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
let id = field_ids_map.id(name).expect("distinct not present in field map");
let faceted_fields = self.index.faceted_fields(self.rtxn)?;
match faceted_fields.get(name) {
Some(facet_type) => {
let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type);
self.perform_sort(distinct, matching_words, criteria)
}
None => {
let distinct = MapDistinct::new(id, self.index, self.rtxn);
self.perform_sort(distinct, matching_words, criteria)
}
if faceted_fields.contains(name) {
let distinct = FacetDistinct::new(id, self.index, self.rtxn);
self.perform_sort(distinct, matching_words, criteria)
} else {
let distinct = MapDistinct::new(id, self.index, self.rtxn);
self.perform_sort(distinct, matching_words, criteria)
}
}
}