fix facet distribution case

This commit is contained in:
mpostma 2020-06-26 12:59:20 +02:00
parent fb69769991
commit 18a50b4dac
4 changed files with 67 additions and 30 deletions

View File

@ -39,7 +39,7 @@ pub fn bucket_sort<'c, FI>(
query: &str, query: &str,
range: Range<usize>, range: Range<usize>,
facets_docids: Option<SetBuf<DocumentId>>, facets_docids: Option<SetBuf<DocumentId>>,
facet_count_docids: Option<HashMap<String, HashMap<String, Cow<Set<DocumentId>>>>>, facet_count_docids: Option<HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>>,
filter: Option<FI>, filter: Option<FI>,
criteria: Criteria<'c>, criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>, searchable_attrs: Option<ReorderedAttrs>,
@ -199,7 +199,7 @@ pub fn bucket_sort_with_distinct<'c, FI, FD>(
query: &str, query: &str,
range: Range<usize>, range: Range<usize>,
facets_docids: Option<SetBuf<DocumentId>>, facets_docids: Option<SetBuf<DocumentId>>,
facet_count_docids: Option<HashMap<String, HashMap<String, Cow<Set<DocumentId>>>>>, facet_count_docids: Option<HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>>,
filter: Option<FI>, filter: Option<FI>,
distinct: FD, distinct: FD,
distinct_size: usize, distinct_size: usize,
@ -636,18 +636,18 @@ pub fn placeholder_document_sort(
} }
/// For each entry in facet_docids, calculates the number of documents in the intersection with candidate_docids. /// For each entry in facet_docids, calculates the number of documents in the intersection with candidate_docids.
pub fn facet_count( fn facet_count(
facet_docids: HashMap<String, HashMap<String, Cow<Set<DocumentId>>>>, facet_docids: HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>,
candidate_docids: &Set<DocumentId>, candidate_docids: &Set<DocumentId>,
) -> HashMap<String, HashMap<String, usize>> { ) -> HashMap<String, HashMap<String, usize>> {
let mut facets_counts = HashMap::with_capacity(facet_docids.len()); let mut facets_counts = HashMap::with_capacity(facet_docids.len());
for (key, doc_map) in facet_docids { for (key, doc_map) in facet_docids {
let mut count_map = HashMap::with_capacity(doc_map.len()); let mut count_map = HashMap::with_capacity(doc_map.len());
for (value, docids) in doc_map { for (_, (value, docids)) in doc_map {
let mut counter = Counter::new(); let mut counter = Counter::new();
let op = OpBuilder::new(docids.as_ref(), candidate_docids).intersection(); let op = OpBuilder::new(docids.as_ref(), candidate_docids).intersection();
SetOperation::<DocumentId>::extend_collection(op, &mut counter); SetOperation::<DocumentId>::extend_collection(op, &mut counter);
count_map.insert(value, counter.0); count_map.insert(value.to_string(), counter.0);
} }
facets_counts.insert(key, count_map); facets_counts.insert(key, count_map);
} }

View File

@ -164,7 +164,7 @@ impl<'a> heed::BytesDecode<'a> for FacetKey {
} }
pub fn add_to_facet_map( pub fn add_to_facet_map(
facet_map: &mut HashMap<FacetKey, Vec<DocumentId>>, facet_map: &mut HashMap<FacetKey, (String, Vec<DocumentId>)>,
field_id: FieldId, field_id: FieldId,
value: Value, value: Value,
document_id: DocumentId, document_id: DocumentId,
@ -175,8 +175,8 @@ pub fn add_to_facet_map(
Value::Null => return Ok(()), Value::Null => return Ok(()),
value => return Err(FacetError::InvalidDocumentAttribute(value.to_string())), value => return Err(FacetError::InvalidDocumentAttribute(value.to_string())),
}; };
let key = FacetKey::new(field_id, value); let key = FacetKey::new(field_id, value.clone());
facet_map.entry(key).or_insert_with(Vec::new).push(document_id); facet_map.entry(key).or_insert_with(|| (value, Vec::new())).1.push(document_id);
Ok(()) Ok(())
} }
@ -185,8 +185,10 @@ pub fn facet_map_from_docids(
index: &crate::Index, index: &crate::Index,
document_ids: &[DocumentId], document_ids: &[DocumentId],
attributes_for_facetting: &[FieldId], attributes_for_facetting: &[FieldId],
) -> MResult<HashMap<FacetKey, Vec<DocumentId>>> { ) -> MResult<HashMap<FacetKey, (String, Vec<DocumentId>)>> {
let mut facet_map = HashMap::new(); // A hashmap that associates a facet key with a pair containing the original facet attribute
// string with its case preserved, and a list of document ids for that facet attribute.
let mut facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)> = HashMap::new();
for document_id in document_ids { for document_id in document_ids {
for result in index for result in index
.documents_fields .documents_fields
@ -212,7 +214,7 @@ pub fn facet_map_from_docs(
schema: &Schema, schema: &Schema,
documents: &HashMap<DocumentId, IndexMap<String, Value>>, documents: &HashMap<DocumentId, IndexMap<String, Value>>,
attributes_for_facetting: &[FieldId], attributes_for_facetting: &[FieldId],
) -> MResult<HashMap<FacetKey, Vec<DocumentId>>> { ) -> MResult<HashMap<FacetKey, (String, Vec<DocumentId>)>> {
let mut facet_map = HashMap::new(); let mut facet_map = HashMap::new();
let attributes_for_facetting = attributes_for_facetting let attributes_for_facetting = attributes_for_facetting
.iter() .iter()

View File

@ -97,16 +97,14 @@ impl<'c, 'f, 'd, 'i> QueryBuilder<'c, 'f, 'd, 'i> {
.unwrap_or_default(); .unwrap_or_default();
ors.push(docids); ors.push(docids);
} }
let sets: Vec<_> = ors.iter().map(Cow::deref).collect(); let sets: Vec<_> = ors.iter().map(|i| &i.1).map(Cow::deref).collect();
let or_result = sdset::multi::OpBuilder::from_vec(sets) let or_result = sdset::multi::OpBuilder::from_vec(sets).union().into_set_buf();
.union()
.into_set_buf();
ands.push(Cow::Owned(or_result)); ands.push(Cow::Owned(or_result));
ors.clear(); ors.clear();
} }
Either::Right(key) => { Either::Right(key) => {
match self.index.facets.facet_document_ids(reader, &key)? { match self.index.facets.facet_document_ids(reader, &key)? {
Some(docids) => ands.push(docids), Some(docids) => ands.push(docids.1),
// no candidates for search, early return. // no candidates for search, early return.
None => return Ok(Some(SetBuf::default())), None => return Ok(Some(SetBuf::default())),
} }

View File

@ -1,7 +1,8 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::mem;
use heed::{RwTxn, RoTxn, Result as ZResult, RoRange}; use heed::{RwTxn, RoTxn, Result as ZResult, RoRange, types::Str, BytesEncode, BytesDecode};
use sdset::{SetBuf, Set, SetOperation}; use sdset::{SetBuf, Set, SetOperation};
use meilisearch_types::DocumentId; use meilisearch_types::DocumentId;
@ -14,40 +15,76 @@ use super::cow_set::CowSet;
/// contains facet info /// contains facet info
#[derive(Clone, Copy)] #[derive(Clone, Copy)]
pub struct Facets { pub struct Facets {
pub(crate) facets: heed::Database<FacetKey, CowSet<DocumentId>>, pub(crate) facets: heed::Database<FacetKey, FacetData>,
}
pub struct FacetData;
impl<'a> BytesEncode<'a> for FacetData {
type EItem = (&'a str, &'a Set<DocumentId>);
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
// get size of the first item
let first_size = item.0.as_bytes().len();
let size = mem::size_of::<usize>()
+ first_size
+ item.1.len() * mem::size_of::<DocumentId>();
let mut buffer = Vec::with_capacity(size);
// encode the length of the first item
buffer.extend_from_slice(&first_size.to_be_bytes());
buffer.extend_from_slice(Str::bytes_encode(&item.0)?.as_ref());
let second_slice = CowSet::bytes_encode(&item.1)?;
buffer.extend_from_slice(second_slice.as_ref());
Some(Cow::Owned(buffer))
}
}
impl<'a> BytesDecode<'a> for FacetData {
type DItem = (&'a str, Cow<'a, Set<DocumentId>>);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let mut size_buf = [0; 8];
size_buf.copy_from_slice(bytes.get(0..8)?);
// decode size of the first item from the bytes
let first_size = usize::from_be_bytes(size_buf);
// decode first and second items
let first_item = Str::bytes_decode(bytes.get(8..(8 + first_size))?)?;
let second_item = CowSet::bytes_decode(bytes.get((8 + first_size)..)?)?;
Some((first_item, second_item))
}
} }
impl Facets { impl Facets {
// we use sdset::SetBuf to ensure the docids are sorted. // we use sdset::SetBuf to ensure the docids are sorted.
pub fn put_facet_document_ids(&self, writer: &mut RwTxn<MainT>, facet_key: FacetKey, doc_ids: &Set<DocumentId>) -> ZResult<()> { pub fn put_facet_document_ids(&self, writer: &mut RwTxn<MainT>, facet_key: FacetKey, doc_ids: &Set<DocumentId>, facet_value: &str) -> ZResult<()> {
self.facets.put(writer, &facet_key, doc_ids) self.facets.put(writer, &facet_key, &(facet_value, doc_ids))
} }
pub fn field_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, field_id: FieldId) -> ZResult<RoRange<'txn, FacetKey, CowSet<DocumentId>>> { pub fn field_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, field_id: FieldId) -> ZResult<RoRange<'txn, FacetKey, FacetData>> {
self.facets.prefix_iter(reader, &FacetKey::new(field_id, String::new())) self.facets.prefix_iter(reader, &FacetKey::new(field_id, String::new()))
} }
pub fn facet_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, facet_key: &FacetKey) -> ZResult<Option<Cow<'txn, Set<DocumentId>>>> { pub fn facet_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, facet_key: &FacetKey) -> ZResult<Option<(&'txn str,Cow<'txn, Set<DocumentId>>)>> {
self.facets.get(reader, &facet_key) self.facets.get(reader, &facet_key)
} }
/// updates the facets store, removing the documents from the facets provided in the /// updates the facets store, removing the documents from the facets provided in the
/// `facet_map` argument /// `facet_map` argument
pub fn remove(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, Vec<DocumentId>>) -> ZResult<()> { pub fn remove(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)>) -> ZResult<()> {
for (key, document_ids) in facet_map { for (key, (name, document_ids)) in facet_map {
if let Some(old) = self.facets.get(writer, &key)? { if let Some((_, old)) = self.facets.get(writer, &key)? {
let to_remove = SetBuf::from_dirty(document_ids); let to_remove = SetBuf::from_dirty(document_ids);
let new = sdset::duo::OpBuilder::new(old.as_ref(), to_remove.as_set()).difference().into_set_buf(); let new = sdset::duo::OpBuilder::new(old.as_ref(), to_remove.as_set()).difference().into_set_buf();
self.facets.put(writer, &key, new.as_set())?; self.facets.put(writer, &key, &(&name, new.as_set()))?;
} }
} }
Ok(()) Ok(())
} }
pub fn add(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, Vec<DocumentId>>) -> ZResult<()> { pub fn add(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)>) -> ZResult<()> {
for (key, document_ids) in facet_map { for (key, (facet_name, document_ids)) in facet_map {
let set = SetBuf::from_dirty(document_ids); let set = SetBuf::from_dirty(document_ids);
self.put_facet_document_ids(writer, key, set.as_set())?; self.put_facet_document_ids(writer, key, set.as_set(), &facet_name)?;
} }
Ok(()) Ok(())
} }