fix facet distribution case

mpostma 2020-06-26 12:59:20 +02:00
parent fb69769991
commit 18a50b4dac
4 changed files with 67 additions and 30 deletions

View File

@@ -39,7 +39,7 @@ pub fn bucket_sort<'c, FI>(
     query: &str,
     range: Range<usize>,
     facets_docids: Option<SetBuf<DocumentId>>,
-    facet_count_docids: Option<HashMap<String, HashMap<String, Cow<Set<DocumentId>>>>>,
+    facet_count_docids: Option<HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>>,
     filter: Option<FI>,
     criteria: Criteria<'c>,
     searchable_attrs: Option<ReorderedAttrs>,
@@ -199,7 +199,7 @@ pub fn bucket_sort_with_distinct<'c, FI, FD>(
     query: &str,
     range: Range<usize>,
     facets_docids: Option<SetBuf<DocumentId>>,
-    facet_count_docids: Option<HashMap<String, HashMap<String, Cow<Set<DocumentId>>>>>,
+    facet_count_docids: Option<HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>>,
     filter: Option<FI>,
     distinct: FD,
     distinct_size: usize,
@@ -636,18 +636,18 @@ pub fn placeholder_document_sort(
 }
 
 /// For each entry in facet_docids, calculates the number of documents in the intersection with candidate_docids.
-pub fn facet_count(
-    facet_docids: HashMap<String, HashMap<String, Cow<Set<DocumentId>>>>,
+fn facet_count(
+    facet_docids: HashMap<String, HashMap<String, (&str, Cow<Set<DocumentId>>)>>,
     candidate_docids: &Set<DocumentId>,
 ) -> HashMap<String, HashMap<String, usize>> {
     let mut facets_counts = HashMap::with_capacity(facet_docids.len());
     for (key, doc_map) in facet_docids {
         let mut count_map = HashMap::with_capacity(doc_map.len());
-        for (value, docids) in doc_map {
+        for (_, (value, docids)) in doc_map {
             let mut counter = Counter::new();
             let op = OpBuilder::new(docids.as_ref(), candidate_docids).intersection();
             SetOperation::<DocumentId>::extend_collection(op, &mut counter);
-            count_map.insert(value, counter.0);
+            count_map.insert(value.to_string(), counter.0);
         }
         facets_counts.insert(key, count_map);
     }

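The hunk above is where the case fix becomes visible in the counts: each inner map entry now carries the original-cased facet value next to its docid set, and that value, rather than the (case-normalized) map key, becomes the key of the distribution. A minimal standalone sketch of the same counting step, with plain sorted `Vec`s standing in for `sdset` sets and a stand-in `DocumentId` alias:

```rust
use std::collections::HashMap;

// stand-in for meilisearch_types::DocumentId
type DocumentId = u64;

/// For every facet value, count how many of its documents are also search candidates.
/// `facet_docids` maps attribute -> normalized value -> (original-cased value, sorted docids).
fn facet_count(
    facet_docids: HashMap<String, HashMap<String, (String, Vec<DocumentId>)>>,
    candidate_docids: &[DocumentId], // assumed sorted
) -> HashMap<String, HashMap<String, usize>> {
    let mut facets_counts = HashMap::with_capacity(facet_docids.len());
    for (attr, doc_map) in facet_docids {
        let mut count_map = HashMap::with_capacity(doc_map.len());
        for (_normalized, (original, docids)) in doc_map {
            // size of the intersection between this value's docids and the candidates
            let count = docids
                .iter()
                .filter(|&id| candidate_docids.binary_search(id).is_ok())
                .count();
            // the original-cased value, not the normalized key, ends up in the distribution
            count_map.insert(original, count);
        }
        facets_counts.insert(attr, count_map);
    }
    facets_counts
}

fn main() {
    let mut genres = HashMap::new();
    genres.insert("sci-fi".to_string(), ("Sci-Fi".to_string(), vec![1, 4, 7]));
    let mut facets = HashMap::new();
    facets.insert("genre".to_string(), genres);

    let counts = facet_count(facets, &[1, 2, 4]);
    assert_eq!(counts["genre"]["Sci-Fi"], 2); // keyed by "Sci-Fi", not "sci-fi"
}
```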
View File

@@ -164,7 +164,7 @@ impl<'a> heed::BytesDecode<'a> for FacetKey {
 }
 
 pub fn add_to_facet_map(
-    facet_map: &mut HashMap<FacetKey, Vec<DocumentId>>,
+    facet_map: &mut HashMap<FacetKey, (String, Vec<DocumentId>)>,
     field_id: FieldId,
     value: Value,
     document_id: DocumentId,
@@ -175,8 +175,8 @@ pub fn add_to_facet_map(
         Value::Null => return Ok(()),
         value => return Err(FacetError::InvalidDocumentAttribute(value.to_string())),
     };
-    let key = FacetKey::new(field_id, value);
-    facet_map.entry(key).or_insert_with(Vec::new).push(document_id);
+    let key = FacetKey::new(field_id, value.clone());
+    facet_map.entry(key).or_insert_with(|| (value, Vec::new())).1.push(document_id);
     Ok(())
 }
 
@@ -185,8 +185,10 @@ pub fn facet_map_from_docids(
     index: &crate::Index,
     document_ids: &[DocumentId],
     attributes_for_facetting: &[FieldId],
-) -> MResult<HashMap<FacetKey, Vec<DocumentId>>> {
-    let mut facet_map = HashMap::new();
+) -> MResult<HashMap<FacetKey, (String, Vec<DocumentId>)>> {
+    // A hashmap that associates a facet key with a pair containing the original facet attribute
+    // string with its case preserved, and a list of document ids for that facet attribute.
+    let mut facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)> = HashMap::new();
     for document_id in document_ids {
         for result in index
             .documents_fields
@@ -212,7 +214,7 @@ pub fn facet_map_from_docs(
     schema: &Schema,
     documents: &HashMap<DocumentId, IndexMap<String, Value>>,
     attributes_for_facetting: &[FieldId],
-) -> MResult<HashMap<FacetKey, Vec<DocumentId>>> {
+) -> MResult<HashMap<FacetKey, (String, Vec<DocumentId>)>> {
     let mut facet_map = HashMap::new();
     let attributes_for_facetting = attributes_for_facetting
         .iter()

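Both hunks above feed the same structure: the map key stays the `FacetKey` (assumed here to normalize the value, e.g. by lowercasing it), while the map value now pairs the original-cased string with the document ids. A rough sketch of that bookkeeping under those assumptions, with stand-in `DocumentId`/`FieldId` aliases and a hypothetical normalization step:

```rust
use std::collections::HashMap;

// stand-ins for meilisearch_types::DocumentId and the schema's FieldId
type DocumentId = u64;
type FieldId = u16;

/// Simplified stand-in for FacetKey: assumed to normalize the value.
#[derive(Hash, PartialEq, Eq)]
struct FacetKey(FieldId, String);

impl FacetKey {
    fn new(field_id: FieldId, value: String) -> FacetKey {
        FacetKey(field_id, value.to_lowercase())
    }
}

/// Record a document under a facet value, keeping the original casing alongside the ids.
fn add_to_facet_map(
    facet_map: &mut HashMap<FacetKey, (String, Vec<DocumentId>)>,
    field_id: FieldId,
    value: String,
    document_id: DocumentId,
) {
    let key = FacetKey::new(field_id, value.clone());
    facet_map
        .entry(key)
        .or_insert_with(|| (value, Vec::new()))
        .1
        .push(document_id);
}

fn main() {
    let mut map = HashMap::new();
    // "Sci-Fi" and "SCI-FI" collapse onto one key, but the first original spelling survives.
    add_to_facet_map(&mut map, 0, "Sci-Fi".to_string(), 1);
    add_to_facet_map(&mut map, 0, "SCI-FI".to_string(), 2);

    let (original, ids) = &map[&FacetKey::new(0, "sci-fi".to_string())];
    assert_eq!(original, "Sci-Fi");
    assert_eq!(ids, &vec![1, 2]);
}
```

The first spelling seen for a key wins, which is enough to report one consistent, original-cased value per facet in the distribution.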
View File

@@ -97,16 +97,14 @@ impl<'c, 'f, 'd, 'i> QueryBuilder<'c, 'f, 'd, 'i> {
                                     .unwrap_or_default();
                                 ors.push(docids);
                             }
-                            let sets: Vec<_> = ors.iter().map(Cow::deref).collect();
-                            let or_result = sdset::multi::OpBuilder::from_vec(sets)
-                                .union()
-                                .into_set_buf();
+                            let sets: Vec<_> = ors.iter().map(|i| &i.1).map(Cow::deref).collect();
+                            let or_result = sdset::multi::OpBuilder::from_vec(sets).union().into_set_buf();
                             ands.push(Cow::Owned(or_result));
                             ors.clear();
                         }
                         Either::Right(key) => {
                             match self.index.facets.facet_document_ids(reader, &key)? {
-                                Some(docids) => ands.push(docids),
+                                Some(docids) => ands.push(docids.1),
                                 // no candidates for search, early return.
                                 None => return Ok(Some(SetBuf::default())),
                             }

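With `facet_document_ids` now returning an `(original value, docids)` pair, the filter path only cares about the second element, hence the extra `.map(|i| &i.1)` before the sets are unioned. A condensed sketch of that projection, with sorted `Vec`s and a `BTreeSet` union standing in for the `sdset` operations:

```rust
use std::collections::BTreeSet;

// stand-in for meilisearch_types::DocumentId
type DocumentId = u64;

/// Union the docid sets of several (original value, docids) facet entries,
/// ignoring the display string that now travels with each set.
fn union_facet_docids(ors: &[(String, Vec<DocumentId>)]) -> Vec<DocumentId> {
    let mut union: BTreeSet<DocumentId> = BTreeSet::new();
    for (_value, docids) in ors {
        union.extend(docids.iter().copied());
    }
    union.into_iter().collect() // sorted and deduplicated, like a SetBuf
}

fn main() {
    let ors = vec![
        ("Sci-Fi".to_string(), vec![1, 4, 7]),
        ("Horror".to_string(), vec![2, 4]),
    ];
    assert_eq!(union_facet_docids(&ors), vec![1, 2, 4, 7]);
}
```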
View File

@@ -1,7 +1,8 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
+use std::mem;
 
-use heed::{RwTxn, RoTxn, Result as ZResult, RoRange};
+use heed::{RwTxn, RoTxn, Result as ZResult, RoRange, types::Str, BytesEncode, BytesDecode};
 use sdset::{SetBuf, Set, SetOperation};
 
 use meilisearch_types::DocumentId;
@@ -14,40 +15,76 @@ use super::cow_set::CowSet;
 /// contains facet info
 #[derive(Clone, Copy)]
 pub struct Facets {
-    pub(crate) facets: heed::Database<FacetKey, CowSet<DocumentId>>,
+    pub(crate) facets: heed::Database<FacetKey, FacetData>,
 }
 
+pub struct FacetData;
+
+impl<'a> BytesEncode<'a> for FacetData {
+    type EItem = (&'a str, &'a Set<DocumentId>);
+
+    fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+        // get size of the first item
+        let first_size = item.0.as_bytes().len();
+        let size = mem::size_of::<usize>()
+            + first_size
+            + item.1.len() * mem::size_of::<DocumentId>();
+        let mut buffer = Vec::with_capacity(size);
+        // encode the length of the first item
+        buffer.extend_from_slice(&first_size.to_be_bytes());
+        buffer.extend_from_slice(Str::bytes_encode(&item.0)?.as_ref());
+        let second_slice = CowSet::bytes_encode(&item.1)?;
+        buffer.extend_from_slice(second_slice.as_ref());
+        Some(Cow::Owned(buffer))
+    }
+}
+
+impl<'a> BytesDecode<'a> for FacetData {
+    type DItem = (&'a str, Cow<'a, Set<DocumentId>>);
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let mut size_buf = [0; 8];
+        size_buf.copy_from_slice(bytes.get(0..8)?);
+        // decode size of the first item from the bytes
+        let first_size = usize::from_be_bytes(size_buf);
+        // decode first and second items
+        let first_item = Str::bytes_decode(bytes.get(8..(8 + first_size))?)?;
+        let second_item = CowSet::bytes_decode(bytes.get((8 + first_size)..)?)?;
+        Some((first_item, second_item))
+    }
+}
+
 impl Facets {
     // we use sdset::SetBuf to ensure the docids are sorted.
-    pub fn put_facet_document_ids(&self, writer: &mut RwTxn<MainT>, facet_key: FacetKey, doc_ids: &Set<DocumentId>) -> ZResult<()> {
-        self.facets.put(writer, &facet_key, doc_ids)
+    pub fn put_facet_document_ids(&self, writer: &mut RwTxn<MainT>, facet_key: FacetKey, doc_ids: &Set<DocumentId>, facet_value: &str) -> ZResult<()> {
+        self.facets.put(writer, &facet_key, &(facet_value, doc_ids))
     }
 
-    pub fn field_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, field_id: FieldId) -> ZResult<RoRange<'txn, FacetKey, CowSet<DocumentId>>> {
+    pub fn field_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, field_id: FieldId) -> ZResult<RoRange<'txn, FacetKey, FacetData>> {
         self.facets.prefix_iter(reader, &FacetKey::new(field_id, String::new()))
     }
 
-    pub fn facet_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, facet_key: &FacetKey) -> ZResult<Option<Cow<'txn, Set<DocumentId>>>> {
+    pub fn facet_document_ids<'txn>(&self, reader: &'txn RoTxn<MainT>, facet_key: &FacetKey) -> ZResult<Option<(&'txn str, Cow<'txn, Set<DocumentId>>)>> {
         self.facets.get(reader, &facet_key)
     }
 
     /// updates the facets store, removing the documents from the facets provided in the
     /// `facet_map` argument
-    pub fn remove(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, Vec<DocumentId>>) -> ZResult<()> {
-        for (key, document_ids) in facet_map {
-            if let Some(old) = self.facets.get(writer, &key)? {
+    pub fn remove(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)>) -> ZResult<()> {
+        for (key, (name, document_ids)) in facet_map {
+            if let Some((_, old)) = self.facets.get(writer, &key)? {
                 let to_remove = SetBuf::from_dirty(document_ids);
                 let new = sdset::duo::OpBuilder::new(old.as_ref(), to_remove.as_set()).difference().into_set_buf();
-                self.facets.put(writer, &key, new.as_set())?;
+                self.facets.put(writer, &key, &(&name, new.as_set()))?;
             }
         }
         Ok(())
     }
 
-    pub fn add(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, Vec<DocumentId>>) -> ZResult<()> {
-        for (key, document_ids) in facet_map {
+    pub fn add(&self, writer: &mut RwTxn<MainT>, facet_map: HashMap<FacetKey, (String, Vec<DocumentId>)>) -> ZResult<()> {
+        for (key, (facet_name, document_ids)) in facet_map {
             let set = SetBuf::from_dirty(document_ids);
-            self.put_facet_document_ids(writer, key, set.as_set())?;
+            self.put_facet_document_ids(writer, key, set.as_set(), &facet_name)?;
         }
         Ok(())
     }
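The new `FacetData` codec stores each entry as an 8-byte big-endian length prefix, the UTF-8 bytes of the facet string, and then the serialized docid set, which lets `bytes_decode` hand the set back as a borrowed `Cow` over the trailing slice. A heed-free sketch of the same layout; the docids are written here as raw little-endian `u64`s purely for illustration, whereas the real codec delegates that part to `CowSet`:

```rust
// stand-in for meilisearch_types::DocumentId
type DocumentId = u64;

/// Encode (facet value, docids) as: 8-byte big-endian string length, the UTF-8 bytes
/// of the value, then the docids (raw little-endian u64s in this sketch).
fn encode_facet_data(value: &str, docids: &[DocumentId]) -> Vec<u8> {
    let mut buffer = Vec::with_capacity(8 + value.len() + docids.len() * 8);
    buffer.extend_from_slice(&(value.len() as u64).to_be_bytes());
    buffer.extend_from_slice(value.as_bytes());
    for id in docids {
        buffer.extend_from_slice(&id.to_le_bytes());
    }
    buffer
}

/// Decode the layout produced by `encode_facet_data`, borrowing the string from the input.
fn decode_facet_data(bytes: &[u8]) -> Option<(&str, Vec<DocumentId>)> {
    let mut size_buf = [0u8; 8];
    size_buf.copy_from_slice(bytes.get(0..8)?);
    let first_size = u64::from_be_bytes(size_buf) as usize;

    let value = std::str::from_utf8(bytes.get(8..8 + first_size)?).ok()?;
    let rest = bytes.get(8 + first_size..)?;
    let docids = rest
        .chunks_exact(8)
        .map(|chunk| {
            let mut id_buf = [0u8; 8];
            id_buf.copy_from_slice(chunk);
            u64::from_le_bytes(id_buf)
        })
        .collect();
    Some((value, docids))
}

fn main() {
    let encoded = encode_facet_data("Sci-Fi", &[1, 4, 7]);
    let (value, docids) = decode_facet_data(&encoded).unwrap();
    assert_eq!(value, "Sci-Fi");
    assert_eq!(docids, vec![1, 4, 7]);
}
```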