diff --git a/milli/src/index.rs b/milli/src/index.rs index 59f966b95..a2b6cc440 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -343,6 +343,20 @@ impl Index { } } + /* Distinct attribute */ + + pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { + self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) + } + + pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { + self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) + } + + pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) + } + /* criteria */ pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { @@ -462,17 +476,6 @@ impl Index { self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) } - pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { - self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) - } - - pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) - } - - pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) - } } #[cfg(test)] diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index cecb8ba4b..053bbd705 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,6 +6,12 @@ use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; use super::{Distinct, DocIter}; +/// A distinct implementer that is backed by facets. On each iteration, the facet values for the +/// distinct attribute of the first document are retrieved. The document ids for these facet values +/// are then retrieved and taken out of the the candidate and added to the excluded set. We take +/// care to keep the document we are currently on, and remove it from the excluded list. The next +/// iterations will never contain any occurence of a document with the same distinct value as a +/// document from previous iterations. pub struct FacetDistinct<'a> { distinct: FieldId, index: &'a Index, @@ -114,6 +120,9 @@ impl<'a> FacetDistinctIter<'a> { Ok(()) } + /// Performs the next iteration of the facet distinct. This is a convenience method that is + /// called by the Iterator::next implementation that tranposes the result. It makes error + /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { // The first step is to remove all the excluded documents from our candidates self.candidates.difference_with(&self.excluded); @@ -127,8 +136,10 @@ impl<'a> FacetDistinctIter<'a> { FacetType::Float => self.distinct_float(id)?, }; - // On every iteration, the first document is always a distinct one, since it - // hasn't been discarded by the previous difference. + // The first document of each iteration is kept, since the next call to + // `difference_with` will filter out all the documents for that facet value. By + // increasing the offset we make sure to get the first valid value for the next + // distinct document to keep. self.iter_offset += 1; Ok(Some(id)) } diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index 411d63c87..37e52aa6f 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -6,6 +6,9 @@ use serde_json::Value; use super::{Distinct, DocIter}; use crate::{DocumentId, FieldId, Index}; +/// A distinct implementer that is backed by an `HashMap`. Each time a document is seen, the value +/// for its distinct field is added to the map. If the map already contains an entry for this +/// value, then the document is filtered out, and is added to the excluded set. pub struct MapDistinct<'a> { distinct: FieldId, map: HashMap, diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 0f24e2d03..5f2e260e4 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -9,11 +9,17 @@ pub use map_distinct::MapDistinct; pub use noop_distinct::NoopDistinct; use crate::DocumentId; +/// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. +/// It provides a way to get back the ownership to the excluded set. pub trait DocIter: Iterator> { - /// Returns ownership on the internal RoaringBitmaps: (candidates, excluded) + /// Returns ownership on the internal exluded set. fn into_excluded(self) -> RoaringBitmap; } +/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct +/// must return an iterator containing only distinct documents, and add the discarded documents to +/// the excluded set. The excluded set can later be retrieved by calling `DocIter::excluded` on the +/// returned iterator. pub trait Distinct<'a> { type Iter: DocIter; diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 6484f4d64..9fdf17187 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -1,12 +1,14 @@ -use roaring::RoaringBitmap; +use roaring::{RoaringBitmap, bitmap::IntoIter}; use crate::DocumentId; use super::{DocIter, Distinct}; +/// A distinct implementer that does not perform any distinct, and simply returns an iterator to +/// the candidates. pub struct NoopDistinct; pub struct NoopDistinctIter { - candidates: roaring::bitmap::IntoIter, + candidates: IntoIter, excluded: RoaringBitmap, }