diff --git a/milli/src/index.rs b/milli/src/index.rs
index 643a9ffb9..7be618789 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -478,13 +478,44 @@ impl Index {
}
#[cfg(test)]
-mod tests {
+pub(crate) mod tests {
+ use std::ops::Deref;
+
use heed::EnvOpenOptions;
use maplit::hashmap;
+ use tempfile::TempDir;
use crate::Index;
use crate::update::{IndexDocuments, UpdateFormat};
+ pub(crate) struct TempIndex {
+ inner: Index,
+ _tempdir: TempDir,
+ }
+
+ impl Deref for TempIndex {
+ type Target = Index;
+
+ fn deref(&self) -> &Self::Target {
+ &self.inner
+ }
+ }
+
+ impl TempIndex {
+ /// Creates a temporary index, with a default `4096 * 100` size. This should be enough for
+ /// most tests.
+ pub fn new() -> Self {
+ let mut options = EnvOpenOptions::new();
+ options.map_size(100 * 4096);
+ let _tempdir = TempDir::new_in(".").unwrap();
+ let inner = Index::new(options, _tempdir.path()).unwrap();
+ Self {
+ inner,
+ _tempdir
+ }
+ }
+ }
+
#[test]
fn initial_fields_distribution() {
let path = tempfile::tempdir().unwrap();
diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs
index 053bbd705..e97f8b922 100644
--- a/milli/src/search/distinct/facet_distinct.rs
+++ b/milli/src/search/distinct/facet_distinct.rs
@@ -6,7 +6,9 @@ use crate::heed_codec::facet::*;
use crate::{facet::FacetType, DocumentId, FieldId, Index};
use super::{Distinct, DocIter};
-/// A distinct implementer that is backed by facets. On each iteration, the facet values for the
+/// A distinct implementer that is backed by facets.
+///
+/// On each iteration, the facet values for the
/// distinct attribute of the first document are retrieved. The document ids for these facet values
/// are then retrieved and taken out of the the candidate and added to the excluded set. We take
/// care to keep the document we are currently on, and remove it from the excluded list. The next
@@ -121,7 +123,7 @@ impl<'a> FacetDistinctIter<'a> {
}
/// Performs the next iteration of the facet distinct. This is a convenience method that is
- /// called by the Iterator::next implementation that tranposes the result. It makes error
+ /// called by the Iterator::next implementation that transposes the result. It makes error
/// handling easier.
fn next_inner(&mut self) -> anyhow::Result> {
// The first step is to remove all the excluded documents from our candidates
@@ -201,3 +203,36 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> {
}
}
}
+
+#[cfg(test)]
+mod test {
+ use std::collections::HashMap;
+
+ use super::*;
+ use super::super::test::{generate_index, validate_distinct_candidates};
+ use crate::facet::FacetType;
+
+ macro_rules! test_facet_distinct {
+ ($name:ident, $distinct:literal, $facet_type:expr) => {
+ #[test]
+ fn $name() {
+ use std::iter::FromIterator;
+
+ let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string())));
+ let (index, fid, candidates) = generate_index($distinct, facets);
+ let txn = index.read_txn().unwrap();
+ let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type);
+ let excluded = RoaringBitmap::new();
+ let mut iter = map_distinct.distinct(candidates.clone(), excluded);
+ let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
+ let excluded = iter.into_excluded();
+ assert_eq!(count as u64 + excluded.len(), candidates.len());
+ }
+ };
+ }
+
+ test_facet_distinct!(test_string, "txt", FacetType::String);
+ test_facet_distinct!(test_strings, "txts", FacetType::String);
+ test_facet_distinct!(test_int, "cat-int", FacetType::Integer);
+ test_facet_distinct!(test_ints, "cat-ints", FacetType::Integer);
+}
diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs
index f2e31bce4..4c01d1ded 100644
--- a/milli/src/search/distinct/map_distinct.rs
+++ b/milli/src/search/distinct/map_distinct.rs
@@ -6,7 +6,9 @@ use serde_json::Value;
use super::{Distinct, DocIter};
use crate::{DocumentId, FieldId, Index};
-/// A distinct implementer that is backed by an `HashMap`. Each time a document is seen, the value
+/// A distinct implementer that is backed by an `HashMap`.
+///
+/// Each time a document is seen, the value
/// for its distinct field is added to the map. If the map already contains an entry for this
/// value, then the document is filtered out, and is added to the excluded set.
pub struct MapDistinct<'a> {
@@ -38,7 +40,7 @@ pub struct MapDistinctIter<'a, 'b> {
impl<'a, 'b> MapDistinctIter<'a, 'b> {
/// Performs the next iteration of the mafacetp distinct. This is a convenience method that is
- /// called by the Iterator::next implementation that tranposes the result. It makes error
+ /// called by the Iterator::next implementation that transposes the result. It makes error
/// handling easier.
fn next_inner(&mut self) -> anyhow::Result > {
let map = &mut self.map;
@@ -105,3 +107,32 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> {
}
}
}
+
+#[cfg(test)]
+mod test {
+ use std::collections::HashMap;
+
+ use super::*;
+ use super::super::test::{generate_index, validate_distinct_candidates};
+
+ macro_rules! test_map_distinct {
+ ($name:ident, $distinct:literal) => {
+ #[test]
+ fn $name() {
+ let (index, fid, candidates) = generate_index($distinct, HashMap::new());
+ let txn = index.read_txn().unwrap();
+ let mut map_distinct = MapDistinct::new(fid, &index, &txn);
+ let excluded = RoaringBitmap::new();
+ let mut iter = map_distinct.distinct(candidates.clone(), excluded);
+ let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
+ let excluded = iter.into_excluded();
+ assert_eq!(count as u64 + excluded.len(), candidates.len());
+ }
+ };
+ }
+
+ test_map_distinct!(test_string, "txt");
+ test_map_distinct!(test_strings, "txts");
+ test_map_distinct!(test_int, "cat-int");
+ test_map_distinct!(test_ints, "cat-ints");
+}
diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs
index 5f2e260e4..776f0d2b3 100644
--- a/milli/src/search/distinct/mod.rs
+++ b/milli/src/search/distinct/mod.rs
@@ -4,14 +4,14 @@ mod noop_distinct;
use roaring::RoaringBitmap;
+use crate::DocumentId;
pub use facet_distinct::FacetDistinct;
pub use map_distinct::MapDistinct;
pub use noop_distinct::NoopDistinct;
-use crate::DocumentId;
/// A trait implemented by document interators that are returned by calls to `Distinct::distinct`.
/// It provides a way to get back the ownership to the excluded set.
-pub trait DocIter: Iterator- > {
+pub trait DocIter: Iterator
- > {
/// Returns ownership on the internal exluded set.
fn into_excluded(self) -> RoaringBitmap;
}
@@ -25,3 +25,120 @@ pub trait Distinct<'a> {
fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter;
}
+
+#[cfg(test)]
+mod test {
+ use std::collections::{HashMap, HashSet};
+
+ use once_cell::sync::Lazy;
+ use rand::{seq::SliceRandom, Rng};
+ use roaring::RoaringBitmap;
+ use serde_json::{json, Value};
+
+ use crate::index::{Index, tests::TempIndex};
+ use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
+ use crate::{BEU32, FieldId, DocumentId};
+
+ static JSON: Lazy
= Lazy::new(generate_json);
+
+ fn generate_json() -> Value {
+ let mut rng = rand::thread_rng();
+ let num_docs = rng.gen_range(10..30);
+
+ let mut documents = Vec::new();
+
+ let txts = ["toto", "titi", "tata"];
+ let cats = (1..10).map(|i| i.to_string()).collect::>();
+ let cat_ints = (1..10).collect::>();
+
+ for i in 0..num_docs {
+ let txt = txts.choose(&mut rng).unwrap();
+ let mut sample_txts = cats.clone();
+ sample_txts.shuffle(&mut rng);
+
+ let mut sample_ints = cat_ints.clone();
+ sample_ints.shuffle(&mut rng);
+
+ let doc = json!({
+ "id": i,
+ "txt": txt,
+ "cat-int": rng.gen_range(0..3),
+ "txts": sample_txts[..(rng.gen_range(0..3))],
+ "cat-ints": sample_ints[..(rng.gen_range(0..3))],
+ });
+ documents.push(doc);
+ }
+
+ Value::Array(documents)
+ }
+
+ /// Returns a temporary index populated with random test documents, the FieldId for the
+ /// distinct attribute, and the RoaringBitmap with the document ids.
+ pub(crate) fn generate_index(distinct: &str, facets: HashMap) -> (TempIndex, FieldId, RoaringBitmap) {
+ let index = TempIndex::new();
+ let mut txn = index.write_txn().unwrap();
+
+ // set distinct and faceted attributes for the index.
+ let builder = UpdateBuilder::new(0);
+ let mut update = builder.settings(&mut txn, &index);
+ update.set_distinct_attribute(distinct.to_string());
+ if !facets.is_empty() {
+ update.set_faceted_fields(facets)
+ }
+ update.execute(|_, _| ()).unwrap();
+
+ // add documents to the index
+ let builder = UpdateBuilder::new(1);
+ let mut addition = builder.index_documents(&mut txn, &index);
+
+ addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+ addition.update_format(UpdateFormat::Json);
+
+ addition
+ .execute(JSON.to_string().as_bytes(), |_, _| ())
+ .unwrap();
+
+ let fields_map = index.fields_ids_map(&txn).unwrap();
+ let fid = fields_map.id(&distinct).unwrap();
+
+ let map = (0..JSON.as_array().unwrap().len() as u32).collect();
+
+ txn.commit().unwrap();
+
+ (index, fid, map)
+ }
+
+
+ /// Checks that all the candidates are distinct, and returns the candidates number.
+ pub(crate) fn validate_distinct_candidates(
+ candidates: impl Iterator- >,
+ distinct: FieldId,
+ index: &Index,
+ ) -> usize {
+ fn test(seen: &mut HashSet
, value: &Value) {
+ match value {
+ Value::Null | Value::Object(_) | Value::Bool(_) => (),
+ Value::Number(_) | Value::String(_) => {
+ let s = value.to_string();
+ assert!(seen.insert(s));
+ }
+ Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))}
+ }
+ }
+
+ let mut seen = HashSet::::new();
+
+ let txn = index.read_txn().unwrap();
+ let mut count = 0;
+ for candidate in candidates {
+ count += 1;
+ let candidate = candidate.unwrap();
+ let id = BEU32::new(candidate);
+ let document = index.documents.get(&txn, &id).unwrap().unwrap();
+ let value = document.get(distinct).unwrap();
+ let value = serde_json::from_slice(value).unwrap();
+ test(&mut seen, &value);
+ }
+ count
+ }
+}
diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs
index 8f7bc7d17..3de9be631 100644
--- a/milli/src/search/distinct/noop_distinct.rs
+++ b/milli/src/search/distinct/noop_distinct.rs
@@ -3,8 +3,8 @@ use roaring::{RoaringBitmap, bitmap::IntoIter};
use crate::DocumentId;
use super::{DocIter, Distinct};
-/// A distinct implementer that does not perform any distinct, and simply returns an iterator to
-/// the candidates.
+/// A distinct implementer that does not perform any distinct,
+/// and simply returns an iterator to the candidates.
pub struct NoopDistinct;
pub struct NoopDistinctIter {
@@ -36,3 +36,22 @@ impl Distinct<'_> for NoopDistinct {
}
}
}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_noop() {
+ let candidates = (1..10).collect();
+ let excluded = RoaringBitmap::new();
+ let mut iter = NoopDistinct.distinct(candidates, excluded);
+ assert_eq!(
+ iter.by_ref().map(Result::unwrap).collect::>(),
+ (1..10).collect::>()
+ );
+
+ let excluded = iter.into_excluded();
+ assert!(excluded.is_empty());
+ }
+}