diff --git a/milli/src/index.rs b/milli/src/index.rs index 643a9ffb9..7be618789 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -478,13 +478,44 @@ impl Index { } #[cfg(test)] -mod tests { +pub(crate) mod tests { + use std::ops::Deref; + use heed::EnvOpenOptions; use maplit::hashmap; + use tempfile::TempDir; use crate::Index; use crate::update::{IndexDocuments, UpdateFormat}; + pub(crate) struct TempIndex { + inner: Index, + _tempdir: TempDir, + } + + impl Deref for TempIndex { + type Target = Index; + + fn deref(&self) -> &Self::Target { + &self.inner + } + } + + impl TempIndex { + /// Creates a temporary index, with a default `4096 * 100` size. This should be enough for + /// most tests. + pub fn new() -> Self { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 4096); + let _tempdir = TempDir::new_in(".").unwrap(); + let inner = Index::new(options, _tempdir.path()).unwrap(); + Self { + inner, + _tempdir + } + } + } + #[test] fn initial_fields_distribution() { let path = tempfile::tempdir().unwrap(); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 053bbd705..e97f8b922 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,7 +6,9 @@ use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; use super::{Distinct, DocIter}; -/// A distinct implementer that is backed by facets. On each iteration, the facet values for the +/// A distinct implementer that is backed by facets. +/// +/// On each iteration, the facet values for the /// distinct attribute of the first document are retrieved. The document ids for these facet values /// are then retrieved and taken out of the the candidate and added to the excluded set. We take /// care to keep the document we are currently on, and remove it from the excluded list. The next @@ -121,7 +123,7 @@ impl<'a> FacetDistinctIter<'a> { } /// Performs the next iteration of the facet distinct. This is a convenience method that is - /// called by the Iterator::next implementation that tranposes the result. It makes error + /// called by the Iterator::next implementation that transposes the result. It makes error /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { // The first step is to remove all the excluded documents from our candidates @@ -201,3 +203,36 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { } } } + +#[cfg(test)] +mod test { + use std::collections::HashMap; + + use super::*; + use super::super::test::{generate_index, validate_distinct_candidates}; + use crate::facet::FacetType; + + macro_rules! test_facet_distinct { + ($name:ident, $distinct:literal, $facet_type:expr) => { + #[test] + fn $name() { + use std::iter::FromIterator; + + let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let (index, fid, candidates) = generate_index($distinct, facets); + let txn = index.read_txn().unwrap(); + let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); + let excluded = RoaringBitmap::new(); + let mut iter = map_distinct.distinct(candidates.clone(), excluded); + let count = validate_distinct_candidates(iter.by_ref(), fid, &index); + let excluded = iter.into_excluded(); + assert_eq!(count as u64 + excluded.len(), candidates.len()); + } + }; + } + + test_facet_distinct!(test_string, "txt", FacetType::String); + test_facet_distinct!(test_strings, "txts", FacetType::String); + test_facet_distinct!(test_int, "cat-int", FacetType::Integer); + test_facet_distinct!(test_ints, "cat-ints", FacetType::Integer); +} diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index f2e31bce4..4c01d1ded 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -6,7 +6,9 @@ use serde_json::Value; use super::{Distinct, DocIter}; use crate::{DocumentId, FieldId, Index}; -/// A distinct implementer that is backed by an `HashMap`. Each time a document is seen, the value +/// A distinct implementer that is backed by an `HashMap`. +/// +/// Each time a document is seen, the value /// for its distinct field is added to the map. If the map already contains an entry for this /// value, then the document is filtered out, and is added to the excluded set. pub struct MapDistinct<'a> { @@ -38,7 +40,7 @@ pub struct MapDistinctIter<'a, 'b> { impl<'a, 'b> MapDistinctIter<'a, 'b> { /// Performs the next iteration of the mafacetp distinct. This is a convenience method that is - /// called by the Iterator::next implementation that tranposes the result. It makes error + /// called by the Iterator::next implementation that transposes the result. It makes error /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { let map = &mut self.map; @@ -105,3 +107,32 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { } } } + +#[cfg(test)] +mod test { + use std::collections::HashMap; + + use super::*; + use super::super::test::{generate_index, validate_distinct_candidates}; + + macro_rules! test_map_distinct { + ($name:ident, $distinct:literal) => { + #[test] + fn $name() { + let (index, fid, candidates) = generate_index($distinct, HashMap::new()); + let txn = index.read_txn().unwrap(); + let mut map_distinct = MapDistinct::new(fid, &index, &txn); + let excluded = RoaringBitmap::new(); + let mut iter = map_distinct.distinct(candidates.clone(), excluded); + let count = validate_distinct_candidates(iter.by_ref(), fid, &index); + let excluded = iter.into_excluded(); + assert_eq!(count as u64 + excluded.len(), candidates.len()); + } + }; + } + + test_map_distinct!(test_string, "txt"); + test_map_distinct!(test_strings, "txts"); + test_map_distinct!(test_int, "cat-int"); + test_map_distinct!(test_ints, "cat-ints"); +} diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 5f2e260e4..776f0d2b3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -4,14 +4,14 @@ mod noop_distinct; use roaring::RoaringBitmap; +use crate::DocumentId; pub use facet_distinct::FacetDistinct; pub use map_distinct::MapDistinct; pub use noop_distinct::NoopDistinct; -use crate::DocumentId; /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. /// It provides a way to get back the ownership to the excluded set. -pub trait DocIter: Iterator> { +pub trait DocIter: Iterator> { /// Returns ownership on the internal exluded set. fn into_excluded(self) -> RoaringBitmap; } @@ -25,3 +25,120 @@ pub trait Distinct<'a> { fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; } + +#[cfg(test)] +mod test { + use std::collections::{HashMap, HashSet}; + + use once_cell::sync::Lazy; + use rand::{seq::SliceRandom, Rng}; + use roaring::RoaringBitmap; + use serde_json::{json, Value}; + + use crate::index::{Index, tests::TempIndex}; + use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; + use crate::{BEU32, FieldId, DocumentId}; + + static JSON: Lazy = Lazy::new(generate_json); + + fn generate_json() -> Value { + let mut rng = rand::thread_rng(); + let num_docs = rng.gen_range(10..30); + + let mut documents = Vec::new(); + + let txts = ["toto", "titi", "tata"]; + let cats = (1..10).map(|i| i.to_string()).collect::>(); + let cat_ints = (1..10).collect::>(); + + for i in 0..num_docs { + let txt = txts.choose(&mut rng).unwrap(); + let mut sample_txts = cats.clone(); + sample_txts.shuffle(&mut rng); + + let mut sample_ints = cat_ints.clone(); + sample_ints.shuffle(&mut rng); + + let doc = json!({ + "id": i, + "txt": txt, + "cat-int": rng.gen_range(0..3), + "txts": sample_txts[..(rng.gen_range(0..3))], + "cat-ints": sample_ints[..(rng.gen_range(0..3))], + }); + documents.push(doc); + } + + Value::Array(documents) + } + + /// Returns a temporary index populated with random test documents, the FieldId for the + /// distinct attribute, and the RoaringBitmap with the document ids. + pub(crate) fn generate_index(distinct: &str, facets: HashMap) -> (TempIndex, FieldId, RoaringBitmap) { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + // set distinct and faceted attributes for the index. + let builder = UpdateBuilder::new(0); + let mut update = builder.settings(&mut txn, &index); + update.set_distinct_attribute(distinct.to_string()); + if !facets.is_empty() { + update.set_faceted_fields(facets) + } + update.execute(|_, _| ()).unwrap(); + + // add documents to the index + let builder = UpdateBuilder::new(1); + let mut addition = builder.index_documents(&mut txn, &index); + + addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + addition.update_format(UpdateFormat::Json); + + addition + .execute(JSON.to_string().as_bytes(), |_, _| ()) + .unwrap(); + + let fields_map = index.fields_ids_map(&txn).unwrap(); + let fid = fields_map.id(&distinct).unwrap(); + + let map = (0..JSON.as_array().unwrap().len() as u32).collect(); + + txn.commit().unwrap(); + + (index, fid, map) + } + + + /// Checks that all the candidates are distinct, and returns the candidates number. + pub(crate) fn validate_distinct_candidates( + candidates: impl Iterator>, + distinct: FieldId, + index: &Index, + ) -> usize { + fn test(seen: &mut HashSet, value: &Value) { + match value { + Value::Null | Value::Object(_) | Value::Bool(_) => (), + Value::Number(_) | Value::String(_) => { + let s = value.to_string(); + assert!(seen.insert(s)); + } + Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))} + } + } + + let mut seen = HashSet::::new(); + + let txn = index.read_txn().unwrap(); + let mut count = 0; + for candidate in candidates { + count += 1; + let candidate = candidate.unwrap(); + let id = BEU32::new(candidate); + let document = index.documents.get(&txn, &id).unwrap().unwrap(); + let value = document.get(distinct).unwrap(); + let value = serde_json::from_slice(value).unwrap(); + test(&mut seen, &value); + } + count + } +} diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 8f7bc7d17..3de9be631 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -3,8 +3,8 @@ use roaring::{RoaringBitmap, bitmap::IntoIter}; use crate::DocumentId; use super::{DocIter, Distinct}; -/// A distinct implementer that does not perform any distinct, and simply returns an iterator to -/// the candidates. +/// A distinct implementer that does not perform any distinct, +/// and simply returns an iterator to the candidates. pub struct NoopDistinct; pub struct NoopDistinctIter { @@ -36,3 +36,22 @@ impl Distinct<'_> for NoopDistinct { } } } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_noop() { + let candidates = (1..10).collect(); + let excluded = RoaringBitmap::new(); + let mut iter = NoopDistinct.distinct(candidates, excluded); + assert_eq!( + iter.by_ref().map(Result::unwrap).collect::>(), + (1..10).collect::>() + ); + + let excluded = iter.into_excluded(); + assert!(excluded.is_empty()); + } +}