From 45c45e11ddf9bc5c077ce07d7c7168df0ee61d2c Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 7 Apr 2021 12:38:48 +0200 Subject: [PATCH 1/4] implement distinct attribute distinct can return error facet distinct on numbers return distinct error review fixes make get_facet_value more generic fixes --- milli/src/index.rs | 13 ++ milli/src/search/criteria/asc_desc.rs | 9 +- milli/src/search/criteria/fetcher.rs | 10 +- milli/src/search/criteria/mod.rs | 7 +- milli/src/search/criteria/proximity.rs | 15 +- milli/src/search/criteria/typo.rs | 65 +++++-- milli/src/search/criteria/words.rs | 10 +- milli/src/search/distinct/facet_distinct.rs | 192 ++++++++++++++++++++ milli/src/search/distinct/map_distinct.rs | 109 +++++++++++ milli/src/search/distinct/mod.rs | 21 +++ milli/src/search/distinct/noop_distinct.rs | 36 ++++ milli/src/search/mod.rs | 63 +++++-- milli/src/update/settings.rs | 28 +++ 13 files changed, 525 insertions(+), 53 deletions(-) create mode 100644 milli/src/search/distinct/facet_distinct.rs create mode 100644 milli/src/search/distinct/map_distinct.rs create mode 100644 milli/src/search/distinct/mod.rs create mode 100644 milli/src/search/distinct/noop_distinct.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index 1150edbca..59f966b95 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -19,6 +19,7 @@ use crate::{ pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; +pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; @@ -460,6 +461,18 @@ impl Index { pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) } + + pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { + self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) + } + + pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { + self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) + } + + pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) + } } #[cfg(test)] diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 78ae540e4..ddd25009d 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -17,7 +17,7 @@ use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use crate::{FieldsIdsMap, FieldId, Index}; -use super::{Criterion, CriterionResult}; +use super::{Criterion, CriterionResult, CriterionContext}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. @@ -151,7 +151,7 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, context: CriterionContext) -> anyhow::Result> { loop { debug!("Facet {}({}) iteration", if self.ascending { "Asc" } else { "Desc" }, self.field_name @@ -163,7 +163,8 @@ impl<'t> Criterion for AscDesc<'t> { let bucket_candidates = take(&mut self.bucket_candidates); match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { + let CriterionContext { word_cache, exclude } = context; + match parent.next(CriterionContext { exclude, word_cache })? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; let candidates = match (&self.query_tree, candidates) { @@ -173,7 +174,7 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), word_cache)?; candidates.intersect_with(&self.faceted_candidates); candidates }, diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index fa204bdf2..dcd40e43d 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context, CriterionContext}; /// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] @@ -61,7 +61,7 @@ impl<'t> Fetcher<'t> { } #[logging_timer::time("Fetcher::{}")] - pub fn next(&mut self) -> anyhow::Result> { + pub fn next(&mut self, exclude: &RoaringBitmap) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", @@ -90,7 +90,11 @@ impl<'t> Fetcher<'t> { Forbidden(_) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(&mut self.wdcache)? { + let context = CriterionContext { + word_cache: &mut self.wdcache, + exclude + }; + match parent.next(context)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 22f081871..5e25001a2 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -20,8 +20,13 @@ mod asc_desc; mod proximity; pub mod fetcher; +pub struct CriterionContext<'a, 'b> { + exclude: &'a RoaringBitmap, + word_cache: &'b mut WordDerivationsCache, +} + pub trait Criterion { - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; + fn next(&mut self, wdcache: CriterionContext) -> anyhow::Result>; } /// The result of a call to the parent criterion. diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b62eb8cfd..45cffb93d 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,7 +8,7 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree, CriterionContext}; pub struct Proximity<'t> { ctx: &'t dyn Context, @@ -56,8 +56,9 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, context: CriterionContext) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; + let CriterionContext { word_cache, exclude } = context; loop { debug!("Proximity at iteration {} (max {:?}) ({:?})", self.proximity, @@ -98,7 +99,7 @@ impl<'t> Criterion for Proximity<'t> { self.ctx, query_tree, candidates, - wdcache, + word_cache, )?; self.plane_sweep_cache = Some(cache.into_iter()); @@ -110,7 +111,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - wdcache, + word_cache, )? }; @@ -140,7 +141,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - wdcache, + word_cache, )?; new_candidates.difference_with(&candidates); @@ -170,11 +171,11 @@ impl<'t> Criterion for Proximity<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { + match parent.next(CriterionContext { exclude, word_cache })? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), word_cache)?, (None, None) => RoaringBitmap::new(), }; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index b17b7561b..1c3942495 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, CriterionContext}; pub struct Typo<'t> { ctx: &'t dyn Context, @@ -51,8 +51,9 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { #[logging_timer::time("Typo::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, context: CriterionContext) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; + let CriterionContext { word_cache, exclude } = context; loop { debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); @@ -71,9 +72,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; query_tree.clone() } else { query_tree.clone() @@ -84,7 +85,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - wdcache, + word_cache, )?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); @@ -109,9 +110,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; query_tree.clone() } else { query_tree.clone() @@ -122,7 +123,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - wdcache, + word_cache, )?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); @@ -147,7 +148,7 @@ impl<'t> Criterion for Typo<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { + match parent.next(CriterionContext { exclude, word_cache })? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; @@ -346,8 +347,12 @@ mod test { let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, query_tree, facet_candidates); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; - assert!(criteria.next(&mut wdcache).unwrap().is_none()); + assert!(criteria.next(sort_context).unwrap().is_none()); } #[test] @@ -381,7 +386,12 @@ mod test { bucket_candidates: candidates_1, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; + + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -403,7 +413,12 @@ mod test { bucket_candidates: candidates_2, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; + + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); } #[test] @@ -421,11 +436,19 @@ mod test { bucket_candidates: facet_candidates, }; + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; // first iteration, returns the facet candidates - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected)); + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; // second iteration, returns None because there is no more things to do - assert!(criteria.next(&mut wdcache).unwrap().is_none()); + assert!(criteria.next(sort_context ).unwrap().is_none()); } #[test] @@ -459,7 +482,12 @@ mod test { bucket_candidates: candidates_1 & &facet_candidates, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; + + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -481,7 +509,12 @@ mod test { bucket_candidates: candidates_2 & &facet_candidates, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; + + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); } } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 8774eed7c..b401f99fa 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -5,8 +5,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Criterion, CriterionResult, Context}; +use super::{resolve_query_tree, Criterion, CriterionResult, Context, CriterionContext}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -48,7 +47,8 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, context: CriterionContext) -> anyhow::Result> { + let CriterionContext { word_cache, exclude } = context; loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); @@ -62,7 +62,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Some(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, word_cache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -100,7 +100,7 @@ impl<'t> Criterion for Words<'t> { (None, None) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { + match parent.next(CriterionContext { word_cache, exclude })? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); self.candidates = candidates; diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs new file mode 100644 index 000000000..cecb8ba4b --- /dev/null +++ b/milli/src/search/distinct/facet_distinct.rs @@ -0,0 +1,192 @@ +use std::mem::size_of; + +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::*; +use crate::{facet::FacetType, DocumentId, FieldId, Index}; +use super::{Distinct, DocIter}; + +pub struct FacetDistinct<'a> { + distinct: FieldId, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, + facet_type: FacetType, +} + +impl<'a> FacetDistinct<'a> { + pub fn new( + distinct: FieldId, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, + facet_type: FacetType, + ) -> Self { + Self { + distinct, + index, + txn, + facet_type, + } + } +} + +pub struct FacetDistinctIter<'a> { + candidates: RoaringBitmap, + distinct: FieldId, + excluded: RoaringBitmap, + facet_type: FacetType, + index: &'a Index, + iter_offset: usize, + txn: &'a heed::RoTxn<'a>, +} + +impl<'a> FacetDistinctIter<'a> { + fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result + where + KC: heed::BytesEncode<'c>, + { + let facet_docids = self + .index + .facet_field_id_value_docids + .remap_key_type::() + .get(self.txn, key)? + .expect("Corrupted data: Facet values must exist"); + Ok(facet_docids) + } + + fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { + let iter = get_facet_values::( + id, + self.distinct, + self.index, + self.txn, + )?; + + for item in iter { + let ((_, _, value), _) = item?; + let key = (self.distinct, value); + let facet_docids = self.get_facet_docids::(&key)?; + self.excluded.union_with(&facet_docids); + } + + self.excluded.remove(id); + + Ok(()) + } + + fn distinct_integer(&mut self, id: DocumentId) -> anyhow::Result<()> { + let iter = get_facet_values::( + id, + self.distinct, + self.index, + self.txn, + )?; + + for item in iter { + let ((_, _, value), _) = item?; + // get facet docids on level 0 + let key = (self.distinct, 0, value, value); + let facet_docids = self.get_facet_docids::(&key)?; + self.excluded.union_with(&facet_docids); + } + + self.excluded.remove(id); + + Ok(()) + } + + fn distinct_float(&mut self, id: DocumentId) -> anyhow::Result<()> { + let iter = get_facet_values::(id, + self.distinct, + self.index, + self.txn, + )?; + + for item in iter { + let ((_, _, value), _) = item?; + // get facet docids on level 0 + let key = (self.distinct, 0, value, value); + let facet_docids = self.get_facet_docids::(&key)?; + self.excluded.union_with(&facet_docids); + } + + self.excluded.remove(id); + + Ok(()) + } + + fn next_inner(&mut self) -> anyhow::Result> { + // The first step is to remove all the excluded documents from our candidates + self.candidates.difference_with(&self.excluded); + + let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); + match candidates_iter.next() { + Some(id) => { + match self.facet_type { + FacetType::String => self.distinct_string(id)?, + FacetType::Integer => self.distinct_integer(id)?, + FacetType::Float => self.distinct_float(id)?, + }; + + // On every iteration, the first document is always a distinct one, since it + // hasn't been discarded by the previous difference. + self.iter_offset += 1; + Ok(Some(id)) + } + // no more candidate at this offset, return. + None => Ok(None), + } + } +} + +fn get_facet_values<'a, KC>( + id: DocumentId, + distinct: FieldId, + index: &Index, + txn: &'a heed::RoTxn, +) -> anyhow::Result> +where + KC: heed::BytesDecode<'a>, +{ + const FID_SIZE: usize = size_of::(); + const DOCID_SIZE: usize = size_of::(); + + let mut key = [0; FID_SIZE + DOCID_SIZE]; + key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); + key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); + + let iter = index + .field_id_docid_facet_values + .prefix_iter(txn, &key)? + .remap_key_type::(); + Ok(iter) +} + +impl Iterator for FacetDistinctIter<'_> { + type Item = anyhow::Result; + + fn next(&mut self) -> Option { + self.next_inner().transpose() + } +} + +impl DocIter for FacetDistinctIter<'_> { + fn into_excluded(self) -> RoaringBitmap { + self.excluded + } +} + +impl<'a> Distinct<'_> for FacetDistinct<'a> { + type Iter = FacetDistinctIter<'a>; + + fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { + FacetDistinctIter { + candidates, + distinct: self.distinct, + excluded, + facet_type: self.facet_type, + index: self.index, + iter_offset: 0, + txn: self.txn, + } + } +} diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs new file mode 100644 index 000000000..411d63c87 --- /dev/null +++ b/milli/src/search/distinct/map_distinct.rs @@ -0,0 +1,109 @@ +use std::collections::HashMap; + +use roaring::RoaringBitmap; +use serde_json::Value; + +use super::{Distinct, DocIter}; +use crate::{DocumentId, FieldId, Index}; + +pub struct MapDistinct<'a> { + distinct: FieldId, + map: HashMap, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, +} + +impl<'a> MapDistinct<'a> { + pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { + let map = HashMap::new(); + Self { + distinct, + map, + index, + txn, + } + } +} + +pub struct MapDistinctIter<'a, 'b> { + distinct: FieldId, + map: &'b mut HashMap, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, + candidates: roaring::bitmap::IntoIter, + excluded: RoaringBitmap, +} + +impl<'a, 'b> MapDistinctIter<'a, 'b> { + fn next_inner(&mut self) -> anyhow::Result> { + let map = &mut self.map; + let mut filter = |value: Value| { + let entry = map.entry(value.to_string()).or_insert(0); + *entry += 1; + *entry <= 1 + }; + + while let Some(id) = self.candidates.next() { + let document = self.index.documents(&self.txn, Some(id))?[0].1; + let value = document + .get(self.distinct) + .map(serde_json::from_slice::) + .transpose()?; + + let accept = match value { + Some(value) => { + match value { + // Since we can't distinct these values, we always accept them + Value::Null | Value::Object(_) => true, + Value::Array(values) => { + let mut accept = true; + for value in values { + accept &= filter(value); + } + accept + } + value => filter(value), + } + } + // Accept values by default. + _ => true, + }; + + if accept { + return Ok(Some(id)); + } else { + self.excluded.insert(id); + } + } + Ok(None) + } +} + +impl Iterator for MapDistinctIter<'_, '_> { + type Item = anyhow::Result; + + fn next(&mut self) -> Option { + self.next_inner().transpose() + } +} + +impl DocIter for MapDistinctIter<'_, '_> { + fn into_excluded(self) -> RoaringBitmap { + self.excluded + } +} + +impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { + type Iter = MapDistinctIter<'a, 'b>; + + fn distinct(&'b mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { + MapDistinctIter { + distinct: self.distinct, + map: &mut self.map, + index: &self.index, + txn: &self.txn, + candidates: candidates.into_iter(), + excluded, + } + } +} diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs new file mode 100644 index 000000000..0f24e2d03 --- /dev/null +++ b/milli/src/search/distinct/mod.rs @@ -0,0 +1,21 @@ +mod facet_distinct; +mod map_distinct; +mod noop_distinct; + +use roaring::RoaringBitmap; + +pub use facet_distinct::FacetDistinct; +pub use map_distinct::MapDistinct; +pub use noop_distinct::NoopDistinct; +use crate::DocumentId; + +pub trait DocIter: Iterator> { + /// Returns ownership on the internal RoaringBitmaps: (candidates, excluded) + fn into_excluded(self) -> RoaringBitmap; +} + +pub trait Distinct<'a> { + type Iter: DocIter; + + fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; +} diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs new file mode 100644 index 000000000..6484f4d64 --- /dev/null +++ b/milli/src/search/distinct/noop_distinct.rs @@ -0,0 +1,36 @@ +use roaring::RoaringBitmap; + +use crate::DocumentId; +use super::{DocIter, Distinct}; + +pub struct NoopDistinct; + +pub struct NoopDistinctIter { + candidates: roaring::bitmap::IntoIter, + excluded: RoaringBitmap, +} + +impl Iterator for NoopDistinctIter { + type Item = anyhow::Result; + + fn next(&mut self) -> Option { + self.candidates.next().map(Result::Ok) + } +} + +impl DocIter for NoopDistinctIter { + fn into_excluded(self) -> RoaringBitmap { + self.excluded + } +} + +impl Distinct<'_> for NoopDistinct { + type Iter = NoopDistinctIter; + + fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { + NoopDistinctIter { + candidates: candidates.into_iter(), + excluded, + } + } +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index a8cde213b..2c55330a7 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,22 +11,24 @@ use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::search::criteria::fetcher::FetcherResult; +use crate::search::criteria::fetcher::{FetcherResult, Fetcher}; use crate::{Index, DocumentId}; +use distinct::{MapDistinct, FacetDistinct, Distinct, DocIter, NoopDistinct}; +use self::query_tree::QueryTreeBuilder; pub use self::facet::FacetIter; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::query_tree::MatchingWords; -use self::query_tree::QueryTreeBuilder; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); +mod criteria; +mod distinct; mod facet; mod query_tree; -mod criteria; pub struct Search<'a> { query: Option, @@ -123,33 +125,60 @@ impl<'a> Search<'a> { }; let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let mut criteria = criteria_builder.build(query_tree, facet_candidates)?; + let criteria = criteria_builder.build(query_tree, facet_candidates)?; + + match self.index.distinct_attribute(self.rtxn)? { + None => self.perform_sort(NoopDistinct, matching_words, criteria), + Some(name) => { + let field_ids_map = self.index.fields_ids_map(self.rtxn)?; + let id = field_ids_map.id(name).expect("distinct not present in field map"); + let faceted_fields = self.index.faceted_fields(self.rtxn)?; + match faceted_fields.get(name) { + Some(facet_type) => { + let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type); + self.perform_sort(distinct, matching_words, criteria) + } + None => { + let distinct = MapDistinct::new(id, self.index, self.rtxn); + self.perform_sort(distinct, matching_words, criteria) + } + } + } + } + } + + fn perform_sort( + &self, + mut distinct: impl for<'c> Distinct<'c>, + matching_words: MatchingWords, + mut criteria: Fetcher, + ) -> anyhow::Result { let mut offset = self.offset; - let mut limit = self.limit; - let mut documents_ids = Vec::new(); let mut initial_candidates = RoaringBitmap::new(); - while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { + let mut excluded_documents = RoaringBitmap::new(); + let mut documents_ids = Vec::with_capacity(self.limit); + + while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_documents)? { debug!("Number of candidates found {}", candidates.len()); - let mut len = candidates.len() as usize; - let mut candidates = candidates.into_iter(); + let excluded = std::mem::take(&mut excluded_documents); + + let mut candidates = distinct.distinct(candidates, excluded); initial_candidates.union_with(&bucket_candidates); if offset != 0 { - candidates.by_ref().take(offset).for_each(drop); - offset = offset.saturating_sub(len.min(offset)); - len = len.saturating_sub(len.min(offset)); + let discarded = candidates.by_ref().take(offset).count(); + offset = offset.saturating_sub(discarded); } - if len != 0 { - documents_ids.extend(candidates.take(limit)); - limit = limit.saturating_sub(len.min(limit)); + for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { + documents_ids.push(candidate?); } - - if limit == 0 { break } + if documents_ids.len() == self.limit { break } + excluded_documents = candidates.into_excluded(); } Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index a858aa1a9..e63948082 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -70,6 +70,7 @@ pub struct Settings<'a, 't, 'u, 'i> { faceted_fields: Setting>, criteria: Setting>, stop_words: Setting>, + distinct_attribute: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -94,6 +95,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { faceted_fields: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, + distinct_attribute: Setting::NotSet, update_id, } } @@ -142,6 +144,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn set_distinct_attribute(&mut self, distinct_attribute: String) { + self.distinct_attribute = Setting::Set(distinct_attribute); + } + + pub fn reset_distinct_attribute(&mut self) { + self.distinct_attribute = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync @@ -220,6 +230,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } + fn update_distinct_attribute(&mut self) -> anyhow::Result { + match self.distinct_attribute { + Setting::Set(ref attr) => { + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + fields_ids_map + .insert(attr) + .context("field id limit exceeded")?; + + self.index.put_distinct_attribute(self.wtxn, &attr)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + } + Setting::Reset => { self.index.delete_distinct_attribute(self.wtxn)?; }, + Setting::NotSet => return Ok(false), + } + Ok(true) + } + /// Updates the index's searchable attributes. This causes the field map to be recomputed to /// reflect the order of the searchable attributes. fn update_searchable(&mut self) -> anyhow::Result { @@ -328,6 +355,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_displayed()?; let stop_words_updated = self.update_stop_words()?; let facets_updated = self.update_facets()?; + self.update_distinct_attribute()?; // update_criteria MUST be called after update_facets, since criterion fields must be set // as facets. self.update_criteria()?; From 2f73fa55ae7f9145f9dbbd2b2c83d2cee6b8e76d Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 14 Apr 2021 12:00:45 +0200 Subject: [PATCH 2/4] add documentation --- milli/src/index.rs | 25 ++++++++++++--------- milli/src/search/distinct/facet_distinct.rs | 15 +++++++++++-- milli/src/search/distinct/map_distinct.rs | 3 +++ milli/src/search/distinct/mod.rs | 8 ++++++- milli/src/search/distinct/noop_distinct.rs | 6 +++-- 5 files changed, 41 insertions(+), 16 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 59f966b95..a2b6cc440 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -343,6 +343,20 @@ impl Index { } } + /* Distinct attribute */ + + pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { + self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) + } + + pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { + self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) + } + + pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) + } + /* criteria */ pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { @@ -462,17 +476,6 @@ impl Index { self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) } - pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { - self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) - } - - pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) - } - - pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) - } } #[cfg(test)] diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index cecb8ba4b..053bbd705 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,6 +6,12 @@ use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; use super::{Distinct, DocIter}; +/// A distinct implementer that is backed by facets. On each iteration, the facet values for the +/// distinct attribute of the first document are retrieved. The document ids for these facet values +/// are then retrieved and taken out of the the candidate and added to the excluded set. We take +/// care to keep the document we are currently on, and remove it from the excluded list. The next +/// iterations will never contain any occurence of a document with the same distinct value as a +/// document from previous iterations. pub struct FacetDistinct<'a> { distinct: FieldId, index: &'a Index, @@ -114,6 +120,9 @@ impl<'a> FacetDistinctIter<'a> { Ok(()) } + /// Performs the next iteration of the facet distinct. This is a convenience method that is + /// called by the Iterator::next implementation that tranposes the result. It makes error + /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { // The first step is to remove all the excluded documents from our candidates self.candidates.difference_with(&self.excluded); @@ -127,8 +136,10 @@ impl<'a> FacetDistinctIter<'a> { FacetType::Float => self.distinct_float(id)?, }; - // On every iteration, the first document is always a distinct one, since it - // hasn't been discarded by the previous difference. + // The first document of each iteration is kept, since the next call to + // `difference_with` will filter out all the documents for that facet value. By + // increasing the offset we make sure to get the first valid value for the next + // distinct document to keep. self.iter_offset += 1; Ok(Some(id)) } diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index 411d63c87..37e52aa6f 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -6,6 +6,9 @@ use serde_json::Value; use super::{Distinct, DocIter}; use crate::{DocumentId, FieldId, Index}; +/// A distinct implementer that is backed by an `HashMap`. Each time a document is seen, the value +/// for its distinct field is added to the map. If the map already contains an entry for this +/// value, then the document is filtered out, and is added to the excluded set. pub struct MapDistinct<'a> { distinct: FieldId, map: HashMap, diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 0f24e2d03..5f2e260e4 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -9,11 +9,17 @@ pub use map_distinct::MapDistinct; pub use noop_distinct::NoopDistinct; use crate::DocumentId; +/// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. +/// It provides a way to get back the ownership to the excluded set. pub trait DocIter: Iterator> { - /// Returns ownership on the internal RoaringBitmaps: (candidates, excluded) + /// Returns ownership on the internal exluded set. fn into_excluded(self) -> RoaringBitmap; } +/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct +/// must return an iterator containing only distinct documents, and add the discarded documents to +/// the excluded set. The excluded set can later be retrieved by calling `DocIter::excluded` on the +/// returned iterator. pub trait Distinct<'a> { type Iter: DocIter; diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 6484f4d64..9fdf17187 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -1,12 +1,14 @@ -use roaring::RoaringBitmap; +use roaring::{RoaringBitmap, bitmap::IntoIter}; use crate::DocumentId; use super::{DocIter, Distinct}; +/// A distinct implementer that does not perform any distinct, and simply returns an iterator to +/// the candidates. pub struct NoopDistinct; pub struct NoopDistinctIter { - candidates: roaring::bitmap::IntoIter, + candidates: IntoIter, excluded: RoaringBitmap, } From 75464a1baadc86a8549a48a2ee3fff3f79588d30 Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 14 Apr 2021 12:18:13 +0200 Subject: [PATCH 3/4] review fixes --- milli/src/index.rs | 1 - milli/src/search/criteria/asc_desc.rs | 9 ++- milli/src/search/criteria/fetcher.rs | 10 +--- milli/src/search/criteria/mod.rs | 7 +-- milli/src/search/criteria/proximity.rs | 15 +++-- milli/src/search/criteria/typo.rs | 66 ++++++---------------- milli/src/search/criteria/words.rs | 9 ++- milli/src/search/distinct/map_distinct.rs | 27 ++++----- milli/src/search/distinct/noop_distinct.rs | 2 +- milli/src/search/mod.rs | 5 +- 10 files changed, 50 insertions(+), 101 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index a2b6cc440..643a9ffb9 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -475,7 +475,6 @@ impl Index { pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) } - } #[cfg(test)] diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index ddd25009d..78ae540e4 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -17,7 +17,7 @@ use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use crate::{FieldsIdsMap, FieldId, Index}; -use super::{Criterion, CriterionResult, CriterionContext}; +use super::{Criterion, CriterionResult}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. @@ -151,7 +151,7 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, context: CriterionContext) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { loop { debug!("Facet {}({}) iteration", if self.ascending { "Asc" } else { "Desc" }, self.field_name @@ -163,8 +163,7 @@ impl<'t> Criterion for AscDesc<'t> { let bucket_candidates = take(&mut self.bucket_candidates); match self.parent.as_mut() { Some(parent) => { - let CriterionContext { word_cache, exclude } = context; - match parent.next(CriterionContext { exclude, word_cache })? { + match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; let candidates = match (&self.query_tree, candidates) { @@ -174,7 +173,7 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), word_cache)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; candidates.intersect_with(&self.faceted_candidates); candidates }, diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index dcd40e43d..fa204bdf2 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context, CriterionContext}; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; /// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] @@ -61,7 +61,7 @@ impl<'t> Fetcher<'t> { } #[logging_timer::time("Fetcher::{}")] - pub fn next(&mut self, exclude: &RoaringBitmap) -> anyhow::Result> { + pub fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", @@ -90,11 +90,7 @@ impl<'t> Fetcher<'t> { Forbidden(_) => { match self.parent.as_mut() { Some(parent) => { - let context = CriterionContext { - word_cache: &mut self.wdcache, - exclude - }; - match parent.next(context)? { + match parent.next(&mut self.wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5e25001a2..22f081871 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -20,13 +20,8 @@ mod asc_desc; mod proximity; pub mod fetcher; -pub struct CriterionContext<'a, 'b> { - exclude: &'a RoaringBitmap, - word_cache: &'b mut WordDerivationsCache, -} - pub trait Criterion { - fn next(&mut self, wdcache: CriterionContext) -> anyhow::Result>; + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; } /// The result of a call to the parent criterion. diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 45cffb93d..b62eb8cfd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,7 +8,7 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree, CriterionContext}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; pub struct Proximity<'t> { ctx: &'t dyn Context, @@ -56,9 +56,8 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] - fn next(&mut self, context: CriterionContext) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; - let CriterionContext { word_cache, exclude } = context; loop { debug!("Proximity at iteration {} (max {:?}) ({:?})", self.proximity, @@ -99,7 +98,7 @@ impl<'t> Criterion for Proximity<'t> { self.ctx, query_tree, candidates, - word_cache, + wdcache, )?; self.plane_sweep_cache = Some(cache.into_iter()); @@ -111,7 +110,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - word_cache, + wdcache, )? }; @@ -141,7 +140,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - word_cache, + wdcache, )?; new_candidates.difference_with(&candidates); @@ -171,11 +170,11 @@ impl<'t> Criterion for Proximity<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(CriterionContext { exclude, word_cache })? { + match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), word_cache)?, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, (None, None) => RoaringBitmap::new(), }; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 1c3942495..3877f53ed 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, CriterionContext}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Typo<'t> { ctx: &'t dyn Context, @@ -51,9 +51,8 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { #[logging_timer::time("Typo::{}")] - fn next(&mut self, context: CriterionContext) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; - let CriterionContext { word_cache, exclude } = context; loop { debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); @@ -72,9 +71,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -85,7 +84,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - word_cache, + wdcache, )?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); @@ -110,9 +109,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -123,7 +122,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - word_cache, + wdcache, )?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); @@ -148,7 +147,7 @@ impl<'t> Criterion for Typo<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(CriterionContext { exclude, word_cache })? { + match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; @@ -347,12 +346,8 @@ mod test { let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, query_tree, facet_candidates); - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - assert!(criteria.next(sort_context).unwrap().is_none()); + assert!(criteria.next(&mut wdcache).unwrap().is_none()); } #[test] @@ -386,12 +381,7 @@ mod test { bucket_candidates: candidates_1, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -413,12 +403,7 @@ mod test { bucket_candidates: candidates_2, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); } #[test] @@ -436,19 +421,11 @@ mod test { bucket_candidates: facet_candidates, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; // first iteration, returns the facet candidates - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected)); - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; // second iteration, returns None because there is no more things to do - assert!(criteria.next(sort_context ).unwrap().is_none()); + assert!(criteria.next(&mut wdcache).unwrap().is_none()); } #[test] @@ -482,12 +459,7 @@ mod test { bucket_candidates: candidates_1 & &facet_candidates, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -509,12 +481,6 @@ mod test { bucket_candidates: candidates_2 & &facet_candidates, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); } - } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index b401f99fa..0aa3b483a 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -5,7 +5,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use super::{resolve_query_tree, Criterion, CriterionResult, Context, CriterionContext}; +use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -47,8 +47,7 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] - fn next(&mut self, context: CriterionContext) -> anyhow::Result> { - let CriterionContext { word_cache, exclude } = context; + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); @@ -62,7 +61,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Some(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, word_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -100,7 +99,7 @@ impl<'t> Criterion for Words<'t> { (None, None) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(CriterionContext { word_cache, exclude })? { + match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); self.candidates = candidates; diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index 37e52aa6f..f2e31bce4 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -18,10 +18,9 @@ pub struct MapDistinct<'a> { impl<'a> MapDistinct<'a> { pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { - let map = HashMap::new(); Self { distinct, - map, + map: HashMap::new(), index, txn, } @@ -38,6 +37,9 @@ pub struct MapDistinctIter<'a, 'b> { } impl<'a, 'b> MapDistinctIter<'a, 'b> { + /// Performs the next iteration of the mafacetp distinct. This is a convenience method that is + /// called by the Iterator::next implementation that tranposes the result. It makes error + /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { let map = &mut self.map; let mut filter = |value: Value| { @@ -54,22 +56,15 @@ impl<'a, 'b> MapDistinctIter<'a, 'b> { .transpose()?; let accept = match value { - Some(value) => { - match value { - // Since we can't distinct these values, we always accept them - Value::Null | Value::Object(_) => true, - Value::Array(values) => { - let mut accept = true; - for value in values { - accept &= filter(value); - } - accept - } - value => filter(value), + Some(Value::Array(values)) => { + let mut accept = true; + for value in values { + accept &= filter(value); } + accept } - // Accept values by default. - _ => true, + Some(Value::Null) | Some(Value::Object(_)) | None => true, + Some(value) => filter(value), }; if accept { diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 9fdf17187..8f7bc7d17 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -16,7 +16,7 @@ impl Iterator for NoopDistinctIter { type Item = anyhow::Result; fn next(&mut self) -> Option { - self.candidates.next().map(Result::Ok) + self.candidates.next().map(Ok) } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2c55330a7..7324ea72a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::collections::hash_map::{HashMap, Entry}; use std::fmt; +use std::mem::take; use std::str::Utf8Error; use std::time::Instant; @@ -159,11 +160,11 @@ impl<'a> Search<'a> { let mut excluded_documents = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_documents)? { + while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { debug!("Number of candidates found {}", candidates.len()); - let excluded = std::mem::take(&mut excluded_documents); + let excluded = take(&mut excluded_documents); let mut candidates = distinct.distinct(candidates, excluded); From 9c4660d3d6357c90b730b17ad6502cc4080d354c Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Thu, 15 Apr 2021 15:29:37 +0200 Subject: [PATCH 4/4] add tests --- milli/src/index.rs | 33 +++++- milli/src/search/distinct/facet_distinct.rs | 39 ++++++- milli/src/search/distinct/map_distinct.rs | 35 +++++- milli/src/search/distinct/mod.rs | 121 +++++++++++++++++++- milli/src/search/distinct/noop_distinct.rs | 23 +++- 5 files changed, 242 insertions(+), 9 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 643a9ffb9..7be618789 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -478,13 +478,44 @@ impl Index { } #[cfg(test)] -mod tests { +pub(crate) mod tests { + use std::ops::Deref; + use heed::EnvOpenOptions; use maplit::hashmap; + use tempfile::TempDir; use crate::Index; use crate::update::{IndexDocuments, UpdateFormat}; + pub(crate) struct TempIndex { + inner: Index, + _tempdir: TempDir, + } + + impl Deref for TempIndex { + type Target = Index; + + fn deref(&self) -> &Self::Target { + &self.inner + } + } + + impl TempIndex { + /// Creates a temporary index, with a default `4096 * 100` size. This should be enough for + /// most tests. + pub fn new() -> Self { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 4096); + let _tempdir = TempDir::new_in(".").unwrap(); + let inner = Index::new(options, _tempdir.path()).unwrap(); + Self { + inner, + _tempdir + } + } + } + #[test] fn initial_fields_distribution() { let path = tempfile::tempdir().unwrap(); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 053bbd705..e97f8b922 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,7 +6,9 @@ use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; use super::{Distinct, DocIter}; -/// A distinct implementer that is backed by facets. On each iteration, the facet values for the +/// A distinct implementer that is backed by facets. +/// +/// On each iteration, the facet values for the /// distinct attribute of the first document are retrieved. The document ids for these facet values /// are then retrieved and taken out of the the candidate and added to the excluded set. We take /// care to keep the document we are currently on, and remove it from the excluded list. The next @@ -121,7 +123,7 @@ impl<'a> FacetDistinctIter<'a> { } /// Performs the next iteration of the facet distinct. This is a convenience method that is - /// called by the Iterator::next implementation that tranposes the result. It makes error + /// called by the Iterator::next implementation that transposes the result. It makes error /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { // The first step is to remove all the excluded documents from our candidates @@ -201,3 +203,36 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { } } } + +#[cfg(test)] +mod test { + use std::collections::HashMap; + + use super::*; + use super::super::test::{generate_index, validate_distinct_candidates}; + use crate::facet::FacetType; + + macro_rules! test_facet_distinct { + ($name:ident, $distinct:literal, $facet_type:expr) => { + #[test] + fn $name() { + use std::iter::FromIterator; + + let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let (index, fid, candidates) = generate_index($distinct, facets); + let txn = index.read_txn().unwrap(); + let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); + let excluded = RoaringBitmap::new(); + let mut iter = map_distinct.distinct(candidates.clone(), excluded); + let count = validate_distinct_candidates(iter.by_ref(), fid, &index); + let excluded = iter.into_excluded(); + assert_eq!(count as u64 + excluded.len(), candidates.len()); + } + }; + } + + test_facet_distinct!(test_string, "txt", FacetType::String); + test_facet_distinct!(test_strings, "txts", FacetType::String); + test_facet_distinct!(test_int, "cat-int", FacetType::Integer); + test_facet_distinct!(test_ints, "cat-ints", FacetType::Integer); +} diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index f2e31bce4..4c01d1ded 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -6,7 +6,9 @@ use serde_json::Value; use super::{Distinct, DocIter}; use crate::{DocumentId, FieldId, Index}; -/// A distinct implementer that is backed by an `HashMap`. Each time a document is seen, the value +/// A distinct implementer that is backed by an `HashMap`. +/// +/// Each time a document is seen, the value /// for its distinct field is added to the map. If the map already contains an entry for this /// value, then the document is filtered out, and is added to the excluded set. pub struct MapDistinct<'a> { @@ -38,7 +40,7 @@ pub struct MapDistinctIter<'a, 'b> { impl<'a, 'b> MapDistinctIter<'a, 'b> { /// Performs the next iteration of the mafacetp distinct. This is a convenience method that is - /// called by the Iterator::next implementation that tranposes the result. It makes error + /// called by the Iterator::next implementation that transposes the result. It makes error /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { let map = &mut self.map; @@ -105,3 +107,32 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { } } } + +#[cfg(test)] +mod test { + use std::collections::HashMap; + + use super::*; + use super::super::test::{generate_index, validate_distinct_candidates}; + + macro_rules! test_map_distinct { + ($name:ident, $distinct:literal) => { + #[test] + fn $name() { + let (index, fid, candidates) = generate_index($distinct, HashMap::new()); + let txn = index.read_txn().unwrap(); + let mut map_distinct = MapDistinct::new(fid, &index, &txn); + let excluded = RoaringBitmap::new(); + let mut iter = map_distinct.distinct(candidates.clone(), excluded); + let count = validate_distinct_candidates(iter.by_ref(), fid, &index); + let excluded = iter.into_excluded(); + assert_eq!(count as u64 + excluded.len(), candidates.len()); + } + }; + } + + test_map_distinct!(test_string, "txt"); + test_map_distinct!(test_strings, "txts"); + test_map_distinct!(test_int, "cat-int"); + test_map_distinct!(test_ints, "cat-ints"); +} diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 5f2e260e4..776f0d2b3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -4,14 +4,14 @@ mod noop_distinct; use roaring::RoaringBitmap; +use crate::DocumentId; pub use facet_distinct::FacetDistinct; pub use map_distinct::MapDistinct; pub use noop_distinct::NoopDistinct; -use crate::DocumentId; /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. /// It provides a way to get back the ownership to the excluded set. -pub trait DocIter: Iterator> { +pub trait DocIter: Iterator> { /// Returns ownership on the internal exluded set. fn into_excluded(self) -> RoaringBitmap; } @@ -25,3 +25,120 @@ pub trait Distinct<'a> { fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; } + +#[cfg(test)] +mod test { + use std::collections::{HashMap, HashSet}; + + use once_cell::sync::Lazy; + use rand::{seq::SliceRandom, Rng}; + use roaring::RoaringBitmap; + use serde_json::{json, Value}; + + use crate::index::{Index, tests::TempIndex}; + use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; + use crate::{BEU32, FieldId, DocumentId}; + + static JSON: Lazy = Lazy::new(generate_json); + + fn generate_json() -> Value { + let mut rng = rand::thread_rng(); + let num_docs = rng.gen_range(10..30); + + let mut documents = Vec::new(); + + let txts = ["toto", "titi", "tata"]; + let cats = (1..10).map(|i| i.to_string()).collect::>(); + let cat_ints = (1..10).collect::>(); + + for i in 0..num_docs { + let txt = txts.choose(&mut rng).unwrap(); + let mut sample_txts = cats.clone(); + sample_txts.shuffle(&mut rng); + + let mut sample_ints = cat_ints.clone(); + sample_ints.shuffle(&mut rng); + + let doc = json!({ + "id": i, + "txt": txt, + "cat-int": rng.gen_range(0..3), + "txts": sample_txts[..(rng.gen_range(0..3))], + "cat-ints": sample_ints[..(rng.gen_range(0..3))], + }); + documents.push(doc); + } + + Value::Array(documents) + } + + /// Returns a temporary index populated with random test documents, the FieldId for the + /// distinct attribute, and the RoaringBitmap with the document ids. + pub(crate) fn generate_index(distinct: &str, facets: HashMap) -> (TempIndex, FieldId, RoaringBitmap) { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + // set distinct and faceted attributes for the index. + let builder = UpdateBuilder::new(0); + let mut update = builder.settings(&mut txn, &index); + update.set_distinct_attribute(distinct.to_string()); + if !facets.is_empty() { + update.set_faceted_fields(facets) + } + update.execute(|_, _| ()).unwrap(); + + // add documents to the index + let builder = UpdateBuilder::new(1); + let mut addition = builder.index_documents(&mut txn, &index); + + addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + addition.update_format(UpdateFormat::Json); + + addition + .execute(JSON.to_string().as_bytes(), |_, _| ()) + .unwrap(); + + let fields_map = index.fields_ids_map(&txn).unwrap(); + let fid = fields_map.id(&distinct).unwrap(); + + let map = (0..JSON.as_array().unwrap().len() as u32).collect(); + + txn.commit().unwrap(); + + (index, fid, map) + } + + + /// Checks that all the candidates are distinct, and returns the candidates number. + pub(crate) fn validate_distinct_candidates( + candidates: impl Iterator>, + distinct: FieldId, + index: &Index, + ) -> usize { + fn test(seen: &mut HashSet, value: &Value) { + match value { + Value::Null | Value::Object(_) | Value::Bool(_) => (), + Value::Number(_) | Value::String(_) => { + let s = value.to_string(); + assert!(seen.insert(s)); + } + Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))} + } + } + + let mut seen = HashSet::::new(); + + let txn = index.read_txn().unwrap(); + let mut count = 0; + for candidate in candidates { + count += 1; + let candidate = candidate.unwrap(); + let id = BEU32::new(candidate); + let document = index.documents.get(&txn, &id).unwrap().unwrap(); + let value = document.get(distinct).unwrap(); + let value = serde_json::from_slice(value).unwrap(); + test(&mut seen, &value); + } + count + } +} diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 8f7bc7d17..3de9be631 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -3,8 +3,8 @@ use roaring::{RoaringBitmap, bitmap::IntoIter}; use crate::DocumentId; use super::{DocIter, Distinct}; -/// A distinct implementer that does not perform any distinct, and simply returns an iterator to -/// the candidates. +/// A distinct implementer that does not perform any distinct, +/// and simply returns an iterator to the candidates. pub struct NoopDistinct; pub struct NoopDistinctIter { @@ -36,3 +36,22 @@ impl Distinct<'_> for NoopDistinct { } } } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_noop() { + let candidates = (1..10).collect(); + let excluded = RoaringBitmap::new(); + let mut iter = NoopDistinct.distinct(candidates, excluded); + assert_eq!( + iter.by_ref().map(Result::unwrap).collect::>(), + (1..10).collect::>() + ); + + let excluded = iter.into_excluded(); + assert!(excluded.is_empty()); + } +}