2021-04-07 12:38:48 +02:00
|
|
|
mod facet_distinct;
|
|
|
|
mod noop_distinct;
|
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
pub use facet_distinct::FacetDistinct;
|
|
|
|
pub use noop_distinct::NoopDistinct;
|
2021-04-07 12:38:48 +02:00
|
|
|
use roaring::RoaringBitmap;
|
|
|
|
|
2021-06-14 16:46:19 +02:00
|
|
|
use crate::{DocumentId, Result};
|
2021-04-07 12:38:48 +02:00
|
|
|
|
2021-04-14 12:00:45 +02:00
|
|
|
/// A trait implemented by document iterators that are returned by calls to `Distinct::distinct`.
|
|
|
|
/// It provides a way to take back ownership of the excluded set.
|
2021-06-14 16:46:19 +02:00
|
|
|
pub trait DocIter: Iterator<Item = Result<DocumentId>> {
|
2021-04-14 12:00:45 +02:00
|
|
|
/// Consumes the iterator and returns ownership of its internal excluded set.
|
2021-04-07 12:38:48 +02:00
|
|
|
fn into_excluded(self) -> RoaringBitmap;
|
|
|
|
}
|
|
|
|
|
2021-04-14 12:00:45 +02:00
|
|
|
/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct
|
|
|
|
/// must return an iterator containing only distinct documents, and add the discarded documents to
|
|
|
|
/// the excluded set. The excluded set can later be retrieved by calling
|
|
|
|
/// `DocIter::into_excluded` on the returned iterator.
|
2021-06-01 14:43:48 +02:00
|
|
|
pub trait Distinct {
|
2021-04-07 12:38:48 +02:00
|
|
|
/// The iterator type returned by `distinct`.
type Iter: DocIter;
|
|
|
|
|
2021-06-01 14:43:48 +02:00
|
|
|
/// Takes ownership of the `candidates` and `excluded` bitmaps and returns the iterator over
/// the distinct documents.
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter;
|
2021-04-07 12:38:48 +02:00
|
|
|
}
|
2021-04-15 15:29:37 +02:00
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
mod test {
|
2021-05-03 15:58:47 +02:00
|
|
|
use std::collections::HashSet;
|
2021-08-31 11:44:15 +02:00
|
|
|
use std::io::Cursor;
|
2021-04-15 15:29:37 +02:00
|
|
|
|
|
|
|
use once_cell::sync::Lazy;
|
2021-06-16 18:33:33 +02:00
|
|
|
use rand::seq::SliceRandom;
|
|
|
|
use rand::Rng;
|
2021-04-15 15:29:37 +02:00
|
|
|
use roaring::RoaringBitmap;
|
|
|
|
use serde_json::{json, Value};
|
|
|
|
|
2021-08-31 11:44:15 +02:00
|
|
|
use crate::documents::{DocumentBatchBuilder, DocumentBatchReader};
|
2021-06-16 18:33:33 +02:00
|
|
|
use crate::index::tests::TempIndex;
|
|
|
|
use crate::index::Index;
|
2021-12-08 14:12:07 +01:00
|
|
|
use crate::update::{
|
|
|
|
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
|
|
|
};
|
2021-06-16 18:33:33 +02:00
|
|
|
use crate::{DocumentId, FieldId, BEU32};
|
2021-04-15 15:29:37 +02:00
|
|
|
|
2021-08-31 11:44:15 +02:00
|
|
|
static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents);
|
2021-04-15 15:29:37 +02:00
|
|
|
|
2021-08-31 11:44:15 +02:00
|
|
|
fn generate_documents() -> Vec<u8> {
|
2021-04-15 15:29:37 +02:00
|
|
|
let mut rng = rand::thread_rng();
|
|
|
|
let num_docs = rng.gen_range(10..30);
|
|
|
|
|
2021-08-31 11:44:15 +02:00
|
|
|
let mut cursor = Cursor::new(Vec::new());
|
|
|
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
2021-07-17 12:50:01 +02:00
|
|
|
let txts = ["Toto", "Titi", "Tata"];
|
2021-04-15 15:29:37 +02:00
|
|
|
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
|
|
|
|
let cat_ints = (1..10).collect::<Vec<_>>();
|
|
|
|
|
|
|
|
for i in 0..num_docs {
|
|
|
|
let txt = txts.choose(&mut rng).unwrap();
|
|
|
|
let mut sample_txts = cats.clone();
|
|
|
|
sample_txts.shuffle(&mut rng);
|
|
|
|
|
|
|
|
let mut sample_ints = cat_ints.clone();
|
|
|
|
sample_ints.shuffle(&mut rng);
|
|
|
|
|
|
|
|
let doc = json!({
|
|
|
|
"id": i,
|
|
|
|
"txt": txt,
|
|
|
|
"cat-int": rng.gen_range(0..3),
|
|
|
|
"txts": sample_txts[..(rng.gen_range(0..3))],
|
|
|
|
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
|
|
|
|
});
|
2021-10-24 14:41:36 +02:00
|
|
|
|
|
|
|
let doc = Cursor::new(serde_json::to_vec(&doc).unwrap());
|
|
|
|
builder.extend_from_json(doc).unwrap();
|
2021-04-15 15:29:37 +02:00
|
|
|
}
|
|
|
|
|
2021-08-31 11:44:15 +02:00
|
|
|
builder.finish().unwrap();
|
|
|
|
cursor.into_inner()
|
2021-04-15 15:29:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns a temporary index populated with random test documents, the FieldId for the
|
|
|
|
/// distinct attribute, and the RoaringBitmap with the document ids.
|
2021-06-01 12:32:03 +02:00
|
|
|
pub(crate) fn generate_index(distinct: &str) -> (TempIndex, FieldId, RoaringBitmap) {
|
2021-04-15 15:29:37 +02:00
|
|
|
let index = TempIndex::new();
|
|
|
|
let mut txn = index.write_txn().unwrap();
|
|
|
|
|
|
|
|
// set distinct and faceted attributes for the index.
|
2021-12-08 14:12:07 +01:00
|
|
|
let config = IndexerConfig::default();
|
|
|
|
let mut update = Settings::new(&mut txn, &index, &config);
|
2021-06-01 16:29:14 +02:00
|
|
|
update.set_distinct_field(distinct.to_string());
|
2021-11-03 13:12:01 +01:00
|
|
|
update.execute(|_| ()).unwrap();
|
2021-04-15 15:29:37 +02:00
|
|
|
|
|
|
|
// add documents to the index
|
2021-12-08 14:12:07 +01:00
|
|
|
let config = IndexerConfig::default();
|
|
|
|
let indexing_config = IndexDocumentsConfig {
|
|
|
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
|
|
|
..Default::default()
|
|
|
|
};
|
2022-03-23 17:28:41 +01:00
|
|
|
let mut addition =
|
|
|
|
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
|
2021-04-15 15:29:37 +02:00
|
|
|
|
2021-08-31 11:44:15 +02:00
|
|
|
let reader =
|
|
|
|
crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
|
2021-12-08 14:12:07 +01:00
|
|
|
|
|
|
|
addition.add_documents(reader).unwrap();
|
|
|
|
addition.execute().unwrap();
|
2021-04-15 15:29:37 +02:00
|
|
|
|
|
|
|
let fields_map = index.fields_ids_map(&txn).unwrap();
|
|
|
|
let fid = fields_map.id(&distinct).unwrap();
|
|
|
|
|
2021-08-31 11:44:15 +02:00
|
|
|
let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
|
|
|
|
let map = (0..documents.len() as u32).collect();
|
2021-04-15 15:29:37 +02:00
|
|
|
|
|
|
|
txn.commit().unwrap();
|
|
|
|
|
|
|
|
(index, fid, map)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Checks that all the candidates are distinct, and returns the candidates number.
|
|
|
|
pub(crate) fn validate_distinct_candidates(
|
2021-06-14 16:46:19 +02:00
|
|
|
candidates: impl Iterator<Item = crate::Result<DocumentId>>,
|
2021-04-15 15:29:37 +02:00
|
|
|
distinct: FieldId,
|
|
|
|
index: &Index,
|
2021-06-16 18:33:33 +02:00
|
|
|
) -> usize {
|
2021-04-15 15:29:37 +02:00
|
|
|
fn test(seen: &mut HashSet<String>, value: &Value) {
|
|
|
|
match value {
|
|
|
|
Value::Null | Value::Object(_) | Value::Bool(_) => (),
|
|
|
|
Value::Number(_) | Value::String(_) => {
|
|
|
|
let s = value.to_string();
|
|
|
|
assert!(seen.insert(s));
|
|
|
|
}
|
2021-06-16 18:33:33 +02:00
|
|
|
Value::Array(values) => values.into_iter().for_each(|value| test(seen, value)),
|
2021-04-15 15:29:37 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut seen = HashSet::<String>::new();
|
|
|
|
|
|
|
|
let txn = index.read_txn().unwrap();
|
|
|
|
let mut count = 0;
|
|
|
|
for candidate in candidates {
|
|
|
|
count += 1;
|
|
|
|
let candidate = candidate.unwrap();
|
|
|
|
let id = BEU32::new(candidate);
|
|
|
|
let document = index.documents.get(&txn, &id).unwrap().unwrap();
|
|
|
|
let value = document.get(distinct).unwrap();
|
|
|
|
let value = serde_json::from_slice(value).unwrap();
|
|
|
|
test(&mut seen, &value);
|
|
|
|
}
|
|
|
|
count
|
|
|
|
}
|
|
|
|
}
|