mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-27 07:14:26 +01:00
Merge pull request #125 from meilisearch/distinct
Implement distinct attribute
This commit is contained in:
commit
19b6620a92
@ -19,6 +19,7 @@ use crate::{
|
|||||||
|
|
||||||
pub const CRITERIA_KEY: &str = "criteria";
|
pub const CRITERIA_KEY: &str = "criteria";
|
||||||
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
|
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
|
||||||
|
pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key";
|
||||||
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
|
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
|
||||||
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
|
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
|
||||||
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
|
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
|
||||||
@ -342,6 +343,20 @@ impl Index {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Distinct attribute */
|
||||||
|
|
||||||
|
pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> {
|
||||||
|
self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result<Option<&'a str>> {
|
||||||
|
self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
|
||||||
|
self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY)
|
||||||
|
}
|
||||||
|
|
||||||
/* criteria */
|
/* criteria */
|
||||||
|
|
||||||
pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> {
|
pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> {
|
||||||
@ -463,13 +478,44 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
pub(crate) mod tests {
|
||||||
|
use std::ops::Deref;
|
||||||
|
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use maplit::hashmap;
|
use maplit::hashmap;
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
use crate::update::{IndexDocuments, UpdateFormat};
|
use crate::update::{IndexDocuments, UpdateFormat};
|
||||||
|
|
||||||
|
pub(crate) struct TempIndex {
|
||||||
|
inner: Index,
|
||||||
|
_tempdir: TempDir,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Deref for TempIndex {
|
||||||
|
type Target = Index;
|
||||||
|
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
&self.inner
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TempIndex {
|
||||||
|
/// Creates a temporary index, with a default `4096 * 100` size. This should be enough for
|
||||||
|
/// most tests.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(100 * 4096);
|
||||||
|
let _tempdir = TempDir::new_in(".").unwrap();
|
||||||
|
let inner = Index::new(options, _tempdir.path()).unwrap();
|
||||||
|
Self {
|
||||||
|
inner,
|
||||||
|
_tempdir
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn initial_fields_distribution() {
|
fn initial_fields_distribution() {
|
||||||
let path = tempfile::tempdir().unwrap();
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
@ -483,5 +483,4 @@ mod test {
|
|||||||
|
|
||||||
assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2));
|
assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,8 +5,7 @@ use log::debug;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::search::query_tree::Operation;
|
use crate::search::query_tree::Operation;
|
||||||
use crate::search::WordDerivationsCache;
|
use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache};
|
||||||
use super::{resolve_query_tree, Criterion, CriterionResult, Context};
|
|
||||||
|
|
||||||
pub struct Words<'t> {
|
pub struct Words<'t> {
|
||||||
ctx: &'t dyn Context,
|
ctx: &'t dyn Context,
|
||||||
|
238
milli/src/search/distinct/facet_distinct.rs
Normal file
238
milli/src/search/distinct/facet_distinct.rs
Normal file
@ -0,0 +1,238 @@
|
|||||||
|
use std::mem::size_of;
|
||||||
|
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::heed_codec::facet::*;
|
||||||
|
use crate::{facet::FacetType, DocumentId, FieldId, Index};
|
||||||
|
use super::{Distinct, DocIter};
|
||||||
|
|
||||||
|
/// A distinct implementer that is backed by facets.
|
||||||
|
///
|
||||||
|
/// On each iteration, the facet values for the
|
||||||
|
/// distinct attribute of the first document are retrieved. The document ids for these facet values
|
||||||
|
/// are then retrieved and taken out of the the candidate and added to the excluded set. We take
|
||||||
|
/// care to keep the document we are currently on, and remove it from the excluded list. The next
|
||||||
|
/// iterations will never contain any occurence of a document with the same distinct value as a
|
||||||
|
/// document from previous iterations.
|
||||||
|
pub struct FacetDistinct<'a> {
|
||||||
|
distinct: FieldId,
|
||||||
|
index: &'a Index,
|
||||||
|
txn: &'a heed::RoTxn<'a>,
|
||||||
|
facet_type: FacetType,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> FacetDistinct<'a> {
|
||||||
|
pub fn new(
|
||||||
|
distinct: FieldId,
|
||||||
|
index: &'a Index,
|
||||||
|
txn: &'a heed::RoTxn<'a>,
|
||||||
|
facet_type: FacetType,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
distinct,
|
||||||
|
index,
|
||||||
|
txn,
|
||||||
|
facet_type,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct FacetDistinctIter<'a> {
|
||||||
|
candidates: RoaringBitmap,
|
||||||
|
distinct: FieldId,
|
||||||
|
excluded: RoaringBitmap,
|
||||||
|
facet_type: FacetType,
|
||||||
|
index: &'a Index,
|
||||||
|
iter_offset: usize,
|
||||||
|
txn: &'a heed::RoTxn<'a>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> FacetDistinctIter<'a> {
|
||||||
|
fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result<RoaringBitmap>
|
||||||
|
where
|
||||||
|
KC: heed::BytesEncode<'c>,
|
||||||
|
{
|
||||||
|
let facet_docids = self
|
||||||
|
.index
|
||||||
|
.facet_field_id_value_docids
|
||||||
|
.remap_key_type::<KC>()
|
||||||
|
.get(self.txn, key)?
|
||||||
|
.expect("Corrupted data: Facet values must exist");
|
||||||
|
Ok(facet_docids)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> {
|
||||||
|
let iter = get_facet_values::<FieldDocIdFacetStringCodec>(
|
||||||
|
id,
|
||||||
|
self.distinct,
|
||||||
|
self.index,
|
||||||
|
self.txn,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
for item in iter {
|
||||||
|
let ((_, _, value), _) = item?;
|
||||||
|
let key = (self.distinct, value);
|
||||||
|
let facet_docids = self.get_facet_docids::<FacetValueStringCodec>(&key)?;
|
||||||
|
self.excluded.union_with(&facet_docids);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.excluded.remove(id);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn distinct_integer(&mut self, id: DocumentId) -> anyhow::Result<()> {
|
||||||
|
let iter = get_facet_values::<FieldDocIdFacetI64Codec>(
|
||||||
|
id,
|
||||||
|
self.distinct,
|
||||||
|
self.index,
|
||||||
|
self.txn,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
for item in iter {
|
||||||
|
let ((_, _, value), _) = item?;
|
||||||
|
// get facet docids on level 0
|
||||||
|
let key = (self.distinct, 0, value, value);
|
||||||
|
let facet_docids = self.get_facet_docids::<FacetLevelValueI64Codec>(&key)?;
|
||||||
|
self.excluded.union_with(&facet_docids);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.excluded.remove(id);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn distinct_float(&mut self, id: DocumentId) -> anyhow::Result<()> {
|
||||||
|
let iter = get_facet_values::<FieldDocIdFacetF64Codec>(id,
|
||||||
|
self.distinct,
|
||||||
|
self.index,
|
||||||
|
self.txn,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
for item in iter {
|
||||||
|
let ((_, _, value), _) = item?;
|
||||||
|
// get facet docids on level 0
|
||||||
|
let key = (self.distinct, 0, value, value);
|
||||||
|
let facet_docids = self.get_facet_docids::<FacetLevelValueF64Codec>(&key)?;
|
||||||
|
self.excluded.union_with(&facet_docids);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.excluded.remove(id);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Performs the next iteration of the facet distinct. This is a convenience method that is
|
||||||
|
/// called by the Iterator::next implementation that transposes the result. It makes error
|
||||||
|
/// handling easier.
|
||||||
|
fn next_inner(&mut self) -> anyhow::Result<Option<DocumentId>> {
|
||||||
|
// The first step is to remove all the excluded documents from our candidates
|
||||||
|
self.candidates.difference_with(&self.excluded);
|
||||||
|
|
||||||
|
let mut candidates_iter = self.candidates.iter().skip(self.iter_offset);
|
||||||
|
match candidates_iter.next() {
|
||||||
|
Some(id) => {
|
||||||
|
match self.facet_type {
|
||||||
|
FacetType::String => self.distinct_string(id)?,
|
||||||
|
FacetType::Integer => self.distinct_integer(id)?,
|
||||||
|
FacetType::Float => self.distinct_float(id)?,
|
||||||
|
};
|
||||||
|
|
||||||
|
// The first document of each iteration is kept, since the next call to
|
||||||
|
// `difference_with` will filter out all the documents for that facet value. By
|
||||||
|
// increasing the offset we make sure to get the first valid value for the next
|
||||||
|
// distinct document to keep.
|
||||||
|
self.iter_offset += 1;
|
||||||
|
Ok(Some(id))
|
||||||
|
}
|
||||||
|
// no more candidate at this offset, return.
|
||||||
|
None => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_facet_values<'a, KC>(
|
||||||
|
id: DocumentId,
|
||||||
|
distinct: FieldId,
|
||||||
|
index: &Index,
|
||||||
|
txn: &'a heed::RoTxn,
|
||||||
|
) -> anyhow::Result<heed::RoPrefix<'a, KC, heed::types::Unit>>
|
||||||
|
where
|
||||||
|
KC: heed::BytesDecode<'a>,
|
||||||
|
{
|
||||||
|
const FID_SIZE: usize = size_of::<FieldId>();
|
||||||
|
const DOCID_SIZE: usize = size_of::<DocumentId>();
|
||||||
|
|
||||||
|
let mut key = [0; FID_SIZE + DOCID_SIZE];
|
||||||
|
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
|
||||||
|
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
|
||||||
|
|
||||||
|
let iter = index
|
||||||
|
.field_id_docid_facet_values
|
||||||
|
.prefix_iter(txn, &key)?
|
||||||
|
.remap_key_type::<KC>();
|
||||||
|
Ok(iter)
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for FacetDistinctIter<'_> {
|
||||||
|
type Item = anyhow::Result<DocumentId>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
self.next_inner().transpose()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocIter for FacetDistinctIter<'_> {
|
||||||
|
fn into_excluded(self) -> RoaringBitmap {
|
||||||
|
self.excluded
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Distinct<'_> for FacetDistinct<'a> {
|
||||||
|
type Iter = FacetDistinctIter<'a>;
|
||||||
|
|
||||||
|
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
|
||||||
|
FacetDistinctIter {
|
||||||
|
candidates,
|
||||||
|
distinct: self.distinct,
|
||||||
|
excluded,
|
||||||
|
facet_type: self.facet_type,
|
||||||
|
index: self.index,
|
||||||
|
iter_offset: 0,
|
||||||
|
txn: self.txn,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use std::collections::HashMap;

    use super::*;
    use super::super::test::{generate_index, validate_distinct_candidates};
    use crate::facet::FacetType;

    /// Generates a test named `$name` that runs a `FacetDistinct` on the attribute
    /// `$distinct`, declared as a facet of type `$facet_type`, and checks that every
    /// candidate is either yielded or excluded.
    macro_rules! test_facet_distinct {
        ($name:ident, $distinct:literal, $facet_type:expr) => {
            #[test]
            fn $name() {
                let mut facets = HashMap::new();
                facets.insert($distinct.to_string(), $facet_type.to_string());

                let (index, fid, candidates) = generate_index($distinct, facets);
                let txn = index.read_txn().unwrap();
                let mut facet_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type);

                let total = candidates.len();
                let mut iter = facet_distinct.distinct(candidates, RoaringBitmap::new());
                let kept = validate_distinct_candidates(iter.by_ref(), fid, &index);
                let excluded = iter.into_excluded();

                assert_eq!(kept as u64 + excluded.len(), total);
            }
        };
    }

    test_facet_distinct!(test_string, "txt", FacetType::String);
    test_facet_distinct!(test_strings, "txts", FacetType::String);
    test_facet_distinct!(test_int, "cat-int", FacetType::Integer);
    test_facet_distinct!(test_ints, "cat-ints", FacetType::Integer);
}
|
138
milli/src/search/distinct/map_distinct.rs
Normal file
138
milli/src/search/distinct/map_distinct.rs
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
use super::{Distinct, DocIter};
|
||||||
|
use crate::{DocumentId, FieldId, Index};
|
||||||
|
|
||||||
|
/// A distinct implementer that is backed by an `HashMap`.
|
||||||
|
///
|
||||||
|
/// Each time a document is seen, the value
|
||||||
|
/// for its distinct field is added to the map. If the map already contains an entry for this
|
||||||
|
/// value, then the document is filtered out, and is added to the excluded set.
|
||||||
|
pub struct MapDistinct<'a> {
|
||||||
|
distinct: FieldId,
|
||||||
|
map: HashMap<String, usize>,
|
||||||
|
index: &'a Index,
|
||||||
|
txn: &'a heed::RoTxn<'a>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> MapDistinct<'a> {
|
||||||
|
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
|
||||||
|
Self {
|
||||||
|
distinct,
|
||||||
|
map: HashMap::new(),
|
||||||
|
index,
|
||||||
|
txn,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct MapDistinctIter<'a, 'b> {
|
||||||
|
distinct: FieldId,
|
||||||
|
map: &'b mut HashMap<String, usize>,
|
||||||
|
index: &'a Index,
|
||||||
|
txn: &'a heed::RoTxn<'a>,
|
||||||
|
candidates: roaring::bitmap::IntoIter,
|
||||||
|
excluded: RoaringBitmap,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, 'b> MapDistinctIter<'a, 'b> {
|
||||||
|
/// Performs the next iteration of the mafacetp distinct. This is a convenience method that is
|
||||||
|
/// called by the Iterator::next implementation that transposes the result. It makes error
|
||||||
|
/// handling easier.
|
||||||
|
fn next_inner(&mut self) -> anyhow::Result<Option<DocumentId>> {
|
||||||
|
let map = &mut self.map;
|
||||||
|
let mut filter = |value: Value| {
|
||||||
|
let entry = map.entry(value.to_string()).or_insert(0);
|
||||||
|
*entry += 1;
|
||||||
|
*entry <= 1
|
||||||
|
};
|
||||||
|
|
||||||
|
while let Some(id) = self.candidates.next() {
|
||||||
|
let document = self.index.documents(&self.txn, Some(id))?[0].1;
|
||||||
|
let value = document
|
||||||
|
.get(self.distinct)
|
||||||
|
.map(serde_json::from_slice::<Value>)
|
||||||
|
.transpose()?;
|
||||||
|
|
||||||
|
let accept = match value {
|
||||||
|
Some(Value::Array(values)) => {
|
||||||
|
let mut accept = true;
|
||||||
|
for value in values {
|
||||||
|
accept &= filter(value);
|
||||||
|
}
|
||||||
|
accept
|
||||||
|
}
|
||||||
|
Some(Value::Null) | Some(Value::Object(_)) | None => true,
|
||||||
|
Some(value) => filter(value),
|
||||||
|
};
|
||||||
|
|
||||||
|
if accept {
|
||||||
|
return Ok(Some(id));
|
||||||
|
} else {
|
||||||
|
self.excluded.insert(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for MapDistinctIter<'_, '_> {
|
||||||
|
type Item = anyhow::Result<DocumentId>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
self.next_inner().transpose()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocIter for MapDistinctIter<'_, '_> {
|
||||||
|
fn into_excluded(self) -> RoaringBitmap {
|
||||||
|
self.excluded
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, 'b> Distinct<'b> for MapDistinct<'a> {
|
||||||
|
type Iter = MapDistinctIter<'a, 'b>;
|
||||||
|
|
||||||
|
fn distinct(&'b mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
|
||||||
|
MapDistinctIter {
|
||||||
|
distinct: self.distinct,
|
||||||
|
map: &mut self.map,
|
||||||
|
index: &self.index,
|
||||||
|
txn: &self.txn,
|
||||||
|
candidates: candidates.into_iter(),
|
||||||
|
excluded,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use std::collections::HashMap;

    use super::*;
    use super::super::test::{generate_index, validate_distinct_candidates};

    /// Generates a test named `$name` that runs a `MapDistinct` on the attribute
    /// `$distinct`, and checks that every candidate is either yielded or excluded.
    macro_rules! test_map_distinct {
        ($name:ident, $distinct:literal) => {
            #[test]
            fn $name() {
                let (index, fid, candidates) = generate_index($distinct, HashMap::new());
                let txn = index.read_txn().unwrap();
                let mut map_distinct = MapDistinct::new(fid, &index, &txn);

                let total = candidates.len();
                let mut iter = map_distinct.distinct(candidates, RoaringBitmap::new());
                let kept = validate_distinct_candidates(iter.by_ref(), fid, &index);
                let excluded = iter.into_excluded();

                assert_eq!(kept as u64 + excluded.len(), total);
            }
        };
    }

    test_map_distinct!(test_string, "txt");
    test_map_distinct!(test_strings, "txts");
    test_map_distinct!(test_int, "cat-int");
    test_map_distinct!(test_ints, "cat-ints");
}
|
144
milli/src/search/distinct/mod.rs
Normal file
144
milli/src/search/distinct/mod.rs
Normal file
@ -0,0 +1,144 @@
|
|||||||
|
mod facet_distinct;
|
||||||
|
mod map_distinct;
|
||||||
|
mod noop_distinct;
|
||||||
|
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::DocumentId;
|
||||||
|
pub use facet_distinct::FacetDistinct;
|
||||||
|
pub use map_distinct::MapDistinct;
|
||||||
|
pub use noop_distinct::NoopDistinct;
|
||||||
|
|
||||||
|
/// A trait implemented by document interators that are returned by calls to `Distinct::distinct`.
|
||||||
|
/// It provides a way to get back the ownership to the excluded set.
|
||||||
|
pub trait DocIter: Iterator<Item = anyhow::Result<DocumentId>> {
|
||||||
|
/// Returns ownership on the internal exluded set.
|
||||||
|
fn into_excluded(self) -> RoaringBitmap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct
|
||||||
|
/// must return an iterator containing only distinct documents, and add the discarded documents to
|
||||||
|
/// the excluded set. The excluded set can later be retrieved by calling `DocIter::excluded` on the
|
||||||
|
/// returned iterator.
|
||||||
|
pub trait Distinct<'a> {
|
||||||
|
type Iter: DocIter;
|
||||||
|
|
||||||
|
fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use std::collections::{HashMap, HashSet};

    use once_cell::sync::Lazy;
    use rand::{seq::SliceRandom, Rng};
    use roaring::RoaringBitmap;
    use serde_json::{json, Value};

    use crate::index::{Index, tests::TempIndex};
    use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
    use crate::{BEU32, FieldId, DocumentId};

    /// The random documents shared by every test, generated on first access.
    static JSON: Lazy<Value> = Lazy::new(generate_json);

    /// Generates a JSON array of 10 to 29 random documents, each with the fields `id`,
    /// `txt`, `cat-int`, `txts` and `cat-ints`.
    fn generate_json() -> Value {
        let mut rng = rand::thread_rng();
        let num_docs = rng.gen_range(10..30);

        let txts = ["toto", "titi", "tata"];
        let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
        let cat_ints = (1..10).collect::<Vec<_>>();

        let documents = (0..num_docs)
            .map(|i| {
                let txt = txts.choose(&mut rng).unwrap();

                let mut sample_txts = cats.clone();
                sample_txts.shuffle(&mut rng);

                let mut sample_ints = cat_ints.clone();
                sample_ints.shuffle(&mut rng);

                json!({
                    "id": i,
                    "txt": txt,
                    "cat-int": rng.gen_range(0..3),
                    "txts": sample_txts[..(rng.gen_range(0..3))],
                    "cat-ints": sample_ints[..(rng.gen_range(0..3))],
                })
            })
            .collect();

        Value::Array(documents)
    }

    /// Returns a temporary index populated with the random test documents, the `FieldId`
    /// of the distinct attribute, and the `RoaringBitmap` of the document ids.
    pub(crate) fn generate_index(distinct: &str, facets: HashMap<String, String>) -> (TempIndex, FieldId, RoaringBitmap) {
        let index = TempIndex::new();
        let mut txn = index.write_txn().unwrap();

        // set distinct and faceted attributes for the index.
        let mut settings = UpdateBuilder::new(0).settings(&mut txn, &index);
        settings.set_distinct_attribute(distinct.to_string());
        if !facets.is_empty() {
            settings.set_faceted_fields(facets)
        }
        settings.execute(|_, _| ()).unwrap();

        // add documents to the index
        let mut addition = UpdateBuilder::new(1).index_documents(&mut txn, &index);
        addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
        addition.update_format(UpdateFormat::Json);
        let payload = JSON.to_string();
        addition.execute(payload.as_bytes(), |_, _| ()).unwrap();

        let fields_map = index.fields_ids_map(&txn).unwrap();
        let fid = fields_map.id(&distinct).unwrap();

        // one id per generated document, from 0 up to the number of documents
        let documents_count = JSON.as_array().unwrap().len() as u32;
        let docids = (0..documents_count).collect();

        txn.commit().unwrap();

        (index, fid, docids)
    }

    /// Checks that all the candidates are distinct, and returns the number of candidates.
    pub(crate) fn validate_distinct_candidates(
        candidates: impl Iterator<Item=anyhow::Result<DocumentId>>,
        distinct: FieldId,
        index: &Index,
    ) -> usize {
        /// Asserts that no number or string found in `value` (arrays are searched
        /// recursively) has been seen before, and records them in `seen`.
        fn assert_unseen(seen: &mut HashSet<String>, value: &Value) {
            match value {
                Value::Null | Value::Object(_) | Value::Bool(_) => (),
                Value::Number(_) | Value::String(_) => {
                    assert!(seen.insert(value.to_string()));
                }
                Value::Array(values) => {
                    for value in values {
                        assert_unseen(seen, value);
                    }
                }
            }
        }

        let mut seen = HashSet::<String>::new();
        let txn = index.read_txn().unwrap();

        let mut count = 0;
        for candidate in candidates {
            count += 1;
            let docid = BEU32::new(candidate.unwrap());
            let document = index.documents.get(&txn, &docid).unwrap().unwrap();
            let bytes = document.get(distinct).unwrap();
            let value = serde_json::from_slice(bytes).unwrap();
            assert_unseen(&mut seen, &value);
        }
        count
    }
}
|
57
milli/src/search/distinct/noop_distinct.rs
Normal file
57
milli/src/search/distinct/noop_distinct.rs
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
use roaring::{RoaringBitmap, bitmap::IntoIter};
|
||||||
|
|
||||||
|
use crate::DocumentId;
|
||||||
|
use super::{DocIter, Distinct};
|
||||||
|
|
||||||
|
/// A distinct implementer that performs no distinct at all: the iterator it returns
/// simply yields every candidate.
pub struct NoopDistinct;
|
||||||
|
|
||||||
|
pub struct NoopDistinctIter {
|
||||||
|
candidates: IntoIter,
|
||||||
|
excluded: RoaringBitmap,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for NoopDistinctIter {
|
||||||
|
type Item = anyhow::Result<DocumentId>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
self.candidates.next().map(Ok)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocIter for NoopDistinctIter {
|
||||||
|
fn into_excluded(self) -> RoaringBitmap {
|
||||||
|
self.excluded
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Distinct<'_> for NoopDistinct {
|
||||||
|
type Iter = NoopDistinctIter;
|
||||||
|
|
||||||
|
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
|
||||||
|
NoopDistinctIter {
|
||||||
|
candidates: candidates.into_iter(),
|
||||||
|
excluded,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use super::*;

    /// The noop distinct must yield every candidate, in order, and exclude nothing.
    #[test]
    fn test_noop() {
        let expected = (1..10).collect::<Vec<_>>();
        let candidates = (1..10).collect();

        let mut iter = NoopDistinct.distinct(candidates, RoaringBitmap::new());
        let yielded = iter.by_ref().map(Result::unwrap).collect::<Vec<_>>();
        assert_eq!(yielded, expected);

        let excluded = iter.into_excluded();
        assert!(excluded.is_empty());
    }
}
|
@ -1,6 +1,7 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::hash_map::{HashMap, Entry};
|
use std::collections::hash_map::{HashMap, Entry};
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
use std::mem::take;
|
||||||
use std::str::Utf8Error;
|
use std::str::Utf8Error;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
@ -11,22 +12,24 @@ use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
|
|||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use roaring::bitmap::RoaringBitmap;
|
use roaring::bitmap::RoaringBitmap;
|
||||||
|
|
||||||
use crate::search::criteria::fetcher::FetcherResult;
|
use crate::search::criteria::fetcher::{FetcherResult, Fetcher};
|
||||||
use crate::{Index, DocumentId};
|
use crate::{Index, DocumentId};
|
||||||
|
use distinct::{MapDistinct, FacetDistinct, Distinct, DocIter, NoopDistinct};
|
||||||
|
use self::query_tree::QueryTreeBuilder;
|
||||||
|
|
||||||
pub use self::facet::FacetIter;
|
pub use self::facet::FacetIter;
|
||||||
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
|
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
|
||||||
pub use self::query_tree::MatchingWords;
|
pub use self::query_tree::MatchingWords;
|
||||||
use self::query_tree::QueryTreeBuilder;
|
|
||||||
|
|
||||||
// Building these factories is not free.
|
// Building these factories is not free.
|
||||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||||
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
|
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
|
||||||
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
||||||
|
|
||||||
|
mod criteria;
|
||||||
|
mod distinct;
|
||||||
mod facet;
|
mod facet;
|
||||||
mod query_tree;
|
mod query_tree;
|
||||||
mod criteria;
|
|
||||||
|
|
||||||
pub struct Search<'a> {
|
pub struct Search<'a> {
|
||||||
query: Option<String>,
|
query: Option<String>,
|
||||||
@ -123,33 +126,60 @@ impl<'a> Search<'a> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
|
let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
|
||||||
let mut criteria = criteria_builder.build(query_tree, facet_candidates)?;
|
let criteria = criteria_builder.build(query_tree, facet_candidates)?;
|
||||||
|
|
||||||
|
match self.index.distinct_attribute(self.rtxn)? {
|
||||||
|
None => self.perform_sort(NoopDistinct, matching_words, criteria),
|
||||||
|
Some(name) => {
|
||||||
|
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||||
|
let id = field_ids_map.id(name).expect("distinct not present in field map");
|
||||||
|
let faceted_fields = self.index.faceted_fields(self.rtxn)?;
|
||||||
|
match faceted_fields.get(name) {
|
||||||
|
Some(facet_type) => {
|
||||||
|
let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type);
|
||||||
|
self.perform_sort(distinct, matching_words, criteria)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
let distinct = MapDistinct::new(id, self.index, self.rtxn);
|
||||||
|
self.perform_sort(distinct, matching_words, criteria)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn perform_sort(
|
||||||
|
&self,
|
||||||
|
mut distinct: impl for<'c> Distinct<'c>,
|
||||||
|
matching_words: MatchingWords,
|
||||||
|
mut criteria: Fetcher,
|
||||||
|
) -> anyhow::Result<SearchResult> {
|
||||||
|
|
||||||
let mut offset = self.offset;
|
let mut offset = self.offset;
|
||||||
let mut limit = self.limit;
|
|
||||||
let mut documents_ids = Vec::new();
|
|
||||||
let mut initial_candidates = RoaringBitmap::new();
|
let mut initial_candidates = RoaringBitmap::new();
|
||||||
|
let mut excluded_documents = RoaringBitmap::new();
|
||||||
|
let mut documents_ids = Vec::with_capacity(self.limit);
|
||||||
|
|
||||||
while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? {
|
while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? {
|
||||||
|
|
||||||
debug!("Number of candidates found {}", candidates.len());
|
debug!("Number of candidates found {}", candidates.len());
|
||||||
|
|
||||||
let mut len = candidates.len() as usize;
|
let excluded = take(&mut excluded_documents);
|
||||||
let mut candidates = candidates.into_iter();
|
|
||||||
|
let mut candidates = distinct.distinct(candidates, excluded);
|
||||||
|
|
||||||
initial_candidates.union_with(&bucket_candidates);
|
initial_candidates.union_with(&bucket_candidates);
|
||||||
|
|
||||||
if offset != 0 {
|
if offset != 0 {
|
||||||
candidates.by_ref().take(offset).for_each(drop);
|
let discarded = candidates.by_ref().take(offset).count();
|
||||||
offset = offset.saturating_sub(len.min(offset));
|
offset = offset.saturating_sub(discarded);
|
||||||
len = len.saturating_sub(len.min(offset));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if len != 0 {
|
for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) {
|
||||||
documents_ids.extend(candidates.take(limit));
|
documents_ids.push(candidate?);
|
||||||
limit = limit.saturating_sub(len.min(limit));
|
|
||||||
}
|
}
|
||||||
|
if documents_ids.len() == self.limit { break }
|
||||||
if limit == 0 { break }
|
excluded_documents = candidates.into_excluded();
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
|
Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
|
||||||
|
@ -70,6 +70,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
|
|||||||
faceted_fields: Setting<HashMap<String, String>>,
|
faceted_fields: Setting<HashMap<String, String>>,
|
||||||
criteria: Setting<Vec<String>>,
|
criteria: Setting<Vec<String>>,
|
||||||
stop_words: Setting<BTreeSet<String>>,
|
stop_words: Setting<BTreeSet<String>>,
|
||||||
|
distinct_attribute: Setting<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||||
@ -94,6 +95,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
faceted_fields: Setting::NotSet,
|
faceted_fields: Setting::NotSet,
|
||||||
criteria: Setting::NotSet,
|
criteria: Setting::NotSet,
|
||||||
stop_words: Setting::NotSet,
|
stop_words: Setting::NotSet,
|
||||||
|
distinct_attribute: Setting::NotSet,
|
||||||
update_id,
|
update_id,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -142,6 +144,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Requests that `distinct_attribute` be stored as the distinct attribute. Only the
/// pending setting is changed here.
pub fn set_distinct_attribute(&mut self, distinct_attribute: String) {
    let setting = Setting::Set(distinct_attribute);
    self.distinct_attribute = setting;
}
|
||||||
|
|
||||||
|
/// Requests that the distinct attribute be reset. Only the pending setting is changed
/// here.
pub fn reset_distinct_attribute(&mut self) {
    let setting = Setting::Reset;
    self.distinct_attribute = setting;
}
|
||||||
|
|
||||||
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
|
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
|
||||||
where
|
where
|
||||||
F: Fn(UpdateIndexingStep, u64) + Sync
|
F: Fn(UpdateIndexingStep, u64) + Sync
|
||||||
@ -220,6 +230,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
Ok(true)
|
Ok(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn update_distinct_attribute(&mut self) -> anyhow::Result<bool> {
|
||||||
|
match self.distinct_attribute {
|
||||||
|
Setting::Set(ref attr) => {
|
||||||
|
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||||
|
fields_ids_map
|
||||||
|
.insert(attr)
|
||||||
|
.context("field id limit exceeded")?;
|
||||||
|
|
||||||
|
self.index.put_distinct_attribute(self.wtxn, &attr)?;
|
||||||
|
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||||
|
}
|
||||||
|
Setting::Reset => { self.index.delete_distinct_attribute(self.wtxn)?; },
|
||||||
|
Setting::NotSet => return Ok(false),
|
||||||
|
}
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
/// Updates the index's searchable attributes. This causes the field map to be recomputed to
|
/// Updates the index's searchable attributes. This causes the field map to be recomputed to
|
||||||
/// reflect the order of the searchable attributes.
|
/// reflect the order of the searchable attributes.
|
||||||
fn update_searchable(&mut self) -> anyhow::Result<bool> {
|
fn update_searchable(&mut self) -> anyhow::Result<bool> {
|
||||||
@ -328,6 +355,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
self.update_displayed()?;
|
self.update_displayed()?;
|
||||||
let stop_words_updated = self.update_stop_words()?;
|
let stop_words_updated = self.update_stop_words()?;
|
||||||
let facets_updated = self.update_facets()?;
|
let facets_updated = self.update_facets()?;
|
||||||
|
self.update_distinct_attribute()?;
|
||||||
// update_criteria MUST be called after update_facets, since criterion fields must be set
|
// update_criteria MUST be called after update_facets, since criterion fields must be set
|
||||||
// as facets.
|
// as facets.
|
||||||
self.update_criteria()?;
|
self.update_criteria()?;
|
||||||
|
Loading…
Reference in New Issue
Block a user