mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
Merge #254
254: Improve the facet string distribution speed r=Kerollmops a=Kerollmops
This pull request creates a data structure similar to the one we use for the faceted numbers, a tetratomic decision tree but this time for the facet strings. This PR also changes the facet distribution behavior by returning one of the original facet values, fixes #260.
This data structure defines bucket-like structures where document ids are stored under their facet value. It helps the search decide whether to move to a lower level under a given bucket, depending on whether the current bucket contains interesting documents. The whole format, the algorithm, and previous attempts are explained in the [`facet_string.rs` file](ec1cfdd42b/milli/src/search/facet/facet_string.rs).
Note that this data structure **could** be used to sort strings lexicographically; that is hypothetically possible. We need more testing, in terms of performance and quality, as we would sort on the lowercased versions of the facet values.
- [x] Implement a faster and more precise way to fetch the facet distribution.
- [x] Store and return the original facet string value. We currently return the lowercased version.
Co-authored-by: Kerollmops <clement@meilisearch.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
commit
77de82aaa4
@ -627,14 +627,14 @@ fn facet_values_docids(
|
||||
FacetType::String => {
|
||||
wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?;
|
||||
for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? {
|
||||
let ((_fid, value), docids) = result?;
|
||||
let ((_fid, normalized), (_original, docids)) = result?;
|
||||
let count = docids.len();
|
||||
let docids = if debug {
|
||||
format!("{:?}", docids)
|
||||
} else {
|
||||
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||
};
|
||||
wtr.write_record(&[value.to_string(), count.to_string(), docids])?;
|
||||
wtr.write_record(&[normalized.to_string(), count.to_string(), docids])?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
53
milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
Normal file
53
milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
Normal file
@ -0,0 +1,53 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::num::NonZeroU8;
|
||||
|
||||
use crate::{try_split_array_at, FieldId};
|
||||
|
||||
/// A codec that stores the field id, level 1 and higher and the groups ids.
|
||||
///
|
||||
/// It can only be used to encode the facet string of the level 1 or higher.
|
||||
pub struct FacetLevelValueU32Codec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec {
|
||||
type DItem = (FieldId, NonZeroU8, u32, u32);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
let (level, bytes) = bytes.split_first()?;
|
||||
let level = NonZeroU8::new(*level)?;
|
||||
let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?;
|
||||
let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?;
|
||||
Some((field_id, level, left, right))
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for FacetLevelValueU32Codec {
|
||||
type EItem = (FieldId, NonZeroU8, u32, u32);
|
||||
|
||||
fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut buffer = [0u8; 16];
|
||||
|
||||
// Write the big-endian integers.
|
||||
let bytes = left.to_be_bytes();
|
||||
buffer[..4].copy_from_slice(&bytes[..]);
|
||||
|
||||
let bytes = right.to_be_bytes();
|
||||
buffer[4..8].copy_from_slice(&bytes[..]);
|
||||
|
||||
// Then the u32 values just to be able to read them back.
|
||||
let bytes = left.to_be_bytes();
|
||||
buffer[8..12].copy_from_slice(&bytes[..]);
|
||||
|
||||
let bytes = right.to_be_bytes();
|
||||
buffer[12..].copy_from_slice(&bytes[..]);
|
||||
|
||||
let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1);
|
||||
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||
bytes.push(level.get());
|
||||
bytes.extend_from_slice(&buffer);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
50
milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
Normal file
50
milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
Normal file
@ -0,0 +1,50 @@
|
||||
use std::borrow::Cow;
|
||||
use std::str;
|
||||
|
||||
use crate::{try_split_array_at, FieldId};
|
||||
|
||||
/// A codec that stores the field id, level 0, and facet string.
|
||||
///
|
||||
/// It can only be used to encode the facet string of the level 0,
|
||||
/// as it hardcodes the level.
|
||||
///
|
||||
/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys,
|
||||
/// and make sure that the levels are not mixed-up. The level 0 is special, the key
|
||||
/// are strings, other levels represent groups and keys are simply two integers.
|
||||
pub struct FacetStringLevelZeroCodec;
|
||||
|
||||
impl FacetStringLevelZeroCodec {
|
||||
pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
|
||||
out.reserve(value.len() + 2);
|
||||
out.extend_from_slice(&field_id.to_be_bytes());
|
||||
out.push(0); // the level zero (for LMDB ordering only)
|
||||
out.extend_from_slice(value.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec {
|
||||
type DItem = (FieldId, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
let (level, bytes) = bytes.split_first()?;
|
||||
|
||||
if *level != 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let value = str::from_utf8(bytes).ok()?;
|
||||
Some((field_id, value))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec {
|
||||
type EItem = (FieldId, &'a str);
|
||||
|
||||
fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::new();
|
||||
FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes);
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
@ -0,0 +1,80 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::{marker, str};
|
||||
|
||||
use super::try_split_at;
|
||||
|
||||
/// A codec that encodes a string in front of the value.
|
||||
///
|
||||
/// The usecase is for the facet string levels algorithm where we must know the
|
||||
/// original string of a normalized facet value, the original values are stored
|
||||
/// in the value to not break the lexicographical ordering of the LMDB keys.
|
||||
pub struct FacetStringLevelZeroValueCodec<C>(marker::PhantomData<C>);
|
||||
|
||||
impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec<C>
|
||||
where
|
||||
C: heed::BytesDecode<'a>,
|
||||
{
|
||||
type DItem = (&'a str, C::DItem);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (string_len, bytes) = try_split_at(bytes, 2)?;
|
||||
let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?;
|
||||
|
||||
let (string, bytes) = try_split_at(bytes, string_len as usize)?;
|
||||
let string = str::from_utf8(string).ok()?;
|
||||
|
||||
C::bytes_decode(bytes).map(|item| (string, item))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec<C>
|
||||
where
|
||||
C: heed::BytesEncode<'a>,
|
||||
{
|
||||
type EItem = (&'a str, C::EItem);
|
||||
|
||||
fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let string_len: u16 = string.len().try_into().ok()?;
|
||||
let value_bytes = C::bytes_encode(&value)?;
|
||||
|
||||
let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len());
|
||||
bytes.extend_from_slice(&string_len.to_be_bytes());
|
||||
bytes.extend_from_slice(string.as_bytes());
|
||||
bytes.extend_from_slice(&value_bytes[..]);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use heed::types::Unit;
    use heed::{BytesDecode, BytesEncode};
    use roaring::RoaringBitmap;

    use super::*;
    use crate::CboRoaringBitmapCodec;

    type BitmapValueCodec = FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>;
    type UnitValueCodec = FacetStringLevelZeroValueCodec<Unit>;

    /// A string followed by a roaring bitmap must survive an encode/decode round trip.
    #[test]
    fn deserialize_roaring_bitmaps() {
        let original = "abc";
        let bitmap: RoaringBitmap = (0..100).chain(3500..4398).collect();

        let item = (original, bitmap.clone());
        let encoded = BitmapValueCodec::bytes_encode(&item).unwrap();
        let decoded = BitmapValueCodec::bytes_decode(&encoded).unwrap();

        assert_eq!(decoded, (original, bitmap));
    }

    /// A string followed by an empty (unit) value must survive an encode/decode round trip.
    #[test]
    fn deserialize_unit() {
        let original = "def";

        let item = (original, ());
        let encoded = UnitValueCodec::bytes_encode(&item).unwrap();
        let decoded = UnitValueCodec::bytes_decode(&encoded).unwrap();

        assert_eq!(decoded, (original, ()));
    }
}
|
@ -0,0 +1,114 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::{marker, str};
|
||||
|
||||
use super::try_split_at;
|
||||
|
||||
/// A codec that optionally encodes two strings in front of the value.
|
||||
///
|
||||
/// The usecase is for the facet string levels algorithm where we must
|
||||
/// know the origin of a group, the group left and right bounds are stored
|
||||
/// in the value to not break the lexicographical ordering of the LMDB keys.
|
||||
pub struct FacetStringZeroBoundsValueCodec<C>(marker::PhantomData<C>);
|
||||
|
||||
impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec<C>
|
||||
where
|
||||
C: heed::BytesDecode<'a>,
|
||||
{
|
||||
type DItem = (Option<(&'a str, &'a str)>, C::DItem);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (contains_bounds, bytes) = bytes.split_first()?;
|
||||
|
||||
if *contains_bounds != 0 {
|
||||
let (left_len, bytes) = try_split_at(bytes, 2)?;
|
||||
let (right_len, bytes) = try_split_at(bytes, 2)?;
|
||||
|
||||
let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?;
|
||||
let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?;
|
||||
|
||||
let (left, bytes) = try_split_at(bytes, left_len as usize)?;
|
||||
let (right, bytes) = try_split_at(bytes, right_len as usize)?;
|
||||
|
||||
let left = str::from_utf8(left).ok()?;
|
||||
let right = str::from_utf8(right).ok()?;
|
||||
|
||||
C::bytes_decode(bytes).map(|item| (Some((left, right)), item))
|
||||
} else {
|
||||
C::bytes_decode(bytes).map(|item| (None, item))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec<C>
|
||||
where
|
||||
C: heed::BytesEncode<'a>,
|
||||
{
|
||||
type EItem = (Option<(&'a str, &'a str)>, C::EItem);
|
||||
|
||||
fn bytes_encode((bounds, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::new();
|
||||
|
||||
match bounds {
|
||||
Some((left, right)) => {
|
||||
bytes.push(u8::max_value());
|
||||
|
||||
if left.is_empty() || right.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let left_len: u16 = left.len().try_into().ok()?;
|
||||
let right_len: u16 = right.len().try_into().ok()?;
|
||||
|
||||
bytes.extend_from_slice(&left_len.to_be_bytes());
|
||||
bytes.extend_from_slice(&right_len.to_be_bytes());
|
||||
|
||||
bytes.extend_from_slice(left.as_bytes());
|
||||
bytes.extend_from_slice(right.as_bytes());
|
||||
|
||||
let value_bytes = C::bytes_encode(&value)?;
|
||||
bytes.extend_from_slice(&value_bytes[..]);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
None => {
|
||||
bytes.push(0);
|
||||
let value_bytes = C::bytes_encode(&value)?;
|
||||
bytes.extend_from_slice(&value_bytes[..]);
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use heed::types::Unit;
    use heed::{BytesDecode, BytesEncode};
    use roaring::RoaringBitmap;

    use super::*;
    use crate::CboRoaringBitmapCodec;

    type BitmapValueCodec = FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>;
    type UnitValueCodec = FacetStringZeroBoundsValueCodec<Unit>;

    /// Bounds followed by a roaring bitmap must survive an encode/decode round trip.
    #[test]
    fn deserialize_roaring_bitmaps() {
        let group_bounds = Some(("abc", "def"));
        let bitmap: RoaringBitmap = (0..100).chain(3500..4398).collect();

        let item = (group_bounds, bitmap.clone());
        let encoded = BitmapValueCodec::bytes_encode(&item).unwrap();
        let decoded = BitmapValueCodec::bytes_decode(&encoded).unwrap();

        assert_eq!(decoded, (group_bounds, bitmap));
    }

    /// Bounds followed by an empty (unit) value must survive an encode/decode round trip.
    #[test]
    fn deserialize_unit() {
        let group_bounds = Some(("abc", "def"));

        let item = (group_bounds, ());
        let encoded = UnitValueCodec::bytes_encode(&item).unwrap();
        let decoded = UnitValueCodec::bytes_decode(&encoded).unwrap();

        assert_eq!(decoded, (group_bounds, ()));
    }
}
|
@ -9,13 +9,13 @@ impl FieldDocIdFacetStringCodec {
|
||||
pub fn serialize_into(
|
||||
field_id: FieldId,
|
||||
document_id: DocumentId,
|
||||
value: &str,
|
||||
normalized_value: &str,
|
||||
out: &mut Vec<u8>,
|
||||
) {
|
||||
out.reserve(2 + 4 + value.len());
|
||||
out.reserve(2 + 4 + normalized_value.len());
|
||||
out.extend_from_slice(&field_id.to_be_bytes());
|
||||
out.extend_from_slice(&document_id.to_be_bytes());
|
||||
out.extend_from_slice(value.as_bytes());
|
||||
out.extend_from_slice(normalized_value.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
@ -29,17 +29,22 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
|
||||
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let value = str::from_utf8(bytes).ok()?;
|
||||
Some((field_id, document_id, value))
|
||||
let normalized_value = str::from_utf8(bytes).ok()?;
|
||||
Some((field_id, document_id, normalized_value))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec {
|
||||
type EItem = (FieldId, DocumentId, &'a str);
|
||||
|
||||
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::new();
|
||||
FieldDocIdFacetStringCodec::serialize_into(*field_id, *document_id, value, &mut bytes);
|
||||
FieldDocIdFacetStringCodec::serialize_into(
|
||||
*field_id,
|
||||
*document_id,
|
||||
normalized_value,
|
||||
&mut bytes,
|
||||
);
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -1,9 +1,25 @@
|
||||
mod facet_level_value_f64_codec;
|
||||
mod facet_value_string_codec;
|
||||
mod facet_level_value_u32_codec;
|
||||
mod facet_string_level_zero_codec;
|
||||
mod facet_string_level_zero_value_codec;
|
||||
mod facet_string_zero_bounds_value_codec;
|
||||
mod field_doc_id_facet_f64_codec;
|
||||
mod field_doc_id_facet_string_codec;
|
||||
|
||||
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
|
||||
pub use self::facet_value_string_codec::FacetValueStringCodec;
|
||||
pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
|
||||
pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
|
||||
pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec;
|
||||
pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
|
||||
pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
|
||||
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
|
||||
|
||||
/// Splits `slice` in two parts at the `mid` index.
///
/// Returns `None` when `slice` holds fewer than `mid` bytes; otherwise the
/// first part contains the bytes `[0, mid)` and the second one the rest.
pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
    // Guard first: `split_at` would panic on an out-of-range middle point.
    if mid > slice.len() {
        return None;
    }

    Some(slice.split_at(mid))
}
|
||||
|
@ -11,8 +11,8 @@ use roaring::RoaringBitmap;
|
||||
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
|
||||
FieldDocIdFacetStringCodec,
|
||||
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
|
||||
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||
};
|
||||
use crate::{
|
||||
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
||||
@ -90,13 +90,14 @@ pub struct Index {
|
||||
|
||||
/// Maps the facet field id, level and the number with the docids that corresponds to it.
|
||||
pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the string with the docids that corresponds to it.
|
||||
pub facet_id_string_docids: Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the string with the original string and docids that corresponds to it.
|
||||
pub facet_id_string_docids:
|
||||
Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
|
||||
|
||||
/// Maps the document id, the facet field id and the numbers.
|
||||
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
|
||||
/// Maps the document id, the facet field id and the strings.
|
||||
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Unit>,
|
||||
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
|
||||
|
||||
/// Maps the document id to the document as an obkv store.
|
||||
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
||||
|
@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
|
||||
use super::{Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::error::FieldIdMapMissingEntry;
|
||||
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
|
||||
use crate::search::facet::FacetIter;
|
||||
use crate::search::facet::FacetNumberIter;
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
@ -172,8 +172,11 @@ fn facet_ordered<'t>(
|
||||
let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
|
||||
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
|
||||
} else {
|
||||
let facet_fn =
|
||||
if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing };
|
||||
let facet_fn = if ascending {
|
||||
FacetNumberIter::new_reducing
|
||||
} else {
|
||||
FacetNumberIter::new_reverse_reducing
|
||||
};
|
||||
let iter = facet_fn(rtxn, index, field_id, candidates)?;
|
||||
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
use std::mem::size_of;
|
||||
|
||||
use heed::types::ByteSlice;
|
||||
use concat_arrays::concat_arrays;
|
||||
use heed::types::{ByteSlice, Str, Unit};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{Distinct, DocIter};
|
||||
@ -43,7 +44,10 @@ pub struct FacetDistinctIter<'a> {
|
||||
|
||||
impl<'a> FacetDistinctIter<'a> {
|
||||
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
|
||||
self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key))
|
||||
self.index
|
||||
.facet_id_string_docids
|
||||
.get(self.txn, &(self.distinct, key))
|
||||
.map(|result| result.map(|(_original, docids)| docids))
|
||||
}
|
||||
|
||||
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
|
||||
@ -116,10 +120,7 @@ impl<'a> FacetDistinctIter<'a> {
|
||||
}
|
||||
|
||||
fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
|
||||
let mut key = [0; FID_SIZE + DOCID_SIZE];
|
||||
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
|
||||
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
|
||||
key
|
||||
concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
|
||||
}
|
||||
|
||||
fn facet_number_values<'a>(
|
||||
@ -127,7 +128,7 @@ fn facet_number_values<'a>(
|
||||
distinct: FieldId,
|
||||
index: &Index,
|
||||
txn: &'a heed::RoTxn,
|
||||
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, heed::types::Unit>> {
|
||||
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, Unit>> {
|
||||
let key = facet_values_prefix_key(distinct, id);
|
||||
|
||||
let iter = index
|
||||
@ -144,14 +145,14 @@ fn facet_string_values<'a>(
|
||||
distinct: FieldId,
|
||||
index: &Index,
|
||||
txn: &'a heed::RoTxn,
|
||||
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, heed::types::Unit>> {
|
||||
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, Str>> {
|
||||
let key = facet_values_prefix_key(distinct, id);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_strings
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_key_type::<FieldDocIdFacetStringCodec>();
|
||||
.remap_types::<FieldDocIdFacetStringCodec, Str>();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
@ -47,7 +47,7 @@ mod test {
|
||||
|
||||
let mut documents = Vec::new();
|
||||
|
||||
let txts = ["toto", "titi", "tata"];
|
||||
let txts = ["Toto", "Titi", "Tata"];
|
||||
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
|
||||
let cat_ints = (1..10).collect::<Vec<_>>();
|
||||
|
||||
@ -90,7 +90,6 @@ mod test {
|
||||
|
||||
addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
||||
addition.update_format(UpdateFormat::Json);
|
||||
|
||||
addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap();
|
||||
|
||||
let fields_map = index.fields_ids_map(&txn).unwrap();
|
||||
|
@ -1,16 +1,17 @@
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::ops::Bound::Unbounded;
|
||||
use std::{cmp, fmt};
|
||||
use std::{cmp, fmt, mem};
|
||||
|
||||
use heed::types::{ByteSlice, Unit};
|
||||
use heed::{BytesDecode, Database};
|
||||
use heed::types::ByteSlice;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::{FieldIdMapMissingEntry, UserError};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::FacetValueStringCodec;
|
||||
use crate::search::facet::{FacetIter, FacetRange};
|
||||
use crate::{DocumentId, FieldId, Index, Result};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||
};
|
||||
use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter};
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
/// The default number of values by facets that will
|
||||
/// be fetched from the key-value store.
|
||||
@ -22,7 +23,7 @@ const MAX_VALUES_BY_FACET: usize = 1000;
|
||||
|
||||
/// Threshold on the number of candidates that will make
|
||||
/// the system to choose between one algorithm or another.
|
||||
const CANDIDATES_THRESHOLD: u64 = 1000;
|
||||
const CANDIDATES_THRESHOLD: u64 = 3000;
|
||||
|
||||
pub struct FacetDistribution<'a> {
|
||||
facets: Option<HashSet<String>>,
|
||||
@ -67,48 +68,65 @@ impl<'a> FacetDistribution<'a> {
|
||||
candidates: &RoaringBitmap,
|
||||
distribution: &mut BTreeMap<String, u64>,
|
||||
) -> heed::Result<()> {
|
||||
fn fetch_facet_values<'t, KC, K: 't>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<KC, Unit>,
|
||||
field_id: FieldId,
|
||||
candidates: &RoaringBitmap,
|
||||
distribution: &mut BTreeMap<String, u64>,
|
||||
) -> heed::Result<()>
|
||||
where
|
||||
K: fmt::Display,
|
||||
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
|
||||
{
|
||||
match facet_type {
|
||||
FacetType::Number => {
|
||||
let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();
|
||||
|
||||
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
|
||||
key_buffer.truncate(1);
|
||||
let distribution_prelength = distribution.len();
|
||||
let db = self.index.field_id_docid_facet_f64s;
|
||||
for docid in candidates.into_iter() {
|
||||
key_buffer.truncate(mem::size_of::<FieldId>());
|
||||
key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
let iter = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(rtxn, &key_buffer)?
|
||||
.remap_key_type::<KC>();
|
||||
.prefix_iter(self.rtxn, &key_buffer)?
|
||||
.remap_key_type::<FieldDocIdFacetF64Codec>();
|
||||
|
||||
for result in iter {
|
||||
let ((_, _, value), ()) = result?;
|
||||
*distribution.entry(value.to_string()).or_insert(0) += 1;
|
||||
if distribution.len() - distribution_prelength == self.max_values_by_facet {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
FacetType::String => {
|
||||
let mut normalized_distribution = BTreeMap::new();
|
||||
let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();
|
||||
|
||||
let db = self.index.field_id_docid_facet_strings;
|
||||
for docid in candidates.into_iter() {
|
||||
key_buffer.truncate(mem::size_of::<FieldId>());
|
||||
key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
let iter = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(self.rtxn, &key_buffer)?
|
||||
.remap_key_type::<FieldDocIdFacetStringCodec>();
|
||||
|
||||
for result in iter {
|
||||
let ((_, _, normalized_value), original_value) = result?;
|
||||
let (_, count) = normalized_distribution
|
||||
.entry(normalized_value)
|
||||
.or_insert_with(|| (original_value, 0));
|
||||
*count += 1;
|
||||
|
||||
if normalized_distribution.len() == self.max_values_by_facet {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let iter = normalized_distribution
|
||||
.into_iter()
|
||||
.map(|(_normalized, (original, count))| (original.to_string(), count));
|
||||
distribution.extend(iter);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
match facet_type {
|
||||
FacetType::Number => {
|
||||
let db = self.index.field_id_docid_facet_f64s;
|
||||
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
|
||||
}
|
||||
FacetType::String => {
|
||||
let db = self.index.field_id_docid_facet_strings;
|
||||
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// There are too many documents, we use the facet levels to move through
|
||||
/// the facet values, to find the candidates and values associated.
|
||||
fn facet_numbers_distribution_from_facet_levels(
|
||||
@ -118,7 +136,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
distribution: &mut BTreeMap<String, u64>,
|
||||
) -> heed::Result<()> {
|
||||
let iter =
|
||||
FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
|
||||
FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
|
||||
|
||||
for result in iter {
|
||||
let (value, mut docids) = result?;
|
||||
@ -134,6 +152,29 @@ impl<'a> FacetDistribution<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn facet_strings_distribution_from_facet_levels(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
candidates: &RoaringBitmap,
|
||||
distribution: &mut BTreeMap<String, u64>,
|
||||
) -> heed::Result<()> {
|
||||
let iter =
|
||||
FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
|
||||
|
||||
for result in iter {
|
||||
let (_normalized, original, mut docids) = result?;
|
||||
docids &= candidates;
|
||||
if !docids.is_empty() {
|
||||
distribution.insert(original.to_string(), docids.len());
|
||||
}
|
||||
if distribution.len() == self.max_values_by_facet {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Placeholder search, a.k.a. no candidates were specified. We iterate through the
|
||||
/// facet values one by one and iterate on the facet level 0 for numbers.
|
||||
fn facet_values_from_raw_facet_database(
|
||||
@ -143,7 +184,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
let mut distribution = BTreeMap::new();
|
||||
|
||||
let db = self.index.facet_id_f64_docids;
|
||||
let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?;
|
||||
let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?;
|
||||
|
||||
for result in range {
|
||||
let ((_, _, value, _), docids) = result?;
|
||||
@ -158,16 +199,22 @@ impl<'a> FacetDistribution<'a> {
|
||||
.facet_id_string_docids
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(self.rtxn, &field_id.to_be_bytes())?
|
||||
.remap_key_type::<FacetValueStringCodec>();
|
||||
.remap_key_type::<FacetStringLevelZeroCodec>();
|
||||
|
||||
let mut normalized_distribution = BTreeMap::new();
|
||||
for result in iter {
|
||||
let ((_, value), docids) = result?;
|
||||
distribution.insert(value.to_string(), docids.len());
|
||||
let ((_, normalized_value), (original_value, docids)) = result?;
|
||||
normalized_distribution.insert(normalized_value, (original_value, docids.len()));
|
||||
if distribution.len() == self.max_values_by_facet {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let iter = normalized_distribution
|
||||
.into_iter()
|
||||
.map(|(_normalized, (original, count))| (original.to_string(), count));
|
||||
distribution.extend(iter);
|
||||
|
||||
Ok(distribution)
|
||||
}
|
||||
|
||||
@ -198,14 +245,12 @@ impl<'a> FacetDistribution<'a> {
|
||||
candidates,
|
||||
&mut distribution,
|
||||
)?;
|
||||
self.facet_distribution_from_documents(
|
||||
self.facet_strings_distribution_from_facet_levels(
|
||||
field_id,
|
||||
String,
|
||||
candidates,
|
||||
&mut distribution,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(distribution)
|
||||
}
|
||||
None => self.facet_values_from_raw_facet_database(field_id),
|
||||
|
248
milli/src/search/facet/facet_number.rs
Normal file
248
milli/src/search/facet/facet_number.rs
Normal file
@ -0,0 +1,248 @@
|
||||
use std::ops::Bound::{self, Excluded, Included, Unbounded};
|
||||
|
||||
use either::Either::{self, Left, Right};
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{Database, LazyDecode, RoRange, RoRevRange};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::facet::FacetLevelValueF64Codec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::{FieldId, Index};
|
||||
|
||||
pub struct FacetNumberRange<'t> {
|
||||
iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode<CboRoaringBitmapCodec>>,
|
||||
end: Bound<f64>,
|
||||
}
|
||||
|
||||
impl<'t> FacetNumberRange<'t> {
|
||||
pub fn new(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
level: u8,
|
||||
left: Bound<f64>,
|
||||
right: Bound<f64>,
|
||||
) -> heed::Result<FacetNumberRange<'t>> {
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, level, left, f64::MIN)),
|
||||
Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
|
||||
Unbounded => Included((field_id, level, f64::MIN, f64::MIN)),
|
||||
};
|
||||
let right_bound = Included((field_id, level, f64::MAX, f64::MAX));
|
||||
let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?;
|
||||
Ok(FacetNumberRange { iter, end: right })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetNumberRange<'t> {
|
||||
type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok(((fid, level, left, right), docids))) => {
|
||||
let must_be_returned = match self.end {
|
||||
Included(end) => right <= end,
|
||||
Excluded(end) => right < end,
|
||||
Unbounded => true,
|
||||
};
|
||||
if must_be_returned {
|
||||
match docids.decode() {
|
||||
Ok(docids) => Some(Ok(((fid, level, left, right), docids))),
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverse iterator over the entries of a single level of a numeric facet field.
///
/// Every yielded item is `((field id, level, left bound, right bound), documents ids)`.
pub struct FacetNumberRevRange<'t> {
|
||||
/// Underlying reversed database range, the documents ids are decoded lazily.
iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode<CboRoaringBitmapCodec>>,
|
||||
/// Limit that the right bound of an entry is compared against while iterating.
end: Bound<f64>,
|
||||
}
|
||||
|
||||
impl<'t> FacetNumberRevRange<'t> {
|
||||
pub fn new(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
level: u8,
|
||||
left: Bound<f64>,
|
||||
right: Bound<f64>,
|
||||
) -> heed::Result<FacetNumberRevRange<'t>> {
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, level, left, f64::MIN)),
|
||||
Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
|
||||
Unbounded => Included((field_id, level, f64::MIN, f64::MIN)),
|
||||
};
|
||||
let right_bound = Included((field_id, level, f64::MAX, f64::MAX));
|
||||
let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?;
|
||||
Ok(FacetNumberRevRange { iter, end: right })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetNumberRevRange<'t> {
|
||||
type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
match self.iter.next() {
|
||||
Some(Ok(((fid, level, left, right), docids))) => {
|
||||
let must_be_returned = match self.end {
|
||||
Included(end) => right <= end,
|
||||
Excluded(end) => right < end,
|
||||
Unbounded => true,
|
||||
};
|
||||
if must_be_returned {
|
||||
match docids.decode() {
|
||||
Ok(docids) => return Some(Ok(((fid, level, left, right), docids))),
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
Some(Err(e)) => return Some(Err(e)),
|
||||
None => return None,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over the facet numbers of a field, exploring the facet levels from the
/// highest one down to the level 0 and yielding `(facet value, documents ids)` pairs.
pub struct FacetNumberIter<'t> {
|
||||
/// Read transaction used to open the iterators of the lower levels.
rtxn: &'t heed::RoTxn<'t>,
|
||||
/// Database containing the facet number levels.
db: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
/// Field that is being explored.
field_id: FieldId,
|
||||
/// Stack of level iterators, each one paired with the documents ids it must look for;
/// the last element is the one currently being explored.
level_iters: Vec<(RoaringBitmap, Either<FacetNumberRange<'t>, FacetNumberRevRange<'t>>)>,
|
||||
/// When `true`, the documents ids found in an entry are removed from the ids
/// the current level is still looking for.
must_reduce: bool,
|
||||
}
|
||||
|
||||
impl<'t> FacetNumberIter<'t> {
|
||||
/// Create a `FacetNumberIter` that will iterate on the different facet entries
|
||||
/// (facet value + documents ids) and that will reduce the given documents ids
|
||||
/// while iterating on the different facet levels.
|
||||
pub fn new_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetNumberIter<'t>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter =
|
||||
FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Left(highest_iter))];
|
||||
Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
||||
}
|
||||
|
||||
/// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse
|
||||
/// (facet value + documents ids) and that will reduce the given documents ids
|
||||
/// while iterating on the different facet levels.
|
||||
pub fn new_reverse_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetNumberIter<'t>> {
|
||||
let db = index.facet_id_f64_docids;
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter =
|
||||
FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Right(highest_iter))];
|
||||
Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
||||
}
|
||||
|
||||
/// Create a `FacetNumberIter` that will iterate on the different facet entries
|
||||
/// (facet value + documents ids) and that will not reduce the given documents ids
|
||||
/// while iterating on the different facet levels, possibly returning multiple times
|
||||
/// a document id associated with multiple facet values.
|
||||
pub fn new_non_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetNumberIter<'t>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter =
|
||||
FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Left(highest_iter))];
|
||||
Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false })
|
||||
}
|
||||
|
||||
fn highest_level<X>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<FacetLevelValueF64Codec, X>,
|
||||
fid: FieldId,
|
||||
) -> heed::Result<Option<u8>> {
|
||||
let level = db
|
||||
.remap_types::<ByteSlice, DecodeIgnore>()
|
||||
.prefix_iter(rtxn, &fid.to_be_bytes())?
|
||||
.remap_key_type::<FacetLevelValueF64Codec>()
|
||||
.last()
|
||||
.transpose()?
|
||||
.map(|((_, level, _, _), _)| level);
|
||||
Ok(level)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetNumberIter<'t> {
|
||||
type Item = heed::Result<(f64, RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
'outer: loop {
|
||||
let (documents_ids, last) = self.level_iters.last_mut()?;
|
||||
let is_ascending = last.is_left();
|
||||
for result in last {
|
||||
// If the last iterator must find an empty set of documents it means
|
||||
// that we found all the documents in the sub level iterations already,
|
||||
// we can pop this level iterator.
|
||||
if documents_ids.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
match result {
|
||||
Ok(((_fid, level, left, right), mut docids)) => {
|
||||
docids &= &*documents_ids;
|
||||
if !docids.is_empty() {
|
||||
if self.must_reduce {
|
||||
*documents_ids -= &docids;
|
||||
}
|
||||
|
||||
if level == 0 {
|
||||
return Some(Ok((left, docids)));
|
||||
}
|
||||
|
||||
let rtxn = self.rtxn;
|
||||
let db = self.db;
|
||||
let fid = self.field_id;
|
||||
let left = Included(left);
|
||||
let right = Included(right);
|
||||
|
||||
let result = if is_ascending {
|
||||
FacetNumberRange::new(rtxn, db, fid, level - 1, left, right)
|
||||
.map(Left)
|
||||
} else {
|
||||
FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right)
|
||||
.map(Right)
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(iter) => {
|
||||
self.level_iters.push((docids, iter));
|
||||
continue 'outer;
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
self.level_iters.pop();
|
||||
}
|
||||
}
|
||||
}
|
406
milli/src/search/facet/facet_string.rs
Normal file
406
milli/src/search/facet/facet_string.rs
Normal file
@ -0,0 +1,406 @@
|
||||
//! This module contains helpers iterators for facet strings.
|
||||
//!
|
||||
//! The purpose is to help iterate over the quite complex system of facets strings. A simple
|
||||
//! description of the system would be that every facet string value is stored into an LMDB database
|
||||
//! and that every value is associated with the document ids which are associated with this facet
|
||||
//! string value.
|
||||
//!
|
||||
//! In reality it is a little bit more complex as we have to create aggregations of runs of facet
|
||||
//! string values, those aggregations help in choosing the right groups of facets to follow.
|
||||
//!
|
||||
//! ## A typical algorithm run
|
||||
//!
|
||||
//! If a group of aggregated facets values contains one of the documents ids, we must continue
|
||||
//! iterating over the sub-groups.
|
||||
//!
|
||||
//! If this group is at the lowest level and contains at least one document id we yield the associated
|
||||
//! facet documents ids.
|
||||
//!
|
||||
//! If the group doesn't contain one of our documents ids, we continue to the next group at this
|
||||
//! same level.
|
||||
//!
|
||||
//! ## The complexity comes from the strings
|
||||
//!
|
||||
//! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create
|
||||
//! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the
|
||||
//! two numbers bounds, the left and the right bound of the group, both inclusive.
|
||||
//!
|
||||
//! It is easy to make sure that the groups are ordered, LMDB sorts its keys lexicographically and
|
||||
//! putting two numbers big-endian encoded one after the other gives us ordered groups. The values
|
||||
//! are simple unions of the documents ids coming from the groups below.
|
||||
//!
|
||||
//! ### Example of what a facet number LMDB database contains
|
||||
//!
|
||||
//! | level | left-bound | right-bound | documents ids |
|
||||
//! |-------|------------|-------------|------------------|
|
||||
//! | 0 | 0 | _skipped_ | 1, 2 |
|
||||
//! | 0 | 1 | _skipped_ | 6, 7 |
|
||||
//! | 0 | 3 | _skipped_ | 4, 7 |
|
||||
//! | 0 | 5 | _skipped_ | 2, 3, 4 |
|
||||
//! | 1 | 0 | 1 | 1, 2, 6, 7 |
|
||||
//! | 1 | 3 | 5 | 2, 3, 4, 7 |
|
||||
//! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 |
|
||||
//!
|
||||
//! As you can see the level 0 has two equal bounds, therefore we skip serializing the second
|
||||
//! bound, that's the base level where you can directly fetch the documents ids associated with an
|
||||
//! exact number.
|
||||
//!
|
||||
//! The next levels have two different bounds and the associated documents ids are simply the result
|
||||
//! of a union of all the documents ids associated with the aggregated groups above.
|
||||
//!
|
||||
//! ## The complexity of defining groups for facet strings
|
||||
//!
|
||||
//! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in
|
||||
//! lexicographical order, it means that whatever the key represent the bytes are read in their raw
|
||||
//! form and a simple `strcmp` will define the order in which keys will be read from the store.
|
||||
//!
|
||||
//! That's easy for types with a known size, like floats or integers, they are 64 bits long and
|
||||
//! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the
|
||||
//! first number then by the second if the first number is equal on two keys.
|
||||
//!
|
||||
//! For strings it is a lot more complex as those types are unsized, it means that the size of facet
|
||||
//! strings is different for each facet value.
|
||||
//!
|
||||
//! ### Basic approach: padding the keys
|
||||
//!
|
||||
//! A first approach would be to simply define the maximum size of a facet string and pad the keys
|
||||
//! with zeroes. The big problem of this approach is that it:
|
||||
//! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the
|
||||
//! other.
|
||||
//! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB
|
||||
//! performances.
|
||||
//!
|
||||
//! ### Better approach: number the facet groups
|
||||
//!
|
||||
//! A better approach would be to number the groups, this way we don't have the downsides of the
|
||||
//! previously described approach but we need to be able to describe the groups by using a number.
|
||||
//!
|
||||
//! #### Example of facet strings with numbered groups
|
||||
//!
|
||||
//! | level | left-bound | right-bound | left-string | right-string | documents ids |
|
||||
//! |-------|------------|-------------|-------------|--------------|------------------|
|
||||
//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 |
|
||||
//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 |
|
||||
//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 |
|
||||
//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 |
|
||||
//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 |
|
||||
//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 |
|
||||
//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 |
|
||||
//!
|
||||
//! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not
|
||||
//! need to store the facet string value two times.
|
||||
//!
|
||||
//! The number in the left-bound and right-bound columns are incremental numbers representing the
|
||||
//! level 0 strings, i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering
|
||||
//! of the LMDB keys.
|
||||
//!
|
||||
//! In the value, not in the key, you can see that we added two new values: the left-string and the
|
||||
//! right-string, which defines the original facet strings associated with the given group.
|
||||
//!
|
||||
//! We put those two strings inside of the value, this way we do not limit the maximum size of the
|
||||
//! facet string values, and the impact on performances is not important as, IIRC, LMDB puts big
|
||||
//! values on another page, this helps in iterating over keys fast enough and only fetch the page
|
||||
//! with the values when required.
|
||||
//!
|
||||
//! The other little advantage with this solution is that there is not a big overhead, compared with
|
||||
//! the facet number levels, we only duplicate the facet strings once for the level 1.
|
||||
//!
|
||||
//! #### A typical algorithm run
|
||||
//!
|
||||
//! Note that the algorithm is always moving from the highest level to the lowest one, one level
|
||||
//! by one level, this is why it is ok to only store the facets string on the level 1.
|
||||
//!
|
||||
//! If a group of aggregated facets values, a group with numbers, contains one of the documents ids,
|
||||
//! we must continue iterating over the sub-groups. To do so:
|
||||
//! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds
|
||||
//! and iterate over the facet groups defined by these numbers over the current level - 1.
|
||||
//! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the
|
||||
//! value and just do the same as with the facet numbers but with strings: iterate over the
|
||||
//! current level - 1 with both keys.
|
||||
//!
|
||||
//! If this group is at the lowest level (level 0) and contains at least one document id we yield the
|
||||
//! associated facet documents ids.
|
||||
//!
|
||||
//! If the group doesn't contain one of our documents ids, we continue to the next group at this
|
||||
//! same level.
|
||||
//!
|
||||
|
||||
use std::num::NonZeroU8;
|
||||
use std::ops::Bound;
|
||||
use std::ops::Bound::{Excluded, Included, Unbounded};
|
||||
|
||||
use either::{Either, Left, Right};
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{Database, LazyDecode, RoRange};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
|
||||
FacetStringZeroBoundsValueCodec,
|
||||
};
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::{FieldId, Index};
|
||||
|
||||
/// An iterator that is used to explore the facet string levels,
|
||||
/// from the level 1 and above (never the level 0).
|
||||
///
|
||||
/// It yields the level and the two group ids that an entry covers, the level 0
|
||||
/// strings bounding the group (only present when the entry comes from the level 1)
|
||||
/// and the roaring bitmap associated with the entry.
|
||||
pub struct FacetStringGroupRange<'t> {
|
||||
// Underlying database range, the values are decoded lazily.
iter: RoRange<
|
||||
't,
|
||||
FacetLevelValueU32Codec,
|
||||
LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
|
||||
>,
|
||||
// Limit that the right group id of an entry is compared against while iterating.
end: Bound<u32>,
|
||||
}
|
||||
|
||||
impl<'t> FacetStringGroupRange<'t> {
|
||||
pub fn new<X, Y>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<X, Y>,
|
||||
field_id: FieldId,
|
||||
level: NonZeroU8,
|
||||
left: Bound<u32>,
|
||||
right: Bound<u32>,
|
||||
) -> heed::Result<FacetStringGroupRange<'t>> {
|
||||
let db = db.remap_types::<
|
||||
FacetLevelValueU32Codec,
|
||||
FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>,
|
||||
>();
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, level, left, u32::MIN)),
|
||||
Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
|
||||
Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
|
||||
};
|
||||
let right_bound = Included((field_id, level, u32::MAX, u32::MAX));
|
||||
let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?;
|
||||
Ok(FacetStringGroupRange { iter, end: right })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetStringGroupRange<'t> {
|
||||
type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok(((_fid, level, left, right), docids))) => {
|
||||
let must_be_returned = match self.end {
|
||||
Included(end) => right <= end,
|
||||
Excluded(end) => right < end,
|
||||
Unbounded => true,
|
||||
};
|
||||
if must_be_returned {
|
||||
match docids.decode() {
|
||||
Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))),
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator that is used to explore the level 0 of the facets string database.
|
||||
///
|
||||
/// It yields the normalized facet string, the original facet string
/// and the roaring bitmap associated with it.
|
||||
pub struct FacetStringLevelZeroRange<'t> {
|
||||
// Underlying database range restricted to level 0 entries of one field.
iter: RoRange<
|
||||
't,
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl<'t> FacetStringLevelZeroRange<'t> {
|
||||
pub fn new<X, Y>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<X, Y>,
|
||||
field_id: FieldId,
|
||||
left: Bound<&str>,
|
||||
right: Bound<&str>,
|
||||
) -> heed::Result<FacetStringLevelZeroRange<'t>> {
|
||||
fn encode_value<'a>(buffer: &'a mut Vec<u8>, field_id: FieldId, value: &str) -> &'a [u8] {
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(value.as_bytes());
|
||||
&buffer[..]
|
||||
}
|
||||
|
||||
let mut left_buffer = Vec::new();
|
||||
let left_bound = match left {
|
||||
Included(value) => Included(encode_value(&mut left_buffer, field_id, value)),
|
||||
Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)),
|
||||
Unbounded => {
|
||||
left_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
left_buffer.push(0);
|
||||
Included(&left_buffer[..])
|
||||
}
|
||||
};
|
||||
|
||||
let mut right_buffer = Vec::new();
|
||||
let right_bound = match right {
|
||||
Included(value) => Included(encode_value(&mut right_buffer, field_id, value)),
|
||||
Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)),
|
||||
Unbounded => {
|
||||
right_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
right_buffer.push(1); // we must only get the level 0
|
||||
Excluded(&right_buffer[..])
|
||||
}
|
||||
};
|
||||
|
||||
let iter = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.range(rtxn, &(left_bound, right_bound))?
|
||||
.remap_types::<
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>
|
||||
>();
|
||||
|
||||
Ok(FacetStringLevelZeroRange { iter })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
|
||||
type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok(((_fid, normalized), (original, docids)))) => {
|
||||
Some(Ok((normalized, original, docids)))
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator that is used to explore the facet strings level by level,
|
||||
/// it will only return facets strings that are associated with the
|
||||
/// candidates documents ids given.
|
||||
pub struct FacetStringIter<'t> {
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
db: Database<ByteSlice, ByteSlice>,
|
||||
field_id: FieldId,
|
||||
level_iters:
|
||||
Vec<(RoaringBitmap, Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>)>,
|
||||
}
|
||||
|
||||
impl<'t> FacetStringIter<'t> {
|
||||
pub fn new_non_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetStringIter<'t>> {
|
||||
let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter = match NonZeroU8::new(highest_level) {
|
||||
Some(highest_level) => Left(FacetStringGroupRange::new(
|
||||
rtxn,
|
||||
index.facet_id_string_docids,
|
||||
field_id,
|
||||
highest_level,
|
||||
Unbounded,
|
||||
Unbounded,
|
||||
)?),
|
||||
None => Right(FacetStringLevelZeroRange::new(
|
||||
rtxn,
|
||||
index.facet_id_string_docids,
|
||||
field_id,
|
||||
Unbounded,
|
||||
Unbounded,
|
||||
)?),
|
||||
};
|
||||
|
||||
Ok(FacetStringIter { rtxn, db, field_id, level_iters: vec![(documents_ids, highest_iter)] })
|
||||
}
|
||||
|
||||
fn highest_level<X, Y>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<X, Y>,
|
||||
fid: FieldId,
|
||||
) -> heed::Result<Option<u8>> {
|
||||
Ok(db
|
||||
.remap_types::<ByteSlice, DecodeIgnore>()
|
||||
.prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits
|
||||
.last()
|
||||
.transpose()?
|
||||
.map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetStringIter<'t> {
|
||||
type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
'outer: loop {
|
||||
let (documents_ids, last) = self.level_iters.last_mut()?;
|
||||
match last {
|
||||
Left(last) => {
|
||||
for result in last {
|
||||
match result {
|
||||
Ok(((level, left, right), (string_bounds, mut docids))) => {
|
||||
docids &= &*documents_ids;
|
||||
if !docids.is_empty() {
|
||||
*documents_ids -= &docids;
|
||||
|
||||
let result = match string_bounds {
|
||||
Some((left, right)) => FacetStringLevelZeroRange::new(
|
||||
self.rtxn,
|
||||
self.db,
|
||||
self.field_id,
|
||||
Included(left),
|
||||
Included(right),
|
||||
)
|
||||
.map(Right),
|
||||
None => FacetStringGroupRange::new(
|
||||
self.rtxn,
|
||||
self.db,
|
||||
self.field_id,
|
||||
NonZeroU8::new(level.get() - 1).unwrap(),
|
||||
Included(left),
|
||||
Included(right),
|
||||
)
|
||||
.map(Left),
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(iter) => {
|
||||
self.level_iters.push((docids, iter));
|
||||
continue 'outer;
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
Right(last) => {
|
||||
// level zero only
|
||||
for result in last {
|
||||
match result {
|
||||
Ok((normalized, original, mut docids)) => {
|
||||
docids &= &*documents_ids;
|
||||
if !docids.is_empty() {
|
||||
*documents_ids -= &docids;
|
||||
return Some(Ok((normalized, original, docids)));
|
||||
}
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.level_iters.pop();
|
||||
}
|
||||
}
|
||||
}
|
@ -15,9 +15,11 @@ use roaring::RoaringBitmap;
|
||||
use self::FilterCondition::*;
|
||||
use self::Operator::*;
|
||||
use super::parser::{FilterParser, Rule, PREC_CLIMBER};
|
||||
use super::FacetRange;
|
||||
use super::FacetNumberRange;
|
||||
use crate::error::UserError;
|
||||
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
@ -282,7 +284,7 @@ impl FilterCondition {
|
||||
|
||||
// We must create a custom iterator to be able to iterate over the
|
||||
// requested range as the range iterator cannot express some conditions.
|
||||
let iter = FacetRange::new(rtxn, db, field_id, level, left, right)?;
|
||||
let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?;
|
||||
|
||||
debug!("Iterating between {:?} and {:?} (level {})", left, right, level);
|
||||
|
||||
@ -363,7 +365,10 @@ impl FilterCondition {
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
|
||||
strings_db: heed::Database<
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
|
||||
>,
|
||||
field_id: FieldId,
|
||||
operator: &Operator,
|
||||
) -> Result<RoaringBitmap> {
|
||||
@ -374,7 +379,8 @@ impl FilterCondition {
|
||||
GreaterThan(val) => (Excluded(*val), Included(f64::MAX)),
|
||||
GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)),
|
||||
Equal(number, string) => {
|
||||
let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
|
||||
let (_original_value, string_docids) =
|
||||
strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
|
||||
let number_docids = match number {
|
||||
Some(n) => {
|
||||
let n = Included(*n);
|
||||
|
@ -1,253 +1,11 @@
|
||||
use std::ops::Bound::{self, Excluded, Included, Unbounded};
|
||||
|
||||
use either::Either::{self, Left, Right};
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{Database, LazyDecode, RoRange, RoRevRange};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
pub use self::facet_distribution::FacetDistribution;
|
||||
pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange};
|
||||
pub use self::facet_string::FacetStringIter;
|
||||
pub use self::filter_condition::{FilterCondition, Operator};
|
||||
pub(crate) use self::parser::Rule as ParserRule;
|
||||
use crate::heed_codec::facet::FacetLevelValueF64Codec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::{FieldId, Index};
|
||||
|
||||
mod facet_distribution;
|
||||
mod facet_number;
|
||||
mod facet_string;
|
||||
mod filter_condition;
|
||||
mod parser;
|
||||
|
||||
pub struct FacetRange<'t> {
|
||||
iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode<CboRoaringBitmapCodec>>,
|
||||
end: Bound<f64>,
|
||||
}
|
||||
|
||||
impl<'t> FacetRange<'t> {
|
||||
pub fn new(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
level: u8,
|
||||
left: Bound<f64>,
|
||||
right: Bound<f64>,
|
||||
) -> heed::Result<FacetRange<'t>> {
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, level, left, f64::MIN)),
|
||||
Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
|
||||
Unbounded => Included((field_id, level, f64::MIN, f64::MIN)),
|
||||
};
|
||||
let right_bound = Included((field_id, level, f64::MAX, f64::MAX));
|
||||
let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?;
|
||||
Ok(FacetRange { iter, end: right })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetRange<'t> {
|
||||
type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok(((fid, level, left, right), docids))) => {
|
||||
let must_be_returned = match self.end {
|
||||
Included(end) => right <= end,
|
||||
Excluded(end) => right < end,
|
||||
Unbounded => true,
|
||||
};
|
||||
if must_be_returned {
|
||||
match docids.decode() {
|
||||
Ok(docids) => Some(Ok(((fid, level, left, right), docids))),
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetRevRange<'t> {
|
||||
iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode<CboRoaringBitmapCodec>>,
|
||||
end: Bound<f64>,
|
||||
}
|
||||
|
||||
impl<'t> FacetRevRange<'t> {
|
||||
pub fn new(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
level: u8,
|
||||
left: Bound<f64>,
|
||||
right: Bound<f64>,
|
||||
) -> heed::Result<FacetRevRange<'t>> {
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, level, left, f64::MIN)),
|
||||
Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
|
||||
Unbounded => Included((field_id, level, f64::MIN, f64::MIN)),
|
||||
};
|
||||
let right_bound = Included((field_id, level, f64::MAX, f64::MAX));
|
||||
let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?;
|
||||
Ok(FacetRevRange { iter, end: right })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetRevRange<'t> {
|
||||
type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
match self.iter.next() {
|
||||
Some(Ok(((fid, level, left, right), docids))) => {
|
||||
let must_be_returned = match self.end {
|
||||
Included(end) => right <= end,
|
||||
Excluded(end) => right < end,
|
||||
Unbounded => true,
|
||||
};
|
||||
if must_be_returned {
|
||||
match docids.decode() {
|
||||
Ok(docids) => return Some(Ok(((fid, level, left, right), docids))),
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
Some(Err(e)) => return Some(Err(e)),
|
||||
None => return None,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetIter<'t> {
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
db: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
level_iters: Vec<(RoaringBitmap, Either<FacetRange<'t>, FacetRevRange<'t>>)>,
|
||||
must_reduce: bool,
|
||||
}
|
||||
|
||||
impl<'t> FacetIter<'t> {
|
||||
/// Create a `FacetIter` that will iterate on the different facet entries
|
||||
/// (facet value + documents ids) and that will reduce the given documents ids
|
||||
/// while iterating on the different facet levels.
|
||||
pub fn new_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetIter<'t>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter =
|
||||
FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Left(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
||||
}
|
||||
|
||||
/// Create a `FacetIter` that will iterate on the different facet entries in reverse
|
||||
/// (facet value + documents ids) and that will reduce the given documents ids
|
||||
/// while iterating on the different facet levels.
|
||||
pub fn new_reverse_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetIter<'t>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter =
|
||||
FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Right(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
||||
}
|
||||
|
||||
/// Create a `FacetIter` that will iterate on the different facet entries
|
||||
/// (facet value + documents ids) and that will not reduce the given documents ids
|
||||
/// while iterating on the different facet levels, possibly returning multiple times
|
||||
/// a document id associated with multiple facet values.
|
||||
pub fn new_non_reducing(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
index: &'t Index,
|
||||
field_id: FieldId,
|
||||
documents_ids: RoaringBitmap,
|
||||
) -> heed::Result<FacetIter<'t>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
|
||||
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
||||
let highest_iter =
|
||||
FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
||||
let level_iters = vec![(documents_ids, Left(highest_iter))];
|
||||
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false })
|
||||
}
|
||||
|
||||
fn highest_level<X>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<FacetLevelValueF64Codec, X>,
|
||||
fid: FieldId,
|
||||
) -> heed::Result<Option<u8>> {
|
||||
let level = db
|
||||
.remap_types::<ByteSlice, DecodeIgnore>()
|
||||
.prefix_iter(rtxn, &fid.to_be_bytes())?
|
||||
.remap_key_type::<FacetLevelValueF64Codec>()
|
||||
.last()
|
||||
.transpose()?
|
||||
.map(|((_, level, _, _), _)| level);
|
||||
Ok(level)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetIter<'t> {
|
||||
type Item = heed::Result<(f64, RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
'outer: loop {
|
||||
let (documents_ids, last) = self.level_iters.last_mut()?;
|
||||
let is_ascending = last.is_left();
|
||||
for result in last {
|
||||
// If the last iterator must find an empty set of documents it means
|
||||
// that we found all the documents in the sub level iterations already,
|
||||
// we can pop this level iterator.
|
||||
if documents_ids.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
match result {
|
||||
Ok(((_fid, level, left, right), mut docids)) => {
|
||||
docids &= &*documents_ids;
|
||||
if !docids.is_empty() {
|
||||
if self.must_reduce {
|
||||
*documents_ids -= &docids;
|
||||
}
|
||||
|
||||
if level == 0 {
|
||||
return Some(Ok((left, docids)));
|
||||
}
|
||||
|
||||
let rtxn = self.rtxn;
|
||||
let db = self.db;
|
||||
let fid = self.field_id;
|
||||
let left = Included(left);
|
||||
let right = Included(right);
|
||||
|
||||
let result = if is_ascending {
|
||||
FacetRange::new(rtxn, db, fid, level - 1, left, right).map(Left)
|
||||
} else {
|
||||
FacetRevRange::new(rtxn, db, fid, level - 1, left, right).map(Right)
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(iter) => {
|
||||
self.level_iters.push((docids, iter));
|
||||
continue 'outer;
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
}
|
||||
self.level_iters.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
|
||||
use roaring::bitmap::RoaringBitmap;
|
||||
|
||||
pub(crate) use self::facet::ParserRule;
|
||||
pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator};
|
||||
pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator};
|
||||
pub use self::matching_words::MatchingWords;
|
||||
use self::query_tree::QueryTreeBuilder;
|
||||
use crate::error::FieldIdMapMissingEntry;
|
||||
|
@ -3,12 +3,13 @@ use std::collections::HashMap;
|
||||
|
||||
use chrono::Utc;
|
||||
use fst::IntoStreamer;
|
||||
use heed::types::{ByteSlice, Unit};
|
||||
use heed::types::ByteSlice;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::ClearDocuments;
|
||||
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
|
||||
use crate::heed_codec::facet::FacetStringLevelZeroValueCodec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::index::{db_name, main_key};
|
||||
use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
|
||||
@ -374,13 +375,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
drop(iter);
|
||||
|
||||
// We delete the documents ids that are under the facet field id values.
|
||||
remove_docids_from_facet_field_id_value_docids(
|
||||
remove_docids_from_facet_field_id_number_docids(
|
||||
self.wtxn,
|
||||
facet_id_f64_docids,
|
||||
&self.documents_ids,
|
||||
)?;
|
||||
|
||||
remove_docids_from_facet_field_id_value_docids(
|
||||
remove_docids_from_facet_field_id_string_docids(
|
||||
self.wtxn,
|
||||
facet_id_string_docids,
|
||||
&self.documents_ids,
|
||||
@ -419,15 +420,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>(
|
||||
fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
|
||||
wtxn: &'a mut heed::RwTxn,
|
||||
db: &heed::Database<C, Unit>,
|
||||
db: &heed::Database<C, DC>,
|
||||
field_id: FieldId,
|
||||
to_remove: &RoaringBitmap,
|
||||
convert: F,
|
||||
) -> heed::Result<()>
|
||||
where
|
||||
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
|
||||
C: heed::BytesDecode<'a, DItem = K>,
|
||||
DC: heed::BytesDecode<'a, DItem = V>,
|
||||
F: Fn(K) -> DocumentId,
|
||||
{
|
||||
let mut iter = db
|
||||
@ -436,7 +438,7 @@ where
|
||||
.remap_key_type::<C>();
|
||||
|
||||
while let Some(result) = iter.next() {
|
||||
let (key, ()) = result?;
|
||||
let (key, _) = result?;
|
||||
if to_remove.contains(convert(key)) {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
@ -446,7 +448,33 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_docids_from_facet_field_id_value_docids<'a, C>(
|
||||
fn remove_docids_from_facet_field_id_string_docids<'a, C>(
|
||||
wtxn: &'a mut heed::RwTxn,
|
||||
db: &heed::Database<C, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
|
||||
to_remove: &RoaringBitmap,
|
||||
) -> heed::Result<()>
|
||||
where
|
||||
C: heed::BytesDecode<'a> + heed::BytesEncode<'a>,
|
||||
{
|
||||
let mut iter = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, (original_value, mut docids)) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids -= to_remove;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let bytes = bytes.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&bytes, &(original_value, docids))? };
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_docids_from_facet_field_id_number_docids<'a, C>(
|
||||
wtxn: &'a mut heed::RwTxn,
|
||||
db: &heed::Database<C, CboRoaringBitmapCodec>,
|
||||
to_remove: &RoaringBitmap,
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::cmp;
|
||||
use std::fs::File;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::num::{NonZeroU8, NonZeroUsize};
|
||||
use std::{cmp, mem};
|
||||
|
||||
use chrono::Utc;
|
||||
use grenad::{CompressionType, FileFuse, Reader, Writer};
|
||||
@ -10,7 +10,10 @@ use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::InternalError;
|
||||
use crate::heed_codec::facet::FacetLevelValueF64Codec;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec,
|
||||
};
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::update::index_documents::{
|
||||
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
|
||||
@ -64,24 +67,42 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
|
||||
for field_id in faceted_fields {
|
||||
// Clear the facet string levels.
|
||||
clear_field_string_levels(
|
||||
self.wtxn,
|
||||
self.index.facet_id_string_docids.remap_types::<ByteSlice, DecodeIgnore>(),
|
||||
field_id,
|
||||
)?;
|
||||
|
||||
// Compute and store the faceted strings documents ids.
|
||||
let string_documents_ids = compute_faceted_documents_ids(
|
||||
let string_documents_ids = compute_faceted_strings_documents_ids(
|
||||
self.wtxn,
|
||||
self.index.facet_id_string_docids.remap_key_type::<ByteSlice>(),
|
||||
field_id,
|
||||
)?;
|
||||
|
||||
let facet_string_levels = compute_facet_string_levels(
|
||||
self.wtxn,
|
||||
self.index.facet_id_string_docids,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.level_group_size,
|
||||
self.min_level_size,
|
||||
field_id,
|
||||
)?;
|
||||
|
||||
// Clear the facet number levels.
|
||||
clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?;
|
||||
|
||||
// Compute and store the faceted numbers documents ids.
|
||||
let number_documents_ids = compute_faceted_documents_ids(
|
||||
let number_documents_ids = compute_faceted_numbers_documents_ids(
|
||||
self.wtxn,
|
||||
self.index.facet_id_f64_docids.remap_key_type::<ByteSlice>(),
|
||||
field_id,
|
||||
)?;
|
||||
|
||||
let content = compute_facet_number_levels(
|
||||
let facet_number_levels = compute_facet_number_levels(
|
||||
self.wtxn,
|
||||
self.index.facet_id_f64_docids,
|
||||
self.chunk_compression_type,
|
||||
@ -106,8 +127,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.facet_id_f64_docids.as_polymorph(),
|
||||
content,
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "facet number level" }),
|
||||
facet_number_levels,
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }),
|
||||
WriteMethod::GetMergePut,
|
||||
)?;
|
||||
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.facet_id_string_docids.as_polymorph(),
|
||||
facet_string_levels,
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }),
|
||||
WriteMethod::GetMergePut,
|
||||
)?;
|
||||
}
|
||||
@ -193,21 +222,6 @@ fn compute_facet_number_levels<'t>(
|
||||
writer_into_reader(writer, shrink_size)
|
||||
}
|
||||
|
||||
fn compute_faceted_documents_ids(
|
||||
rtxn: &heed::RoTxn,
|
||||
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
|
||||
for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
|
||||
let (_key, docids) = result?;
|
||||
documents_ids |= docids;
|
||||
}
|
||||
|
||||
Ok(documents_ids)
|
||||
}
|
||||
|
||||
fn write_number_entry(
|
||||
writer: &mut Writer<File>,
|
||||
field_id: FieldId,
|
||||
@ -222,3 +236,129 @@ fn write_number_entry(
|
||||
writer.insert(&key, &data)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_faceted_strings_documents_ids(
|
||||
rtxn: &heed::RoTxn,
|
||||
db: heed::Database<ByteSlice, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
|
||||
field_id: FieldId,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
|
||||
let (_key, (_original_value, docids)) = result?;
|
||||
documents_ids |= docids;
|
||||
}
|
||||
|
||||
Ok(documents_ids)
|
||||
}
|
||||
|
||||
fn compute_faceted_numbers_documents_ids(
|
||||
rtxn: &heed::RoTxn,
|
||||
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
|
||||
let (_key, docids) = result?;
|
||||
documents_ids |= docids;
|
||||
}
|
||||
|
||||
Ok(documents_ids)
|
||||
}
|
||||
|
||||
fn clear_field_string_levels<'t>(
|
||||
wtxn: &'t mut heed::RwTxn,
|
||||
db: heed::Database<ByteSlice, DecodeIgnore>,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<()> {
|
||||
let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN);
|
||||
let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX);
|
||||
let range = left..=right;
|
||||
db.remap_key_type::<FacetLevelValueU32Codec>().delete_range(wtxn, &range).map(drop)
|
||||
}
|
||||
|
||||
fn compute_facet_string_levels<'t>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: heed::Database<
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
|
||||
>,
|
||||
compression_type: CompressionType,
|
||||
compression_level: Option<u32>,
|
||||
shrink_size: Option<u64>,
|
||||
level_group_size: NonZeroUsize,
|
||||
min_level_size: NonZeroUsize,
|
||||
field_id: FieldId,
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
let first_level_size = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(rtxn, &field_id.to_be_bytes())?
|
||||
.remap_types::<DecodeIgnore, DecodeIgnore>()
|
||||
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
||||
|
||||
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
|
||||
// therefore we write the facet levels entries into a grenad file before transfering them.
|
||||
let mut writer = tempfile::tempfile()
|
||||
.and_then(|file| create_writer(compression_type, compression_level, file))?;
|
||||
|
||||
// Groups sizes are always a power of the original level_group_size and therefore a group
|
||||
// always maps groups of the previous level and never splits previous levels groups in half.
|
||||
let group_size_iter = (1u8..)
|
||||
.map(|l| (l, level_group_size.get().pow(l as u32)))
|
||||
.take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
|
||||
|
||||
for (level, group_size) in group_size_iter {
|
||||
let level = NonZeroU8::new(level).unwrap();
|
||||
let mut left = (0, "");
|
||||
let mut right = (0, "");
|
||||
let mut group_docids = RoaringBitmap::new();
|
||||
|
||||
// Because we know the size of the level 0 we can use a range iterator that starts
|
||||
// at the first value of the level and goes to the last by simply counting.
|
||||
for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() {
|
||||
let ((_field_id, value), (_original_value, docids)) = result?;
|
||||
|
||||
if i == 0 {
|
||||
left = (i as u32, value);
|
||||
} else if i % group_size == 0 {
|
||||
// we found the first bound of the next group, we must store the left
|
||||
// and right bounds associated with the docids. We also reset the docids.
|
||||
let docids = mem::take(&mut group_docids);
|
||||
write_string_entry(&mut writer, field_id, level, left, right, docids)?;
|
||||
|
||||
// We save the left bound for the new group.
|
||||
left = (i as u32, value);
|
||||
}
|
||||
|
||||
// The right bound is always the bound we run through.
|
||||
group_docids |= docids;
|
||||
right = (i as u32, value);
|
||||
}
|
||||
|
||||
if !group_docids.is_empty() {
|
||||
let docids = mem::take(&mut group_docids);
|
||||
write_string_entry(&mut writer, field_id, level, left, right, docids)?;
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer, shrink_size)
|
||||
}
|
||||
|
||||
fn write_string_entry(
|
||||
writer: &mut Writer<File>,
|
||||
field_id: FieldId,
|
||||
level: NonZeroU8,
|
||||
(left_id, left_value): (u32, &str),
|
||||
(right_id, right_value): (u32, &str),
|
||||
docids: RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
let key = (field_id, level, left_id, right_id);
|
||||
let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?;
|
||||
let data = match level.get() {
|
||||
1 => (Some((left_value, right_value)), docids),
|
||||
_ => (None, docids),
|
||||
};
|
||||
let data = FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&data)
|
||||
.ok_or(Error::Encoding)?;
|
||||
writer.insert(&key, &data)?;
|
||||
Ok(())
|
||||
}
|
||||
|
@ -2,8 +2,11 @@ use std::borrow::Cow;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use fst::IntoStreamer;
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::facet::FacetStringLevelZeroValueCodec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::Result;
|
||||
|
||||
@ -69,6 +72,26 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>
|
||||
Ok(vec)
|
||||
}
|
||||
|
||||
/// Uses the FacetStringLevelZeroValueCodec to merge the values.
|
||||
pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
let (head, tail) = values.split_first().unwrap();
|
||||
let (head_string, mut head_rb) =
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&head[..])
|
||||
.ok_or(SerializationError::Decoding { db_name: None })?;
|
||||
|
||||
for value in tail {
|
||||
let (_string, rb) =
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&value[..])
|
||||
.ok_or(SerializationError::Decoding { db_name: None })?;
|
||||
head_rb |= rb;
|
||||
}
|
||||
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&(head_string, head_rb))
|
||||
.map(|cow| cow.into_owned())
|
||||
.ok_or(SerializationError::Encoding { db_name: None })
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
let (head, tail) = values.split_first().unwrap();
|
||||
let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;
|
||||
|
@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
pub use self::merge_function::{
|
||||
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
|
||||
tuple_string_cbo_roaring_bitmap_merge,
|
||||
};
|
||||
use self::store::{Readers, Store};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
@ -655,7 +656,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
self.wtxn,
|
||||
*self.index.facet_id_string_docids.as_polymorph(),
|
||||
facet_field_strings_docids_readers,
|
||||
cbo_roaring_bitmap_merge,
|
||||
tuple_string_cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
|
@ -22,12 +22,13 @@ use tempfile::tempfile;
|
||||
|
||||
use super::merge_function::{
|
||||
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
|
||||
tuple_string_cbo_roaring_bitmap_merge,
|
||||
};
|
||||
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
|
||||
use crate::error::{Error, InternalError, SerializationError};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
|
||||
FieldDocIdFacetStringCodec,
|
||||
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
|
||||
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||
};
|
||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::update::UpdateIndexingStep;
|
||||
@ -65,7 +66,7 @@ pub struct Store<'s, A> {
|
||||
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids_limit: usize,
|
||||
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
|
||||
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
|
||||
facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>,
|
||||
facet_field_value_docids_limit: usize,
|
||||
// MTBL parameters
|
||||
chunk_compression_type: CompressionType,
|
||||
@ -153,7 +154,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
max_memory,
|
||||
);
|
||||
let facet_field_strings_docids_sorter = create_sorter(
|
||||
cbo_roaring_bitmap_merge,
|
||||
tuple_string_cbo_roaring_bitmap_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
@ -283,21 +284,33 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
fn insert_facet_string_values_docid(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
value: String,
|
||||
normalized_value: String,
|
||||
original_value: String,
|
||||
id: DocumentId,
|
||||
) -> Result<()> {
|
||||
let sorter = &mut self.field_id_docid_facet_strings_sorter;
|
||||
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
|
||||
if normalized_value.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let key = (field_id, value);
|
||||
let sorter = &mut self.field_id_docid_facet_strings_sorter;
|
||||
Self::write_field_id_docid_facet_string_value(
|
||||
sorter,
|
||||
field_id,
|
||||
id,
|
||||
&normalized_value,
|
||||
&original_value,
|
||||
)?;
|
||||
|
||||
let key = (field_id, normalized_value);
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.facet_field_string_docids.get_refresh(&key) {
|
||||
Some(old) => {
|
||||
Some((_original_value, old)) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is appended at the end of the linked hash map.
|
||||
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
|
||||
self.facet_field_string_docids
|
||||
.insert(key, (original_value, RoaringBitmap::from_iter(Some(id))));
|
||||
// If the word docids just reached its capacity we must make sure to remove
|
||||
// one element, this way next time we insert we don't grow the capacity.
|
||||
if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit {
|
||||
@ -359,7 +372,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
document_id: DocumentId,
|
||||
words_positions: &mut HashMap<String, SmallVec32<Position>>,
|
||||
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
|
||||
facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
|
||||
facet_strings_values: &mut HashMap<FieldId, Vec<(String, String)>>,
|
||||
record: &[u8],
|
||||
) -> Result<()> {
|
||||
// We compute the list of words pairs proximities (self-join) and write it directly to disk.
|
||||
@ -395,8 +408,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
|
||||
// We store document_id associated with all the facet strings fields ids and values.
|
||||
for (field, values) in facet_strings_values.drain() {
|
||||
for value in values {
|
||||
self.insert_facet_string_values_docid(field, value, document_id)?;
|
||||
for (normalized, original) in values {
|
||||
self.insert_facet_string_values_docid(field, normalized, original, document_id)?;
|
||||
}
|
||||
}
|
||||
|
||||
@ -512,23 +525,24 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
|
||||
fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>,
|
||||
I: IntoIterator<Item = ((FieldId, String), (String, RoaringBitmap))>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut data_buffer = Vec::new();
|
||||
|
||||
for ((field_id, value), docids) in iter {
|
||||
for ((field_id, normalized_value), (original_value, docids)) in iter {
|
||||
key_buffer.clear();
|
||||
data_buffer.clear();
|
||||
|
||||
FacetValueStringCodec::serialize_into(field_id, &value, &mut key_buffer);
|
||||
CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer);
|
||||
FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer);
|
||||
|
||||
let data = (original_value.as_str(), docids);
|
||||
let data = FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&data)
|
||||
.ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?;
|
||||
|
||||
if lmdb_key_valid_size(&key_buffer) {
|
||||
sorter.insert(&key_buffer, &data_buffer)?;
|
||||
sorter.insert(&key_buffer, &data)?;
|
||||
} else {
|
||||
warn!("facet value {:?} is too large to be saved", value);
|
||||
warn!("facet value {:?} is too large to be saved", original_value);
|
||||
}
|
||||
}
|
||||
|
||||
@ -583,19 +597,24 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
field_id: FieldId,
|
||||
document_id: DocumentId,
|
||||
value: &str,
|
||||
normalized_value: &str,
|
||||
original_value: &str,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
FieldDocIdFacetStringCodec::serialize_into(field_id, document_id, value, &mut buffer);
|
||||
FieldDocIdFacetStringCodec::serialize_into(
|
||||
field_id,
|
||||
document_id,
|
||||
normalized_value,
|
||||
&mut buffer,
|
||||
);
|
||||
|
||||
if lmdb_key_valid_size(&buffer) {
|
||||
sorter.insert(&buffer, &[])?;
|
||||
sorter.insert(&buffer, original_value.as_bytes())?;
|
||||
} else {
|
||||
warn!("facet value {:?} is too large to be saved", value);
|
||||
warn!("facet value {:?} is too large to be saved", original_value);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -925,24 +944,24 @@ fn process_tokens<'a>(
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
|
||||
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
output_numbers: &mut Vec<f64>,
|
||||
output_strings: &mut Vec<String>,
|
||||
output_strings: &mut Vec<(String, String)>,
|
||||
) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push(b.to_string()),
|
||||
Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(string) => {
|
||||
let string = string.trim().to_lowercase();
|
||||
output_strings.push(string);
|
||||
Value::String(original) => {
|
||||
let normalized = original.trim().to_lowercase();
|
||||
output_strings.push((normalized, original.clone()));
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
|
@ -276,8 +276,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
match self.searchable_fields {
|
||||
Setting::Set(ref fields) => {
|
||||
// every time the searchable attributes are updated, we need to update the
|
||||
// ids for any settings that uses the facets. (displayed_fields,
|
||||
// filterable_fields)
|
||||
// ids for any settings that uses the facets. (distinct_fields, filterable_fields).
|
||||
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
|
||||
let mut new_fields_ids_map = FieldsIdsMap::new();
|
||||
|
Loading…
Reference in New Issue
Block a user