Display the original facet string value from the linear facet database

This commit is contained in:
Kerollmops 2021-07-15 10:19:35 +02:00
parent d23c250ad5
commit 03a01166ba
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
7 changed files with 108 additions and 81 deletions

View File

@ -9,13 +9,13 @@ impl FieldDocIdFacetStringCodec {
pub fn serialize_into( pub fn serialize_into(
field_id: FieldId, field_id: FieldId,
document_id: DocumentId, document_id: DocumentId,
value: &str, normalized_value: &str,
out: &mut Vec<u8>, out: &mut Vec<u8>,
) { ) {
out.reserve(2 + 4 + value.len()); out.reserve(2 + 4 + normalized_value.len());
out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(&field_id.to_be_bytes());
out.extend_from_slice(&document_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes());
out.extend_from_slice(value.as_bytes()); out.extend_from_slice(normalized_value.as_bytes());
} }
} }
@ -29,17 +29,22 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
let document_id = u32::from_be_bytes(document_id_bytes); let document_id = u32::from_be_bytes(document_id_bytes);
let value = str::from_utf8(bytes).ok()?; let normalized_value = str::from_utf8(bytes).ok()?;
Some((field_id, document_id, value)) Some((field_id, document_id, normalized_value))
} }
} }
impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec {
type EItem = (FieldId, DocumentId, &'a str); type EItem = (FieldId, DocumentId, &'a str);
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> { fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::new(); let mut bytes = Vec::new();
FieldDocIdFacetStringCodec::serialize_into(*field_id, *document_id, value, &mut bytes); FieldDocIdFacetStringCodec::serialize_into(
*field_id,
*document_id,
normalized_value,
&mut bytes,
);
Some(Cow::Owned(bytes)) Some(Cow::Owned(bytes))
} }
} }

View File

@ -96,7 +96,7 @@ pub struct Index {
/// Maps the document id, the facet field id and the numbers. /// Maps the document id, the facet field id and the numbers.
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>, pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
/// Maps the document id, the facet field id and the strings. /// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Unit>, pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps the document id to the document as an obkv store. /// Maps the document id to the document as an obkv store.
pub documents: Database<OwnedType<BEU32>, ObkvCodec>, pub documents: Database<OwnedType<BEU32>, ObkvCodec>,

View File

@ -1,6 +1,6 @@
use std::mem::size_of; use std::mem::size_of;
use heed::types::ByteSlice; use heed::types::{ByteSlice, Str, Unit};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{Distinct, DocIter}; use super::{Distinct, DocIter};
@ -127,7 +127,7 @@ fn facet_number_values<'a>(
distinct: FieldId, distinct: FieldId,
index: &Index, index: &Index,
txn: &'a heed::RoTxn, txn: &'a heed::RoTxn,
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, heed::types::Unit>> { ) -> Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, Unit>> {
let key = facet_values_prefix_key(distinct, id); let key = facet_values_prefix_key(distinct, id);
let iter = index let iter = index
@ -144,14 +144,14 @@ fn facet_string_values<'a>(
distinct: FieldId, distinct: FieldId,
index: &Index, index: &Index,
txn: &'a heed::RoTxn, txn: &'a heed::RoTxn,
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, heed::types::Unit>> { ) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, Str>> {
let key = facet_values_prefix_key(distinct, id); let key = facet_values_prefix_key(distinct, id);
let iter = index let iter = index
.field_id_docid_facet_strings .field_id_docid_facet_strings
.remap_key_type::<ByteSlice>() .remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)? .prefix_iter(txn, &key)?
.remap_key_type::<FieldDocIdFacetStringCodec>(); .remap_types::<FieldDocIdFacetStringCodec, Str>();
Ok(iter) Ok(iter)
} }

View File

@ -2,15 +2,16 @@ use std::collections::{BTreeMap, HashSet};
use std::ops::Bound::Unbounded; use std::ops::Bound::Unbounded;
use std::{cmp, fmt, mem}; use std::{cmp, fmt, mem};
use heed::types::{ByteSlice, Unit}; use heed::types::ByteSlice;
use heed::{BytesDecode, Database};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::error::{FieldIdMapMissingEntry, UserError};
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::heed_codec::facet::FacetStringLevelZeroCodec; use crate::heed_codec::facet::{
FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
};
use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter};
use crate::{DocumentId, FieldId, Index, Result}; use crate::{FieldId, Index, Result};
/// The default number of values by facets that will /// The default number of values by facets that will
/// be fetched from the key-value store. /// be fetched from the key-value store.
@ -67,46 +68,55 @@ impl<'a> FacetDistribution<'a> {
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
distribution: &mut BTreeMap<String, u64>, distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> { ) -> heed::Result<()> {
fn fetch_facet_values<'t, KC, K: 't>( match facet_type {
rtxn: &'t heed::RoTxn, FacetType::Number => {
db: Database<KC, Unit>,
field_id: FieldId,
candidates: &RoaringBitmap,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
where
K: fmt::Display,
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
{
let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();
let db = self.index.field_id_docid_facet_f64s;
for docid in candidates.into_iter() { for docid in candidates.into_iter() {
key_buffer.truncate(mem::size_of::<FieldId>()); key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes()); key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db let iter = db
.remap_key_type::<ByteSlice>() .remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &key_buffer)? .prefix_iter(self.rtxn, &key_buffer)?
.remap_key_type::<KC>(); .remap_key_type::<FieldDocIdFacetF64Codec>();
for result in iter { for result in iter {
let ((_, _, value), ()) = result?; let ((_, _, value), ()) = result?;
*distribution.entry(value.to_string()).or_insert(0) += 1; *distribution.entry(value.to_string()).or_insert(0) += 1;
} }
} }
Ok(())
}
match facet_type {
FacetType::Number => {
let db = self.index.field_id_docid_facet_f64s;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
} }
FacetType::String => { FacetType::String => {
let mut normalized_distribution = BTreeMap::new();
let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();
let db = self.index.field_id_docid_facet_strings; let db = self.index.field_id_docid_facet_strings;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) for docid in candidates.into_iter() {
key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db
.remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &key_buffer)?
.remap_key_type::<FieldDocIdFacetStringCodec>();
for result in iter {
let ((_, _, normalized_value), original_value) = result?;
let (_, count) = normalized_distribution
.entry(normalized_value)
.or_insert_with(|| (original_value, 0));
*count += 1;
} }
} }
let iter = normalized_distribution
.into_iter()
.map(|(_normalized, (original, count))| (original.to_string(), count));
distribution.extend(iter);
}
}
Ok(())
} }
/// There are too many documents, we use the facet levels to move through /// There are too many documents, we use the facet levels to move through
@ -227,7 +237,6 @@ impl<'a> FacetDistribution<'a> {
&mut distribution, &mut distribution,
)?; )?;
} }
Ok(distribution) Ok(distribution)
} }
None => self.facet_values_from_raw_facet_database(field_id), None => self.facet_values_from_raw_facet_database(field_id),

View File

@ -3,7 +3,7 @@ use std::collections::HashMap;
use chrono::Utc; use chrono::Utc;
use fst::IntoStreamer; use fst::IntoStreamer;
use heed::types::{ByteSlice, Unit}; use heed::types::ByteSlice;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
@ -419,15 +419,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
} }
} }
fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
wtxn: &'a mut heed::RwTxn, wtxn: &'a mut heed::RwTxn,
db: &heed::Database<C, Unit>, db: &heed::Database<C, DC>,
field_id: FieldId, field_id: FieldId,
to_remove: &RoaringBitmap, to_remove: &RoaringBitmap,
convert: F, convert: F,
) -> heed::Result<()> ) -> heed::Result<()>
where where
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, C: heed::BytesDecode<'a, DItem = K>,
DC: heed::BytesDecode<'a, DItem = V>,
F: Fn(K) -> DocumentId, F: Fn(K) -> DocumentId,
{ {
let mut iter = db let mut iter = db
@ -436,7 +437,7 @@ where
.remap_key_type::<C>(); .remap_key_type::<C>();
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (key, ()) = result?; let (key, _) = result?;
if to_remove.contains(convert(key)) { if to_remove.contains(convert(key)) {
// safety: we don't keep references from inside the LMDB database. // safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? }; unsafe { iter.del_current()? };

View File

@ -65,7 +65,7 @@ pub struct Store<'s, A> {
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids_limit: usize, words_pairs_proximities_docids_limit: usize,
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>,
facet_field_value_docids_limit: usize, facet_field_value_docids_limit: usize,
// MTBL parameters // MTBL parameters
chunk_compression_type: CompressionType, chunk_compression_type: CompressionType,
@ -283,25 +283,33 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn insert_facet_string_values_docid( fn insert_facet_string_values_docid(
&mut self, &mut self,
field_id: FieldId, field_id: FieldId,
value: String, normalized_value: String,
original_value: String,
id: DocumentId, id: DocumentId,
) -> Result<()> { ) -> Result<()> {
if value.is_empty() { if normalized_value.is_empty() {
return Ok(()); return Ok(());
} }
let sorter = &mut self.field_id_docid_facet_strings_sorter; let sorter = &mut self.field_id_docid_facet_strings_sorter;
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; Self::write_field_id_docid_facet_string_value(
sorter,
field_id,
id,
&normalized_value,
&original_value,
)?;
let key = (field_id, value); let key = (field_id, normalized_value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map. // if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_string_docids.get_refresh(&key) { match self.facet_field_string_docids.get_refresh(&key) {
Some(old) => { Some((_original_value, old)) => {
old.insert(id); old.insert(id);
} }
None => { None => {
// A newly inserted element is appended at the end of the linked hash map. // A newly inserted element is appended at the end of the linked hash map.
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); self.facet_field_string_docids
.insert(key, (original_value, RoaringBitmap::from_iter(Some(id))));
// If the word docids just reached its capacity we must make sure to remove // If the word docids just reached its capacity we must make sure to remove
// one element, this way next time we insert we don't grow the capacity. // one element, this way next time we insert we don't grow the capacity.
if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit {
@ -363,7 +371,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
document_id: DocumentId, document_id: DocumentId,
words_positions: &mut HashMap<String, SmallVec32<Position>>, words_positions: &mut HashMap<String, SmallVec32<Position>>,
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>, facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
facet_strings_values: &mut HashMap<FieldId, Vec<String>>, facet_strings_values: &mut HashMap<FieldId, Vec<(String, String)>>,
record: &[u8], record: &[u8],
) -> Result<()> { ) -> Result<()> {
// We compute the list of words pairs proximities (self-join) and write it directly to disk. // We compute the list of words pairs proximities (self-join) and write it directly to disk.
@ -399,8 +407,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// We store document_id associated with all the facet strings fields ids and values. // We store document_id associated with all the facet strings fields ids and values.
for (field, values) in facet_strings_values.drain() { for (field, values) in facet_strings_values.drain() {
for value in values { for (normalized, original) in values {
self.insert_facet_string_values_docid(field, value, document_id)?; self.insert_facet_string_values_docid(field, normalized, original, document_id)?;
} }
} }
@ -516,23 +524,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()> fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where where
I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>, I: IntoIterator<Item = ((FieldId, String), (String, RoaringBitmap))>,
Error: From<E>, Error: From<E>,
{ {
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut data_buffer = Vec::new(); let mut data_buffer = Vec::new();
for ((field_id, value), docids) in iter { for ((field_id, normalized_value), (original_value, docids)) in iter {
key_buffer.clear(); key_buffer.clear();
data_buffer.clear(); data_buffer.clear();
FacetStringLevelZeroCodec::serialize_into(field_id, &value, &mut key_buffer); FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer);
CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer);
if lmdb_key_valid_size(&key_buffer) { if lmdb_key_valid_size(&key_buffer) {
sorter.insert(&key_buffer, &data_buffer)?; sorter.insert(&key_buffer, &data_buffer)?;
} else { } else {
warn!("facet value {:?} is too large to be saved", value); warn!("facet value {:?} is too large to be saved", original_value);
} }
} }
@ -587,19 +595,24 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
sorter: &mut Sorter<MergeFn<E>>, sorter: &mut Sorter<MergeFn<E>>,
field_id: FieldId, field_id: FieldId,
document_id: DocumentId, document_id: DocumentId,
value: &str, normalized_value: &str,
original_value: &str,
) -> Result<()> ) -> Result<()>
where where
Error: From<E>, Error: From<E>,
{ {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
FieldDocIdFacetStringCodec::serialize_into(
FieldDocIdFacetStringCodec::serialize_into(field_id, document_id, value, &mut buffer); field_id,
document_id,
normalized_value,
&mut buffer,
);
if lmdb_key_valid_size(&buffer) { if lmdb_key_valid_size(&buffer) {
sorter.insert(&buffer, &[])?; sorter.insert(&buffer, original_value.as_bytes())?;
} else { } else {
warn!("facet value {:?} is too large to be saved", value); warn!("facet value {:?} is too large to be saved", original_value);
} }
Ok(()) Ok(())
@ -929,24 +942,24 @@ fn process_tokens<'a>(
.filter(|(_, t)| t.is_word()) .filter(|(_, t)| t.is_word())
} }
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) { fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
fn inner_extract_facet_values( fn inner_extract_facet_values(
value: &Value, value: &Value,
can_recurse: bool, can_recurse: bool,
output_numbers: &mut Vec<f64>, output_numbers: &mut Vec<f64>,
output_strings: &mut Vec<String>, output_strings: &mut Vec<(String, String)>,
) { ) {
match value { match value {
Value::Null => (), Value::Null => (),
Value::Bool(b) => output_strings.push(b.to_string()), Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
Value::Number(number) => { Value::Number(number) => {
if let Some(float) = number.as_f64() { if let Some(float) = number.as_f64() {
output_numbers.push(float); output_numbers.push(float);
} }
} }
Value::String(string) => { Value::String(original) => {
let string = string.trim().to_lowercase(); let normalized = original.trim().to_lowercase();
output_strings.push(string); output_strings.push((normalized, original.clone()));
} }
Value::Array(values) => { Value::Array(values) => {
if can_recurse { if can_recurse {

View File

@ -276,8 +276,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
match self.searchable_fields { match self.searchable_fields {
Setting::Set(ref fields) => { Setting::Set(ref fields) => {
// every time the searchable attributes are updated, we need to update the // every time the searchable attributes are updated, we need to update the
// ids for any settings that uses the facets. (displayed_fields, // ids for any settings that uses the facets. (distinct_fields, filterable_fields).
// filterable_fields)
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut new_fields_ids_map = FieldsIdsMap::new(); let mut new_fields_ids_map = FieldsIdsMap::new();