Move the facet extraction to dedicated modules

This commit is contained in:
Clément Renault 2024-09-05 10:32:22 +02:00
parent 34f11e3380
commit 0fc02f7351
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 271 additions and 237 deletions

View File

@ -0,0 +1,137 @@
use std::collections::HashSet;
use heed::RoTxn;
use serde_json::Value;
use super::FacetedExtractor;
use crate::facet::value_encoding::f64_into_bytes;
use crate::{normalize_facet, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
/// Extractor building the keys of the facet entries for numeric values.
pub struct FieldIdFacetNumberDocidsExtractor;

impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the level-0 key of a numeric `value` to `output` and returns the buffer content.
    ///
    /// Returns `None`, without touching `output`, when `value` is not a number, cannot be
    /// read as an `f64`, or when `f64_into_bytes` returns `None`.
    ///
    /// The bytes are appended: the returned slice also contains whatever `output` already held.
    fn build_key<'b>(
        field_id: FieldId,
        value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        let n = value.as_number()?.as_f64()?;
        let ordered = f64_into_bytes(n)?;

        // Key layout: fid - level - ordered f64 - original f64.
        output.extend_from_slice(&field_id.to_be_bytes());
        // Level 0 is encoded as the byte `0`.
        output.push(0);
        output.extend_from_slice(&ordered);
        output.extend_from_slice(&n.to_be_bytes());

        Some(&*output)
    }
}
/// Extractor building the keys of the facet entries for string values.
pub struct FieldIdFacetStringDocidsExtractor;

impl FacetedExtractor for FieldIdFacetStringDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the level-0 key of a string `value` to `output` and returns the buffer content.
    ///
    /// The string is normalized with `normalize_facet` and then shortened with `truncate_str`
    /// before being written. Returns `None`, without touching `output`, when `value` is not
    /// a string.
    ///
    /// The bytes are appended: the returned slice also contains whatever `output` already held.
    fn build_key<'b>(
        field_id: FieldId,
        value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        let normalized = normalize_facet(value.as_str()?);
        let truncated = truncate_str(&normalized);

        // Key layout: fid - level - normalized string.
        output.extend_from_slice(&field_id.to_be_bytes());
        // Level 0 is encoded as the byte `0`.
        output.push(0);
        output.extend_from_slice(truncated.as_bytes());

        Some(&*output)
    }
}
/// Truncates a string to the biggest valid LMDB key size.
fn truncate_str(s: &str) -> &str {
let index = s
.char_indices()
.map(|(idx, _)| idx)
.chain(std::iter::once(s.len()))
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
.last();
&s[..index.unwrap_or(0)]
}
/// Extractor emitting a key for every field whose value is JSON `null`.
pub struct FieldIdFacetIsNullDocidsExtractor;

impl FacetedExtractor for FieldIdFacetIsNullDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the big-endian field id to `output` when `value` is null.
    ///
    /// Returns `None`, without touching `output`, for any non-null value.
    fn build_key<'b>(
        field_id: FieldId,
        value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        if !value.is_null() {
            return None;
        }

        let fid_bytes = field_id.to_be_bytes();
        output.extend_from_slice(&fid_bytes);
        Some(output.as_slice())
    }
}
/// Extractor emitting a key for every visited field, whatever its value.
pub struct FieldIdFacetExistsDocidsExtractor;

impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the big-endian field id to `output`; the value itself is ignored.
    ///
    /// Always returns `Some` with the content of `output`.
    fn build_key<'b>(
        field_id: FieldId,
        _value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        let fid_bytes = field_id.to_be_bytes();
        output.extend_from_slice(&fid_bytes);
        Some(output.as_slice())
    }
}
/// Extractor emitting a key for every field holding an empty string, array or object.
pub struct FieldIdFacetIsEmptyDocidsExtractor;

impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the big-endian field id to `output` when `value` is an empty container.
    ///
    /// Empty strings, arrays and objects count as empty; null, booleans and numbers never do.
    /// Returns `None`, without touching `output`, when the value is not empty.
    fn build_key<'b>(
        field_id: FieldId,
        value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        let empty = match value {
            Value::String(text) => text.is_empty(),
            Value::Array(items) => items.is_empty(),
            Value::Object(map) => map.is_empty(),
            Value::Null | Value::Bool(_) | Value::Number(_) => false,
        };
        if !empty {
            return None;
        }

        let fid_bytes = field_id.to_be_bytes();
        output.extend_from_slice(&fid_bytes);
        Some(output.as_slice())
    }
}

View File

@ -0,0 +1,51 @@
use serde_json::Value;
use crate::update::new::KvReaderFieldId;
use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};
/// Visits every field of `obkv` and calls `facet_fn` on each leaf value selected for extraction.
///
/// For each `(field id, bytes)` entry, the field name is looked up in `field_id_map`. Fields
/// accepted by `perm_json_p::select_field` are parsed as JSON; objects and arrays are walked
/// with the `perm_json_p::seek_leaf_values_in_*` helpers, any other value is forwarded as is.
/// Every name handed to the callback is resolved, or inserted, in `field_id_map` first.
///
/// # Errors
///
/// - `InternalError::SerdeJson` when the bytes of a selected field are not valid JSON.
/// - `UserError::AttributeLimitReached` when `field_id_map.id_or_insert` returns `None`.
/// - Any error returned by `facet_fn`.
///
/// # Panics
///
/// Panics when a field id of `obkv` is unknown to `field_id_map`.
pub fn extract_document_facets(
    attributes_to_extract: &[&str],
    obkv: &KvReaderFieldId,
    field_id_map: &mut GlobalFieldsIdsMap,
    facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
) -> Result<()> {
    // Owned copy of the current field name, reused across iterations so that the borrow
    // on `field_id_map` is released before the map is used again below.
    let mut name_buffer = String::new();

    for (field_id, field_bytes) in obkv {
        let Some(stored_name) = field_id_map.name(field_id) else {
            unreachable!("field id not found in field id map");
        };
        name_buffer.clear();
        name_buffer.push_str(stored_name);
        let field_name = name_buffer.as_str();

        // Skip fields that neither are nor contain an attribute to extract.
        if !perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
            continue;
        }

        // Resolves the (possibly nested) attribute name into an id, then hands the
        // leaf value over to the caller.
        let mut emit_leaf = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
            Some(leaf_id) => facet_fn(leaf_id, value),
            None => Err(UserError::AttributeLimitReached.into()),
        };

        let parsed: Value =
            serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
        match parsed {
            Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
                &object,
                Some(attributes_to_extract),
                &[], // skip no attributes
                field_name,
                &mut emit_leaf,
            )?,
            Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
                &array,
                Some(attributes_to_extract),
                &[], // skip no attributes
                field_name,
                &mut emit_leaf,
            )?,
            leaf => emit_leaf(field_name, &leaf)?,
        }
    }

    Ok(())
}

View File

@ -1,20 +1,19 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::fmt::Debug;
use std::fs::File; use std::fs::File;
use grenad::Merger; use grenad::{MergeFunction, Merger};
use heed::RoTxn; use heed::RoTxn;
use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::iter::{IntoParallelIterator, ParallelIterator};
use serde_json::Value; use serde_json::Value;
use super::cache::CboCachedSorter; use super::cache::CboCachedSorter;
use super::perm_json_p; use crate::update::new::{DocumentChange, ItemsPool};
use crate::facet::value_encoding::f64_into_bytes;
use crate::update::new::{DocumentChange, ItemsPool, KvReaderFieldId};
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
use crate::{ use crate::{DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result};
normalize_facet, FieldId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError,
MAX_FACET_VALUE_LENGTH, mod extract_facets;
}; mod facet_document;
pub trait FacetedExtractor { pub trait FacetedExtractor {
fn run_extraction( fn run_extraction(
@ -74,6 +73,27 @@ pub trait FacetedExtractor {
Ok(builder.build()) Ok(builder.build())
} }
// TODO Shorten this
/// Builds the key of `value` for field `fid` and registers `docid` under it through `cache_fn`.
///
/// `buffer` is cleared, then filled by `Self::build_key`. When no key is produced the
/// function does nothing and returns `Ok(())`.
///
/// # Panics
///
/// Panics when `cache_fn` returns an error (see the TODO below).
fn facet_fn_with_options<MF>(
    buffer: &mut Vec<u8>,
    cached_sorter: &mut CboCachedSorter<MF>,
    cache_fn: impl Fn(&mut CboCachedSorter<MF>, &[u8], u32) -> grenad::Result<(), MF::Error>,
    docid: DocumentId,
    fid: FieldId,
    value: &Value,
) -> Result<()>
where
    MF: MergeFunction,
    MF::Error: Debug,
{
    buffer.clear();
    if let Some(key) = Self::build_key(fid, value, buffer) {
        // TODO manage errors
        cache_fn(cached_sorter, key, docid).unwrap();
    }
    Ok(())
}
fn extract_document_change( fn extract_document_change(
rtxn: &RoTxn, rtxn: &RoTxn,
index: &Index, index: &Index,
@ -84,73 +104,69 @@ pub trait FacetedExtractor {
document_change: DocumentChange, document_change: DocumentChange,
) -> Result<()> { ) -> Result<()> {
match document_change { match document_change {
DocumentChange::Deletion(inner) => { DocumentChange::Deletion(inner) => facet_document::extract_document_facets(
let mut facet_del_fn = |fid, value: &Value| -> Result<()> {
buffer.clear();
match Self::build_key(fid, value, buffer) {
// TODO manage errors
Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()),
None => Ok(()),
}
};
extract_document_facets(
attributes_to_extract, attributes_to_extract,
inner.current(rtxn, index)?.unwrap(), inner.current(rtxn, index)?.unwrap(),
fields_ids_map, fields_ids_map,
&mut facet_del_fn, &mut |fid, value| {
Self::facet_fn_with_options(
buffer,
cached_sorter,
CboCachedSorter::insert_del_u32,
inner.docid(),
fid,
value,
) )
} },
),
DocumentChange::Update(inner) => { DocumentChange::Update(inner) => {
let mut facet_del_fn = |fid, value: &Value| -> Result<()> { facet_document::extract_document_facets(
buffer.clear();
match Self::build_key(fid, value, buffer) {
// TODO manage errors
Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()),
None => Ok(()),
}
};
extract_document_facets(
attributes_to_extract, attributes_to_extract,
inner.current(rtxn, index)?.unwrap(), inner.current(rtxn, index)?.unwrap(),
fields_ids_map, fields_ids_map,
&mut facet_del_fn, &mut |fid, value| {
Self::facet_fn_with_options(
buffer,
cached_sorter,
CboCachedSorter::insert_del_u32,
inner.docid(),
fid,
value,
)
},
)?; )?;
let mut facet_add_fn = |fid, value: &Value| -> Result<()> { facet_document::extract_document_facets(
buffer.clear();
match Self::build_key(fid, value, buffer) {
// TODO manage errors
Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()),
None => Ok(()),
}
};
extract_document_facets(
attributes_to_extract, attributes_to_extract,
inner.new(), inner.new(),
fields_ids_map, fields_ids_map,
&mut facet_add_fn, &mut |fid, value| {
Self::facet_fn_with_options(
buffer,
cached_sorter,
CboCachedSorter::insert_add_u32,
inner.docid(),
fid,
value,
)
},
) )
} }
DocumentChange::Insertion(inner) => { DocumentChange::Insertion(inner) => facet_document::extract_document_facets(
let mut facet_add_fn = |fid, value: &Value| -> Result<()> {
buffer.clear();
match Self::build_key(fid, value, buffer) {
// TODO manage errors
Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()),
None => Ok(()),
}
};
extract_document_facets(
attributes_to_extract, attributes_to_extract,
inner.new(), inner.new(),
fields_ids_map, fields_ids_map,
&mut facet_add_fn, &mut |fid, value| {
Self::facet_fn_with_options(
buffer,
cached_sorter,
CboCachedSorter::insert_add_u32,
inner.docid(),
fid,
value,
) )
} },
),
} }
} }
@ -160,174 +176,3 @@ pub trait FacetedExtractor {
fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec<u8>) fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec<u8>)
-> Option<&'b [u8]>; -> Option<&'b [u8]>;
} }
/// Extractor building the keys of the facet entries for numeric values.
pub struct FieldIdFacetNumberDocidsExtractor;

impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the level-0 key of a numeric `value` to `output` and returns the buffer content.
    ///
    /// Returns `None`, without touching `output`, when `value` is not a number, cannot be
    /// read as an `f64`, or when `f64_into_bytes` returns `None`.
    ///
    /// The bytes are appended: the returned slice also contains whatever `output` already held.
    fn build_key<'b>(
        field_id: FieldId,
        value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        let n = value.as_number()?.as_f64()?;
        let ordered = f64_into_bytes(n)?;

        // Key layout: fid - level - ordered f64 - original f64.
        output.extend_from_slice(&field_id.to_be_bytes());
        // Level 0 is encoded as the byte `0`.
        output.push(0);
        output.extend_from_slice(&ordered);
        output.extend_from_slice(&n.to_be_bytes());

        Some(&*output)
    }
}
/// Extractor building the keys of the facet entries for string values.
pub struct FieldIdFacetStringDocidsExtractor;

impl FacetedExtractor for FieldIdFacetStringDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the level-0 key of a string `value` to `output` and returns the buffer content.
    ///
    /// The string is normalized with `normalize_facet` and then shortened with `truncate_str`
    /// before being written. Returns `None`, without touching `output`, when `value` is not
    /// a string.
    ///
    /// The bytes are appended: the returned slice also contains whatever `output` already held.
    fn build_key<'b>(
        field_id: FieldId,
        value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        let normalized = normalize_facet(value.as_str()?);
        let truncated = truncate_str(&normalized);

        // Key layout: fid - level - normalized string.
        output.extend_from_slice(&field_id.to_be_bytes());
        // Level 0 is encoded as the byte `0`.
        output.push(0);
        output.extend_from_slice(truncated.as_bytes());

        Some(&*output)
    }
}
/// Extractor emitting a key for every field whose value is JSON `null`.
pub struct FieldIdFacetIsNullDocidsExtractor;

impl FacetedExtractor for FieldIdFacetIsNullDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the big-endian field id to `output` when `value` is null.
    ///
    /// Returns `None`, without touching `output`, for any non-null value.
    fn build_key<'b>(
        field_id: FieldId,
        value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        if !value.is_null() {
            return None;
        }

        let fid_bytes = field_id.to_be_bytes();
        output.extend_from_slice(&fid_bytes);
        Some(output.as_slice())
    }
}
/// Extractor emitting a key for every visited field, whatever its value.
pub struct FieldIdFacetExistsDocidsExtractor;

impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the big-endian field id to `output`; the value itself is ignored.
    ///
    /// Always returns `Some` with the content of `output`.
    fn build_key<'b>(
        field_id: FieldId,
        _value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        let fid_bytes = field_id.to_be_bytes();
        output.extend_from_slice(&fid_bytes);
        Some(output.as_slice())
    }
}
/// Extractor emitting a key for every field holding an empty string, array or object.
pub struct FieldIdFacetIsEmptyDocidsExtractor;

impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor {
    /// Delegates to `Index::user_defined_faceted_fields` to list the attributes to extract.
    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
        index.user_defined_faceted_fields(rtxn)
    }

    /// Appends the big-endian field id to `output` when `value` is an empty container.
    ///
    /// Empty strings, arrays and objects count as empty; null, booleans and numbers never do.
    /// Returns `None`, without touching `output`, when the value is not empty.
    fn build_key<'b>(
        field_id: FieldId,
        value: &Value,
        output: &'b mut Vec<u8>,
    ) -> Option<&'b [u8]> {
        let empty = match value {
            Value::String(text) => text.is_empty(),
            Value::Array(items) => items.is_empty(),
            Value::Object(map) => map.is_empty(),
            Value::Null | Value::Bool(_) | Value::Number(_) => false,
        };
        if !empty {
            return None;
        }

        let fid_bytes = field_id.to_be_bytes();
        output.extend_from_slice(&fid_bytes);
        Some(output.as_slice())
    }
}
/// Visits every field of `obkv` and calls `facet_fn` on each leaf value selected for extraction.
///
/// For each `(field id, bytes)` entry, the field name is looked up in `field_id_map`. Fields
/// accepted by `perm_json_p::select_field` are parsed as JSON; objects and arrays are walked
/// with the `perm_json_p::seek_leaf_values_in_*` helpers, any other value is forwarded as is.
/// Every name handed to the callback is resolved, or inserted, in `field_id_map` first.
///
/// # Errors
///
/// - `InternalError::SerdeJson` when the bytes of a selected field are not valid JSON.
/// - `UserError::AttributeLimitReached` when `field_id_map.id_or_insert` returns `None`.
/// - Any error returned by `facet_fn`.
///
/// # Panics
///
/// Panics when a field id of `obkv` is unknown to `field_id_map`.
pub fn extract_document_facets(
    attributes_to_extract: &[&str],
    obkv: &KvReaderFieldId,
    field_id_map: &mut GlobalFieldsIdsMap,
    facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
) -> Result<()> {
    // Owned copy of the current field name, reused across iterations so that the borrow
    // on `field_id_map` is released before the map is used again below.
    let mut name_buffer = String::new();

    for (field_id, field_bytes) in obkv {
        let Some(stored_name) = field_id_map.name(field_id) else {
            unreachable!("field id not found in field id map");
        };
        name_buffer.clear();
        name_buffer.push_str(stored_name);
        let field_name = name_buffer.as_str();

        // Skip fields that neither are nor contain an attribute to extract.
        if !perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
            continue;
        }

        // Resolves the (possibly nested) attribute name into an id, then hands the
        // leaf value over to the caller.
        let mut emit_leaf = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
            Some(leaf_id) => facet_fn(leaf_id, value),
            None => Err(UserError::AttributeLimitReached.into()),
        };

        let parsed: Value =
            serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
        match parsed {
            Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
                &object,
                Some(attributes_to_extract),
                &[], // skip no attributes
                field_name,
                &mut emit_leaf,
            )?,
            Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
                &array,
                Some(attributes_to_extract),
                &[], // skip no attributes
                field_name,
                &mut emit_leaf,
            )?,
            leaf => emit_leaf(field_name, &leaf)?,
        }
    }

    Ok(())
}
/// Truncates a string to the biggest valid LMDB key size.
fn truncate_str(s: &str) -> &str {
let index = s
.char_indices()
.map(|(idx, _)| idx)
.chain(std::iter::once(s.len()))
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
.last();
&s[..index.unwrap_or(0)]
}

View File

@ -2,11 +2,12 @@ mod cache;
mod faceted; mod faceted;
mod searchable; mod searchable;
pub use faceted::{ pub use faceted::modname::{
FacetedExtractor, FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor, FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor,
FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor, FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor,
FieldIdFacetStringDocidsExtractor, FieldIdFacetStringDocidsExtractor,
}; };
pub use faceted::FacetedExtractor;
pub use searchable::{ pub use searchable::{
ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
WordPositionDocidsExtractor, WordPositionDocidsExtractor,