From 0fc02f7351fa7d46b0d75f615b07cb6509729b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Sep 2024 10:32:22 +0200 Subject: [PATCH] Move the facet extraction to dedicated modules --- .../new/extract/faceted/extract_facets.rs | 137 ++++++++ .../new/extract/faceted/facet_document.rs | 51 +++ milli/src/update/new/extract/faceted/mod.rs | 315 +++++------------- milli/src/update/new/extract/mod.rs | 5 +- 4 files changed, 271 insertions(+), 237 deletions(-) create mode 100644 milli/src/update/new/extract/faceted/extract_facets.rs create mode 100644 milli/src/update/new/extract/faceted/facet_document.rs diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs new file mode 100644 index 000000000..9471c753b --- /dev/null +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -0,0 +1,137 @@ +use std::collections::HashSet; + +use heed::RoTxn; +use serde_json::Value; + +use super::FacetedExtractor; +use crate::facet::value_encoding::f64_into_bytes; +use crate::{normalize_facet, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; + +pub struct FieldIdFacetNumberDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let number = value.as_number()?; + let n = number.as_f64()?; + let ordered = f64_into_bytes(n)?; + + // fid - level - orderedf64 - orignalf64 + output.extend_from_slice(&field_id.to_be_bytes()); + output.push(1); // level 0 + output.extend_from_slice(&ordered); + output.extend_from_slice(&n.to_be_bytes()); + + Some(&*output) + } +} + +pub struct FieldIdFacetStringDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let string = value.as_str()?; + let normalize = normalize_facet(string); + let truncated = truncate_str(&normalize); + + // fid - level - normalized string + output.extend_from_slice(&field_id.to_be_bytes()); + output.push(1); // level 0 + output.extend_from_slice(truncated.as_bytes()); + + Some(&*output) + } +} + +/// Truncates a string to the biggest valid LMDB key size. +fn truncate_str(s: &str) -> &str { + let index = s + .char_indices() + .map(|(idx, _)| idx) + .chain(std::iter::once(s.len())) + .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH) + .last(); + + &s[..index.unwrap_or(0)] +} + +pub struct FieldIdFacetIsNullDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetIsNullDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + if value.is_null() { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } else { + None + } + } +} + +pub struct FieldIdFacetExistsDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + _value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } +} + +pub struct FieldIdFacetIsEmptyDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let is_empty = match value { + Value::Null | Value::Bool(_) | Value::Number(_) => false, + Value::String(s) => s.is_empty(), + Value::Array(a) => a.is_empty(), + Value::Object(o) => o.is_empty(), + }; + + if is_empty { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } else { + None + } + } +} diff --git a/milli/src/update/new/extract/faceted/facet_document.rs b/milli/src/update/new/extract/faceted/facet_document.rs new file mode 100644 index 000000000..849fa8f29 --- /dev/null +++ b/milli/src/update/new/extract/faceted/facet_document.rs @@ -0,0 +1,51 @@ +use serde_json::Value; + +use crate::update::new::KvReaderFieldId; +use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; + +pub fn extract_document_facets( + attributes_to_extract: &[&str], + obkv: &KvReaderFieldId, + field_id_map: &mut GlobalFieldsIdsMap, + facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, +) -> Result<()> { + let mut field_name = String::new(); + for (field_id, field_bytes) in obkv { + let Some(field_name) = field_id_map.name(field_id).map(|s| { + field_name.clear(); + field_name.push_str(s); + &field_name + }) else { + unreachable!("field id not found in field id map"); + }; + + let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { + Some(field_id) => facet_fn(field_id, value), + None => Err(UserError::AttributeLimitReached.into()), + }; + + // if the current field is searchable or contains a searchable attribute + if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { + // parse json. + match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { + Value::Object(object) => perm_json_p::seek_leaf_values_in_object( + &object, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + &mut tokenize_field, + )?, + Value::Array(array) => perm_json_p::seek_leaf_values_in_array( + &array, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + &mut tokenize_field, + )?, + value => tokenize_field(field_name, &value)?, + } + } + } + + Ok(()) +} diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index b54219fd3..19aeb031c 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -1,20 +1,19 @@ use std::collections::HashSet; +use std::fmt::Debug; use std::fs::File; -use grenad::Merger; +use grenad::{MergeFunction, Merger}; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use serde_json::Value; use super::cache::CboCachedSorter; -use super::perm_json_p; -use crate::facet::value_encoding::f64_into_bytes; -use crate::update::new::{DocumentChange, ItemsPool, KvReaderFieldId}; +use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{ - normalize_facet, FieldId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError, - MAX_FACET_VALUE_LENGTH, -}; +use crate::{DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result}; + +mod extract_facets; +mod facet_document; pub trait FacetedExtractor { fn run_extraction( @@ -74,6 +73,27 @@ pub trait FacetedExtractor { Ok(builder.build()) } + // TODO Shorten this + fn facet_fn_with_options( + buffer: &mut Vec, + cached_sorter: &mut CboCachedSorter, + cache_fn: impl Fn(&mut CboCachedSorter, &[u8], u32) -> grenad::Result<(), MF::Error>, + docid: DocumentId, + fid: FieldId, + value: &Value, + ) -> Result<()> + where + MF: MergeFunction, + MF::Error: Debug, + { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cache_fn(cached_sorter, &key, docid).unwrap()), + None => Ok(()), + } + } + fn extract_document_change( rtxn: &RoTxn, index: &Index, @@ -84,73 +104,69 @@ pub trait FacetedExtractor { document_change: DocumentChange, ) -> Result<()> { match document_change { - DocumentChange::Deletion(inner) => { - let mut facet_del_fn = |fid, value: &Value| -> Result<()> { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()), - None => Ok(()), - } - }; - - extract_document_facets( - attributes_to_extract, - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut facet_del_fn, - ) - } + DocumentChange::Deletion(inner) => facet_document::extract_document_facets( + attributes_to_extract, + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_del_u32, + inner.docid(), + fid, + value, + ) + }, + ), DocumentChange::Update(inner) => { - let mut facet_del_fn = |fid, value: &Value| -> Result<()> { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()), - None => Ok(()), - } - }; - - extract_document_facets( + facet_document::extract_document_facets( attributes_to_extract, inner.current(rtxn, index)?.unwrap(), fields_ids_map, - &mut facet_del_fn, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_del_u32, + inner.docid(), + fid, + value, + ) + }, )?; - let mut facet_add_fn = |fid, value: &Value| -> Result<()> { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()), - None => Ok(()), - } - }; - - extract_document_facets( + facet_document::extract_document_facets( attributes_to_extract, inner.new(), fields_ids_map, - &mut facet_add_fn, - ) - } - DocumentChange::Insertion(inner) => { - let mut facet_add_fn = |fid, value: &Value| -> Result<()> { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()), - None => Ok(()), - } - }; - - extract_document_facets( - attributes_to_extract, - inner.new(), - fields_ids_map, - &mut facet_add_fn, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_add_u32, + inner.docid(), + fid, + value, + ) + }, ) } + DocumentChange::Insertion(inner) => facet_document::extract_document_facets( + attributes_to_extract, + inner.new(), + fields_ids_map, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_add_u32, + inner.docid(), + fid, + value, + ) + }, + ), } } @@ -160,174 +176,3 @@ pub trait FacetedExtractor { fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec) -> Option<&'b [u8]>; } - -pub struct FieldIdFacetNumberDocidsExtractor; -impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let number = value.as_number()?; - let n = number.as_f64()?; - let ordered = f64_into_bytes(n)?; - - // fid - level - orderedf64 - orignalf64 - output.extend_from_slice(&field_id.to_be_bytes()); - output.push(1); // level 0 - output.extend_from_slice(&ordered); - output.extend_from_slice(&n.to_be_bytes()); - - Some(&*output) - } -} - -pub struct FieldIdFacetStringDocidsExtractor; -impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let string = value.as_str()?; - let normalize = normalize_facet(string); - let truncated = truncate_str(&normalize); - - // fid - level - normalized string - output.extend_from_slice(&field_id.to_be_bytes()); - output.push(1); // level 0 - output.extend_from_slice(truncated.as_bytes()); - - Some(&*output) - } -} - -pub struct FieldIdFacetIsNullDocidsExtractor; -impl FacetedExtractor for FieldIdFacetIsNullDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - if value.is_null() { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } else { - None - } - } -} - -pub struct FieldIdFacetExistsDocidsExtractor; -impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - _value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } -} - -pub struct FieldIdFacetIsEmptyDocidsExtractor; -impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let is_empty = match value { - Value::Null | Value::Bool(_) | Value::Number(_) => false, - Value::String(s) => s.is_empty(), - Value::Array(a) => a.is_empty(), - Value::Object(o) => o.is_empty(), - }; - - if is_empty { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } else { - None - } - } -} - -pub fn extract_document_facets( - attributes_to_extract: &[&str], - obkv: &KvReaderFieldId, - field_id_map: &mut GlobalFieldsIdsMap, - facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, -) -> Result<()> { - let mut field_name = String::new(); - for (field_id, field_bytes) in obkv { - let Some(field_name) = field_id_map.name(field_id).map(|s| { - field_name.clear(); - field_name.push_str(s); - &field_name - }) else { - unreachable!("field id not found in field id map"); - }; - - let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { - Some(field_id) => facet_fn(field_id, value), - None => Err(UserError::AttributeLimitReached.into()), - }; - - // if the current field is searchable or contains a searchable attribute - if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { - // parse json. - match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { - Value::Object(object) => perm_json_p::seek_leaf_values_in_object( - &object, - Some(attributes_to_extract), - &[], // skip no attributes - field_name, - &mut tokenize_field, - )?, - Value::Array(array) => perm_json_p::seek_leaf_values_in_array( - &array, - Some(attributes_to_extract), - &[], // skip no attributes - field_name, - &mut tokenize_field, - )?, - value => tokenize_field(field_name, &value)?, - } - } - } - - Ok(()) -} - -/// Truncates a string to the biggest valid LMDB key size. -fn truncate_str(s: &str) -> &str { - let index = s - .char_indices() - .map(|(idx, _)| idx) - .chain(std::iter::once(s.len())) - .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH) - .last(); - - &s[..index.unwrap_or(0)] -} diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 69081e251..e50e70c47 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -2,11 +2,12 @@ mod cache; mod faceted; mod searchable; -pub use faceted::{ - FacetedExtractor, FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor, +pub use faceted::modname::{ + FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor, FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor, FieldIdFacetStringDocidsExtractor, }; +pub use faceted::FacetedExtractor; pub use searchable::{ ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, WordPositionDocidsExtractor,