2024-10-21 09:28:49 +02:00
|
|
|
use std::collections::{BTreeSet, HashMap};
|
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
use charabia::normalizer::NormalizerOption;
|
|
|
|
use charabia::{Language, Normalize, StrDetection, Token};
|
2024-10-21 09:28:49 +02:00
|
|
|
use grenad::Sorter;
|
2024-10-09 11:35:45 +02:00
|
|
|
use heed::types::{Bytes, SerdeJson};
|
|
|
|
use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn};
|
|
|
|
|
|
|
|
use super::channel::FacetSearchableSender;
|
|
|
|
use super::extract::FacetKind;
|
|
|
|
use super::fst_merger_builder::FstMergerBuilder;
|
|
|
|
use super::KvReaderDelAdd;
|
|
|
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
|
|
|
use crate::heed_codec::StrRefCodec;
|
|
|
|
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
|
|
|
use crate::update::{create_sorter, MergeDeladdBtreesetString};
|
2024-10-21 09:28:49 +02:00
|
|
|
use crate::{
|
|
|
|
BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result,
|
|
|
|
MAX_FACET_VALUE_LENGTH,
|
|
|
|
};
|
|
|
|
|
|
|
|
pub struct FacetSearchBuilder<'indexer> {
|
|
|
|
registered_facets: HashMap<FieldId, usize>,
|
|
|
|
normalized_facet_string_docids_sorter: Sorter<MergeDeladdBtreesetString>,
|
|
|
|
global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
|
|
|
|
localized_attributes_rules: Vec<LocalizedAttributesRule>,
|
|
|
|
// Buffered data below
|
|
|
|
buffer: Vec<u8>,
|
|
|
|
localized_field_ids: HashMap<FieldId, Option<Vec<Language>>>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'indexer> FacetSearchBuilder<'indexer> {
|
|
|
|
pub fn new(
|
|
|
|
global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
|
|
|
|
localized_attributes_rules: Vec<LocalizedAttributesRule>,
|
|
|
|
) -> Self {
|
|
|
|
let registered_facets = HashMap::new();
|
|
|
|
let normalized_facet_string_docids_sorter = create_sorter(
|
|
|
|
grenad::SortAlgorithm::Stable,
|
|
|
|
MergeDeladdBtreesetString,
|
|
|
|
grenad::CompressionType::None,
|
|
|
|
None,
|
|
|
|
None,
|
|
|
|
Some(0),
|
2024-10-09 11:35:45 +02:00
|
|
|
false,
|
2024-10-21 09:28:49 +02:00
|
|
|
);
|
|
|
|
|
|
|
|
Self {
|
|
|
|
registered_facets,
|
|
|
|
normalized_facet_string_docids_sorter,
|
|
|
|
buffer: Vec::new(),
|
|
|
|
global_fields_ids_map,
|
|
|
|
localized_attributes_rules,
|
|
|
|
localized_field_ids: HashMap::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn extract_key_data<'k>(&self, key: &'k [u8]) -> Result<Option<FacetGroupKey<&'k str>>> {
|
|
|
|
match FacetKind::from(key[0]) {
|
|
|
|
// Only strings are searchable
|
|
|
|
FacetKind::String => Ok(Some(
|
|
|
|
FacetGroupKeyCodec::<StrRefCodec>::bytes_decode(&key[1..])
|
|
|
|
.map_err(heed::Error::Encoding)?,
|
|
|
|
)),
|
|
|
|
_ => Ok(None),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn register_from_key(&mut self, deladd: DelAdd, facet_key: &[u8]) -> Result<()> {
|
|
|
|
let Some(FacetGroupKey { field_id, level: _level, left_bound }) =
|
|
|
|
self.extract_key_data(facet_key)?
|
|
|
|
else {
|
|
|
|
return Ok(());
|
|
|
|
};
|
|
|
|
|
|
|
|
if deladd == DelAdd::Addition {
|
|
|
|
self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
let locales = self.locales(field_id);
|
2024-10-09 11:35:45 +02:00
|
|
|
let hyper_normalized_value = normalize_facet_string(left_bound, locales);
|
2024-10-21 09:28:49 +02:00
|
|
|
|
|
|
|
let set = BTreeSet::from_iter(std::iter::once(left_bound));
|
|
|
|
|
|
|
|
// as the facet string is the same, we can put the deletion and addition in the same obkv.
|
|
|
|
self.buffer.clear();
|
|
|
|
let mut obkv = KvWriterDelAdd::new(&mut self.buffer);
|
|
|
|
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
|
|
|
obkv.insert(deladd, val)?;
|
|
|
|
obkv.finish()?;
|
|
|
|
|
|
|
|
let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
|
|
|
|
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
|
|
|
self.normalized_facet_string_docids_sorter.insert(key_bytes, &self.buffer)?;
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the locales configured for `field_id`, or `None` when no
/// localized-attributes rule matches the field name.
///
/// The answer is computed on first use and cached in `localized_field_ids`.
///
/// # Panics
///
/// Panics if `field_id` is unknown to the global fields ids map.
fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> {
    use std::collections::hash_map::Entry;

    if let Entry::Vacant(slot) = self.localized_field_ids.entry(field_id) {
        let field_name = match self.global_fields_ids_map.name(field_id) {
            Some(name) => name,
            None => unreachable!("Field id {} not found in the global fields ids map", field_id),
        };

        // The first matching rule wins.
        let matching_rule =
            self.localized_attributes_rules.iter().find(|rule| rule.match_str(field_name));
        slot.insert(matching_rule.map(|rule| rule.locales.clone()));
    }

    self.localized_field_ids.get(&field_id).unwrap().as_deref()
}
|
|
|
|
|
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")]
|
|
|
|
pub fn merge_and_send(
|
|
|
|
self,
|
|
|
|
index: &Index,
|
2024-10-09 11:35:45 +02:00
|
|
|
wtxn: &mut RwTxn,
|
|
|
|
rtxn: &RoTxn,
|
2024-10-21 09:28:49 +02:00
|
|
|
sender: FacetSearchableSender,
|
|
|
|
) -> Result<()> {
|
|
|
|
let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?;
|
|
|
|
let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString);
|
|
|
|
builder.extend(reader);
|
|
|
|
|
|
|
|
let database = index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>();
|
|
|
|
|
|
|
|
let mut merger_iter = builder.build().into_stream_merger_iter()?;
|
|
|
|
let mut current_field_id = None;
|
|
|
|
let mut fst;
|
|
|
|
let mut fst_merger_builder: Option<FstMergerBuilder> = None;
|
|
|
|
while let Some((key, deladd)) = merger_iter.next()? {
|
|
|
|
let (field_id, normalized_facet_string) =
|
2024-10-09 11:35:45 +02:00
|
|
|
BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?;
|
2024-10-21 09:28:49 +02:00
|
|
|
|
|
|
|
if current_field_id != Some(field_id) {
|
|
|
|
if let Some(fst_merger_builder) = fst_merger_builder {
|
|
|
|
// send the previous fst to the channel
|
|
|
|
let mmap = fst_merger_builder.build(&mut callback)?;
|
2024-10-09 11:35:45 +02:00
|
|
|
// sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap();
|
|
|
|
todo!("What to do");
|
2024-10-21 09:28:49 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
println!("getting fst for field_id: {}", field_id);
|
|
|
|
fst = index.facet_id_string_fst.get(rtxn, &field_id)?;
|
|
|
|
fst_merger_builder = Some(FstMergerBuilder::new(fst.as_ref())?);
|
|
|
|
current_field_id = Some(field_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
let current = database.get(rtxn, key)?;
|
|
|
|
let deladd: &KvReaderDelAdd = deladd.into();
|
|
|
|
let del = deladd.get(DelAdd::Deletion);
|
|
|
|
let add = deladd.get(DelAdd::Addition);
|
|
|
|
|
|
|
|
match merge_btreesets(current, del, add)? {
|
|
|
|
Operation::Write(value) => {
|
|
|
|
match fst_merger_builder.as_mut() {
|
|
|
|
Some(fst_merger_builder) => {
|
|
|
|
fst_merger_builder.register(
|
|
|
|
DelAdd::Addition,
|
|
|
|
normalized_facet_string.as_bytes(),
|
|
|
|
&mut callback,
|
|
|
|
)?;
|
|
|
|
}
|
|
|
|
None => unreachable!(),
|
|
|
|
}
|
|
|
|
let key = (field_id, normalized_facet_string);
|
|
|
|
let key_bytes =
|
|
|
|
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
|
|
|
sender.write_facet(&key_bytes, &value).unwrap();
|
|
|
|
}
|
|
|
|
Operation::Delete => {
|
|
|
|
match fst_merger_builder.as_mut() {
|
|
|
|
Some(fst_merger_builder) => {
|
|
|
|
fst_merger_builder.register(
|
|
|
|
DelAdd::Deletion,
|
|
|
|
normalized_facet_string.as_bytes(),
|
|
|
|
&mut callback,
|
|
|
|
)?;
|
|
|
|
}
|
|
|
|
None => unreachable!(),
|
|
|
|
}
|
|
|
|
let key = (field_id, normalized_facet_string);
|
|
|
|
let key_bytes =
|
|
|
|
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
|
|
|
sender.delete_facet(&key_bytes).unwrap();
|
|
|
|
}
|
|
|
|
Operation::Ignore => (),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if let (Some(field_id), Some(fst_merger_builder)) = (current_field_id, fst_merger_builder) {
|
|
|
|
let mmap = fst_merger_builder.build(&mut callback)?;
|
2024-10-09 11:35:45 +02:00
|
|
|
// sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap();
|
|
|
|
todo!("What to do");
|
2024-10-21 09:28:49 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// No-op callback passed to the FST merger builder: it ignores its
/// arguments and always succeeds.
fn callback(_bytes: &[u8], _deladd: DelAdd, _is_modified: bool) -> Result<()> {
    Ok(())
}
|
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
fn merge_btreesets(
|
2024-10-21 09:28:49 +02:00
|
|
|
current: Option<&[u8]>,
|
|
|
|
del: Option<&[u8]>,
|
|
|
|
add: Option<&[u8]>,
|
|
|
|
) -> Result<Operation> {
|
|
|
|
let mut result: BTreeSet<String> = match current {
|
|
|
|
Some(current) => SerdeJson::bytes_decode(current).map_err(heed::Error::Encoding)?,
|
|
|
|
None => BTreeSet::new(),
|
|
|
|
};
|
|
|
|
if let Some(del) = del {
|
|
|
|
let del: BTreeSet<String> = SerdeJson::bytes_decode(del).map_err(heed::Error::Encoding)?;
|
|
|
|
result = result.difference(&del).cloned().collect();
|
|
|
|
}
|
|
|
|
if let Some(add) = add {
|
|
|
|
let add: BTreeSet<String> = SerdeJson::bytes_decode(add).map_err(heed::Error::Encoding)?;
|
|
|
|
result.extend(add);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// TODO remove allocation
|
|
|
|
let result = SerdeJson::bytes_encode(&result).map_err(heed::Error::Encoding)?.into_owned();
|
|
|
|
if Some(result.as_ref()) == current {
|
|
|
|
Ok(Operation::Ignore)
|
|
|
|
} else if result.is_empty() {
|
|
|
|
Ok(Operation::Delete)
|
|
|
|
} else {
|
|
|
|
Ok(Operation::Write(result))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Normalizes the facet string and truncates it to the max length.
|
|
|
|
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
|
|
|
|
let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
|
|
|
|
let mut detection = StrDetection::new(facet_string, locales);
|
|
|
|
|
|
|
|
let script = detection.script();
|
|
|
|
// Detect the language of the facet string only if several locales are explicitly provided.
|
|
|
|
let language = match locales {
|
|
|
|
Some(&[language]) => Some(language),
|
|
|
|
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
|
|
|
|
_ => None,
|
|
|
|
};
|
|
|
|
|
|
|
|
let token = Token {
|
|
|
|
lemma: std::borrow::Cow::Borrowed(facet_string),
|
|
|
|
script,
|
|
|
|
language,
|
|
|
|
..Default::default()
|
|
|
|
};
|
|
|
|
|
|
|
|
// truncate the facet string to the max length
|
|
|
|
token
|
|
|
|
.normalize(&options)
|
|
|
|
.lemma
|
|
|
|
.char_indices()
|
|
|
|
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
|
|
|
.map(|(_, c)| c)
|
|
|
|
.collect()
|
|
|
|
}
|
|
|
|
|
|
|
|
/// What to do with a database entry after merging its pending deletions and
/// additions (result of `merge_btreesets`).
enum Operation {
    /// Store these encoded bytes under the key.
    Write(Vec<u8>),
    /// Remove the key.
    Delete,
    /// Leave the stored value untouched.
    Ignore,
}
|