diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index 36408f578..178bb21c1 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -9,13 +9,13 @@ impl FieldDocIdFacetStringCodec { pub fn serialize_into( field_id: FieldId, document_id: DocumentId, - value: &str, + normalized_value: &str, out: &mut Vec<u8>, ) { - out.reserve(2 + 4 + value.len()); + out.reserve(2 + 4 + normalized_value.len()); out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes()); - out.extend_from_slice(value.as_bytes()); + out.extend_from_slice(normalized_value.as_bytes()); } } @@ -29,17 +29,22 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = u32::from_be_bytes(document_id_bytes); - let value = str::from_utf8(bytes).ok()?; - Some((field_id, document_id, value)) + let normalized_value = str::from_utf8(bytes).ok()?; + Some((field_id, document_id, normalized_value)) } } impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { type EItem = (FieldId, DocumentId, &'a str); - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> { + fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option<Cow<[u8]>> { + let mut bytes = Vec::new(); - FieldDocIdFacetStringCodec::serialize_into(*field_id, *document_id, value, &mut bytes); + FieldDocIdFacetStringCodec::serialize_into( + *field_id, + *document_id, + normalized_value, + &mut bytes, + ); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/index.rs b/milli/src/index.rs index b2be10767..efc31ab46 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -96,7 +96,7 @@ pub struct Index { /// Maps the document id, the facet field id and the numbers. 
pub field_id_docid_facet_f64s: Database, /// Maps the document id, the facet field id and the strings. - pub field_id_docid_facet_strings: Database, + pub field_id_docid_facet_strings: Database, /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 91620da2a..d81f20732 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,6 +1,6 @@ use std::mem::size_of; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str, Unit}; use roaring::RoaringBitmap; use super::{Distinct, DocIter}; @@ -127,7 +127,7 @@ fn facet_number_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index @@ -144,14 +144,14 @@ fn facet_string_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index .field_id_docid_facet_strings .remap_key_type::() .prefix_iter(txn, &key)? 
- .remap_key_type::(); + .remap_types::(); Ok(iter) } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index fef4ecc87..7c9acf276 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -2,15 +2,16 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; use std::{cmp, fmt, mem}; -use heed::types::{ByteSlice, Unit}; -use heed::{BytesDecode, Database}; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::FacetStringLevelZeroCodec; +use crate::heed_codec::facet::{ + FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, +}; use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; -use crate::{DocumentId, FieldId, Index, Result}; +use crate::{FieldId, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -67,46 +68,55 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - fn fetch_facet_values<'t, KC, K: 't>( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - candidates: &RoaringBitmap, - distribution: &mut BTreeMap, - ) -> heed::Result<()> - where - K: fmt::Display, - KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, - { - let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); - - for docid in candidates.into_iter() { - key_buffer.truncate(mem::size_of::()); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = db - .remap_key_type::() - .prefix_iter(rtxn, &key_buffer)? 
- .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *distribution.entry(value.to_string()).or_insert(0) += 1; - } - } - - Ok(()) - } - match facet_type { FacetType::Number => { + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let db = self.index.field_id_docid_facet_f64s; - fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, value), ()) = result?; + *distribution.entry(value.to_string()).or_insert(0) += 1; + } + } } FacetType::String => { + let mut normalized_distribution = BTreeMap::new(); + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let db = self.index.field_id_docid_facet_strings; - fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? 
+ .remap_key_type::(); + + for result in iter { + let ((_, _, normalized_value), original_value) = result?; + let (_, count) = normalized_distribution + .entry(normalized_value) + .or_insert_with(|| (original_value, 0)); + *count += 1; + } + } + + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); } } + + Ok(()) } /// There is too much documents, we use the facet levels to move throught @@ -227,7 +237,6 @@ impl<'a> FacetDistribution<'a> { &mut distribution, )?; } - Ok(distribution) } None => self.facet_values_from_raw_facet_database(field_id), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 222f3b2d3..e9c1e507a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use chrono::Utc; use fst::IntoStreamer; -use heed::types::{ByteSlice, Unit}; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use serde_json::Value; @@ -419,15 +419,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } -fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( +fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( wtxn: &'a mut heed::RwTxn, - db: &heed::Database, + db: &heed::Database, field_id: FieldId, to_remove: &RoaringBitmap, convert: F, ) -> heed::Result<()> where - C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, + C: heed::BytesDecode<'a, DItem = K>, + DC: heed::BytesDecode<'a, DItem = V>, F: Fn(K) -> DocumentId, { let mut iter = db @@ -436,7 +437,7 @@ where .remap_key_type::(); while let Some(result) = iter.next() { - let (key, ()) = result?; + let (key, _) = result?; if to_remove.contains(convert(key)) { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? 
}; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 4c1071aab..1538295f9 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -65,7 +65,7 @@ pub struct Store<'s, A> { LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, - facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, + facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>, facet_field_value_docids_limit: usize, // MTBL parameters chunk_compression_type: CompressionType, @@ -283,25 +283,33 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn insert_facet_string_values_docid( &mut self, field_id: FieldId, - value: String, + normalized_value: String, + original_value: String, id: DocumentId, ) -> Result<()> { - if value.is_empty() { + if normalized_value.is_empty() { return Ok(()); } let sorter = &mut self.field_id_docid_facet_strings_sorter; - Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; + Self::write_field_id_docid_facet_string_value( + sorter, + field_id, + id, + &normalized_value, + &original_value, + )?; - let key = (field_id, value); + let key = (field_id, normalized_value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_string_docids.get_refresh(&key) { - Some(old) => { + Some((_original_value, old)) => { old.insert(id); } None => { // A newly inserted element is append at the end of the linked hash map. 
- self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); + self.facet_field_string_docids + .insert(key, (original_value, RoaringBitmap::from_iter(Some(id)))); // If the word docids just reached it's capacity we must make sure to remove // one element, this way next time we insert we doesn't grow the capacity. if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { @@ -363,7 +371,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { document_id: DocumentId, words_positions: &mut HashMap>, facet_numbers_values: &mut HashMap>, - facet_strings_values: &mut HashMap>, + facet_strings_values: &mut HashMap>, record: &[u8], ) -> Result<()> { // We compute the list of words pairs proximities (self-join) and write it directly to disk. @@ -399,8 +407,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // We store document_id associated with all the facet strings fields ids and values. for (field, values) in facet_strings_values.drain() { - for value in values { - self.insert_facet_string_values_docid(field, value, document_id)?; + for (normalized, original) in values { + self.insert_facet_string_values_docid(field, normalized, original, document_id)?; } } @@ -516,23 +524,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn write_facet_field_string_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator, + I: IntoIterator, Error: From, { let mut key_buffer = Vec::new(); let mut data_buffer = Vec::new(); - for ((field_id, value), docids) in iter { + for ((field_id, normalized_value), (original_value, docids)) in iter { key_buffer.clear(); data_buffer.clear(); - FacetStringLevelZeroCodec::serialize_into(field_id, &value, &mut key_buffer); + FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); if lmdb_key_valid_size(&key_buffer) { sorter.insert(&key_buffer, &data_buffer)?; } else { - warn!("facet value {:?} is too large to be 
saved", value); + warn!("facet value {:?} is too large to be saved", original_value); } } @@ -587,19 +595,24 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { sorter: &mut Sorter>, field_id: FieldId, document_id: DocumentId, - value: &str, + normalized_value: &str, + original_value: &str, ) -> Result<()> where Error: From, { let mut buffer = Vec::new(); - - FieldDocIdFacetStringCodec::serialize_into(field_id, document_id, value, &mut buffer); + FieldDocIdFacetStringCodec::serialize_into( + field_id, + document_id, + normalized_value, + &mut buffer, + ); if lmdb_key_valid_size(&buffer) { - sorter.insert(&buffer, &[])?; + sorter.insert(&buffer, original_value.as_bytes())?; } else { - warn!("facet value {:?} is too large to be saved", value); + warn!("facet value {:?} is too large to be saved", original_value); } Ok(()) @@ -929,24 +942,24 @@ fn process_tokens<'a>( .filter(|(_, t)| t.is_word()) } -fn extract_facet_values(value: &Value) -> (Vec, Vec) { +fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { fn inner_extract_facet_values( value: &Value, can_recurse: bool, output_numbers: &mut Vec, - output_strings: &mut Vec, + output_strings: &mut Vec<(String, String)>, ) { match value { Value::Null => (), - Value::Bool(b) => output_strings.push(b.to_string()), + Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), Value::Number(number) => { if let Some(float) = number.as_f64() { output_numbers.push(float); } } - Value::String(string) => { - let string = string.trim().to_lowercase(); - output_strings.push(string); + Value::String(original) => { + let normalized = original.trim().to_lowercase(); + output_strings.push((normalized, original.clone())); } Value::Array(values) => { if can_recurse { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c6540b33a..e4adbccb9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -276,8 +276,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match 
self.searchable_fields { Setting::Set(ref fields) => { // every time the searchable attributes are updated, we need to update the - // ids for any settings that uses the facets. (displayed_fields, - // filterable_fields) + // ids for any settings that use the facets. (distinct_fields, filterable_fields). let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_fields_ids_map = FieldsIdsMap::new();