diff --git a/Cargo.lock b/Cargo.lock index 63e30b65c..ddb2e9ec5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -550,6 +550,12 @@ dependencies = [ "cfg-if 0.1.10", ] +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + [[package]] name = "matches" version = "0.1.8" @@ -608,10 +614,12 @@ dependencies = [ "levenshtein_automata", "linked-hash-map", "log", + "maplit", "memmap", "near-proximity", "obkv", "once_cell", + "ordered-float", "rayon", "ringtail", "roaring", @@ -708,6 +716,15 @@ version = "11.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c" +[[package]] +name = "ordered-float" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" +dependencies = [ + "num-traits", +] + [[package]] name = "page_size" version = "0.4.2" diff --git a/Cargo.toml b/Cargo.toml index 441397275..fd453a5f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ memmap = "0.7.0" near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } obkv = "0.1.0" once_cell = "1.4.0" +ordered-float = "2.0.0" rayon = "1.3.1" ringtail = "0.3.0" roaring = "0.6.1" @@ -44,6 +45,7 @@ stderrlog = "0.5.0" [dev-dependencies] criterion = "0.3.3" +maplit = "1.0.2" [build-dependencies] fst = "0.4.4" diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index 7c7759e2e..23fed5bbe 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -989,6 +989,7 @@ dependencies = [ "near-proximity", "obkv", "once_cell", + "ordered-float", "rayon", "ringtail", "roaring", @@ -1205,6 +1206,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" +[[package]] +name = "ordered-float" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" +dependencies = [ + "num-traits", +] + [[package]] name = "page_size" version = "0.4.2" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 2af2c9d90..094b2fb79 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fs::{File, create_dir_all}; use std::net::SocketAddr; use std::path::PathBuf; @@ -210,6 +210,8 @@ enum UpdateMetaProgress { } #[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +#[serde(rename_all = "camelCase")] struct Settings { #[serde( default, @@ -224,6 +226,9 @@ struct Settings { skip_serializing_if = "Option::is_none", )] searchable_attributes: Option>>, + + #[serde(default)] + faceted_attributes: Option>, } // Any value that is present is considered Some value, including null. @@ -367,6 +372,11 @@ async fn main() -> anyhow::Result<()> { } } + // We transpose the settings JSON struct into a real setting update. + if let Some(facet_types) = settings.faceted_attributes { + builder.set_faceted_fields(facet_types); + } + let result = builder.execute(|indexing_step| { let (current, total) = match indexing_step { TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), diff --git a/src/facet/facet_type.rs b/src/facet/facet_type.rs new file mode 100644 index 000000000..4fdc80798 --- /dev/null +++ b/src/facet/facet_type.rs @@ -0,0 +1,50 @@ +use std::error::Error; +use std::fmt; +use std::str::FromStr; + +use serde::{Serialize, Deserialize}; + +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +#[derive(Serialize, Deserialize)] +pub enum FacetType { + String, + Float, + Integer, +} + +impl fmt::Display for FacetType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + FacetType::String => f.write_str("string"), + FacetType::Float => f.write_str("float"), + FacetType::Integer => f.write_str("integer"), + } + } +} + +impl FromStr for FacetType { + type Err = InvalidFacetType; + + fn from_str(s: &str) -> Result { + if s.eq_ignore_ascii_case("string") { + Ok(FacetType::String) + } else if s.eq_ignore_ascii_case("float") { + Ok(FacetType::Float) + } else if s.eq_ignore_ascii_case("integer") { + Ok(FacetType::Integer) + } else { + Err(InvalidFacetType) + } + } +} + +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct InvalidFacetType; + +impl fmt::Display for InvalidFacetType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(r#"Invalid facet type, must be "string", "float" or "integer""#) + } +} + +impl Error for InvalidFacetType { } diff --git a/src/facet/mod.rs b/src/facet/mod.rs new file mode 100644 index 000000000..9ec99f2d3 --- /dev/null +++ b/src/facet/mod.rs @@ -0,0 +1,4 @@ +mod facet_type; +pub mod value_encoding; + +pub use self::facet_type::FacetType; diff --git a/src/facet/value_encoding.rs b/src/facet/value_encoding.rs new file mode 100644 index 000000000..3cb012a0e --- /dev/null +++ b/src/facet/value_encoding.rs @@ -0,0 +1,69 @@ +// https://stackoverflow.com/a/43305015/1941280 +#[inline] +pub fn f64_into_bytes(float: f64) -> Option<[u8; 8]> { + if float.is_finite() { + if float == 0.0 || float == -0.0 { + return Some(xor_first_bit(0.0_f64.to_be_bytes())); + } else if float.is_sign_negative() { + return Some(xor_all_bits(float.to_be_bytes())); + } else if float.is_sign_positive() { + return Some(xor_first_bit(float.to_be_bytes())); + } + } + None +} + +#[inline] +pub fn i64_into_bytes(int: i64) -> [u8; 8] { + xor_first_bit(int.to_be_bytes()) +} + +#[inline] +pub fn i64_from_bytes(bytes: [u8; 8]) -> i64 { + i64::from_be_bytes(xor_first_bit(bytes)) +} + +#[inline] +fn xor_first_bit(mut x: [u8; 8]) -> [u8; 8] { + x[0] ^= 0x80; + x +} + +#[inline] +fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] { + x.iter_mut().for_each(|b| *b ^= 0xff); + x +} + +#[cfg(test)] +mod tests { + use std::cmp::Ordering::Less; + use super::*; + + fn is_sorted(x: &[T]) -> bool { + x.windows(2).map(|x| x[0].cmp(&x[1])).all(|o| o == Less) + } + + #[test] + fn ordered_f64_bytes() { + let a = -13_f64; + let b = -10.0; + let c = -0.0; + let d = 1.0; + let e = 43.0; + + let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect(); + assert!(is_sorted(&vec), "{:?}", vec); + } + + #[test] + fn ordered_i64_bytes() { + let a = -10_i64; + let b = -0_i64; + let c = 1_i64; + let d = 43_i64; + + let vec: Vec<_> = [a, b, c, d].iter().cloned().map(i64_into_bytes).collect(); + assert!(is_sorted(&vec), "{:?}", vec); + } +} diff --git a/src/heed_codec/facet/facet_value_f64_codec.rs b/src/heed_codec/facet/facet_value_f64_codec.rs new file mode 100644 index 000000000..228514de5 --- /dev/null +++ b/src/heed_codec/facet/facet_value_f64_codec.rs @@ -0,0 +1,50 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::facet::value_encoding::f64_into_bytes; + +pub struct FacetValueF64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec { + type DItem = (u8, f64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, buffer) = bytes.split_first()?; + let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?; + Some((*field_id, value)) + } +} + +impl heed::BytesEncode<'_> for FacetValueF64Codec { + type EItem = (u8, f64); + + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // Write the globally ordered float. + let bytes = f64_into_bytes(*value)?; + buffer[..8].copy_from_slice(&bytes[..]); + + // Then the f64 value just to be able to read it back. + let bytes = value.to_be_bytes(); + buffer[8..].copy_from_slice(&bytes[..]); + + let mut bytes = Vec::with_capacity(buffer.len() + 1); + bytes.push(*field_id); + bytes.extend_from_slice(&buffer[..]); + Some(Cow::Owned(bytes)) + } +} + +#[cfg(test)] +mod tests { + use heed::{BytesEncode, BytesDecode}; + use super::*; + + #[test] + fn globally_ordered_f64() { + let bytes = FacetValueF64Codec::bytes_encode(&(3, -32.0)).unwrap(); + let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap(); + assert_eq!((name, value), (3, -32.0)); + } +} diff --git a/src/heed_codec/facet/facet_value_i64_codec.rs b/src/heed_codec/facet/facet_value_i64_codec.rs new file mode 100644 index 000000000..f99b8a3ea --- /dev/null +++ b/src/heed_codec/facet/facet_value_i64_codec.rs @@ -0,0 +1,28 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; + +pub struct FacetValueI64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec { + type DItem = (u8, i64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, buffer) = bytes.split_first()?; + let value = buffer.try_into().map(i64_from_bytes).ok()?; + Some((*field_id, value)) + } +} + +impl heed::BytesEncode<'_> for FacetValueI64Codec { + type EItem = (u8, i64); + + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { + let value = i64_into_bytes(*value); + let mut bytes = Vec::with_capacity(value.len() + 1); + bytes.push(*field_id); + bytes.extend_from_slice(&value[..]); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/facet_value_string_codec.rs b/src/heed_codec/facet/facet_value_string_codec.rs new file mode 100644 index 000000000..faa8b407b --- /dev/null +++ b/src/heed_codec/facet/facet_value_string_codec.rs @@ -0,0 +1,25 @@ +use std::borrow::Cow; +use std::str; + +pub struct FacetValueStringCodec; + +impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { + type DItem = (u8, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let value = str::from_utf8(bytes).ok()?; + Some((*field_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { + type EItem = (u8, &'a str); + + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(value.len() + 1); + bytes.push(*field_id); + bytes.extend_from_slice(value.as_bytes()); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/mod.rs b/src/heed_codec/facet/mod.rs new file mode 100644 index 000000000..abe2c1d8a --- /dev/null +++ b/src/heed_codec/facet/mod.rs @@ -0,0 +1,7 @@ +mod facet_value_f64_codec; +mod facet_value_i64_codec; +mod facet_value_string_codec; + +pub use self::facet_value_f64_codec::FacetValueF64Codec; +pub use self::facet_value_i64_codec::FacetValueI64Codec; +pub use self::facet_value_string_codec::FacetValueStringCodec; diff --git a/src/heed_codec/mod.rs b/src/heed_codec/mod.rs index 68739fbf1..e7b8cf256 100644 --- a/src/heed_codec/mod.rs +++ b/src/heed_codec/mod.rs @@ -4,6 +4,7 @@ mod cbo_roaring_bitmap_codec; mod obkv_codec; mod roaring_bitmap_codec; mod str_str_u8_codec; +pub mod facet; pub use self::beu32_str_codec::BEU32StrCodec; pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; diff --git a/src/index.rs b/src/index.rs index 1a0c5d4d6..68d7dfe5f 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::path::Path; use anyhow::Context; @@ -6,9 +7,10 @@ use heed::types::*; use heed::{PolyDatabase, Database, RwTxn, RoTxn}; use roaring::RoaringBitmap; +use crate::facet::FacetType; +use crate::fields_ids_map::FieldsIdsMap; use crate::Search; use crate::{BEU32, DocumentId}; -use crate::fields_ids_map::FieldsIdsMap; use crate::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, @@ -16,6 +18,7 @@ use crate::{ pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; +pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; @@ -34,22 +37,33 @@ pub struct Index { pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, + /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. + pub facet_field_id_value_docids: Database, /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, } impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(5); + options.max_dbs(6); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; let word_docids = env.create_database(Some("word-docids"))?; let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; + let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let documents = env.create_database(Some("documents"))?; - Ok(Index { env, main, word_docids, docid_word_positions, word_pair_proximity_docids, documents }) + Ok(Index { + env, + main, + word_docids, + docid_word_positions, + word_pair_proximity_docids, + facet_field_id_value_docids, + documents, + }) } /// Create a write transaction to be able to write into the index. @@ -175,6 +189,24 @@ impl Index { self.main.get::<_, Str, ByteSlice>(rtxn, SEARCHABLE_FIELDS_KEY) } + /* faceted fields */ + + /// Writes the facet fields ids associated with their facet type or `None` if + /// the facet type is currently unknown. + pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) + } + + /// Deletes the facet fields ids associated with their facet type. + pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY) + } + + /// Returns the facet fields ids associated with their facet type. + pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result> { + Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) + } + /* words fst */ /// Writes the FST which is the words dictionnary of the engine. diff --git a/src/lib.rs b/src/lib.rs index bea05e68a..808c54a4a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ mod index; mod mdfs; mod query_tokens; mod search; +pub mod facet; pub mod heed_codec; pub mod proximity; pub mod subcommand; @@ -33,6 +34,7 @@ pub type FastMap8 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; +pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type DocumentId = u32; @@ -60,9 +62,9 @@ pub fn obkv_to_json( } /// Transform a JSON value into a string that can be indexed. -pub fn json_to_string(value: Value) -> Option { +pub fn json_to_string(value: &Value) -> Option { - fn inner(value: Value, output: &mut String) -> bool { + fn inner(value: &Value, output: &mut String) -> bool { use std::fmt::Write; match value { Value::Null => false, @@ -121,7 +123,7 @@ mod tests { "not_there": null, }); - let string = json_to_string(value).unwrap(); + let string = json_to_string(&value).unwrap(); assert_eq!(string, "name: John Doe. age: 43. "); } @@ -135,7 +137,7 @@ mod tests { null, ]); - let string = json_to_string(value).unwrap(); + let string = json_to_string(&value).unwrap(); // We don't care about having two point (.) after the other as // the distance of hard separators is clamped to 8 anyway. assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 0a4dabeba..aa5cd3d7b 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -78,6 +78,16 @@ enum Command { words: Vec, }, + /// Outputs a CSV with the documents ids along with the facet values where it appears. + FacetValuesDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// The field name in the document. + field_name: String, + }, + /// Outputs the total size of all the docid-word-positions keys and values. TotalDocidWordPositionsSize, @@ -147,6 +157,9 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit), WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), + FacetValuesDocids { full_display, field_name } => { + facet_values_docids(&index, &rtxn, !full_display, field_name) + }, TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { @@ -256,6 +269,64 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec anyhow::Result<()> { + use crate::facet::FacetType; + use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; + use heed::{BytesDecode, Error::Decoding}; + + let fields_ids_map = index.fields_ids_map(&rtxn)?; + let faceted_fields = index.faceted_fields(&rtxn)?; + + let field_id = fields_ids_map.id(&field_name) + .with_context(|| format!("field {} not found", field_name))?; + let field_type = faceted_fields.get(&field_id) + .with_context(|| format!("field {} is not faceted", field_name))?; + + let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; + let iter = match field_type { + FacetType::String => { + let iter = iter + .map(|result| result.and_then(|(key, value)| { + let (_, key) = FacetValueStringCodec::bytes_decode(key).ok_or(Decoding)?; + Ok((key.to_string(), value)) + })); + Box::new(iter) as Box> + }, + FacetType::Float => { + let iter = iter + .map(|result| result.and_then(|(key, value)| { + let (_, key) = FacetValueF64Codec::bytes_decode(key).ok_or(Decoding)?; + Ok((key.to_string(), value)) + })); + Box::new(iter) + }, + FacetType::Integer => { + let iter = iter + .map(|result| result.and_then(|(key, value)| { + let (_, key) = FacetValueI64Codec::bytes_decode(key).ok_or(Decoding)?; + Ok((key.to_string(), value)) + })); + Box::new(iter) + }, + }; + + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["facet_value", "documents_ids"])?; + + for result in iter { + let (value, docids) = result?; + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[value, docids])?; + } + + Ok(wtr.flush()?) +} + fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { use std::fs::File; use std::io::Write as _; diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index ae739bd0b..c49ae9104 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -18,33 +18,25 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids, docid_word_positions, word_pair_proximity_docids, + facet_field_id_value_docids, documents, } = self.index; - // We clear the word fst. + // We retrieve the number of documents ids that we are deleting. + let number_of_documents = self.index.number_of_documents(self.wtxn)?; + + // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; - - // We clear the users ids documents ids. self.index.put_users_ids_documents_ids(self.wtxn, &fst::Map::default())?; - - // We retrieve the documents ids. - let documents_ids = self.index.documents_ids(self.wtxn)?; - - // We clear the internal documents ids. self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; - // We clear the word docids. + // Clear the other databases. word_docids.clear(self.wtxn)?; - - // We clear the docid word positions. docid_word_positions.clear(self.wtxn)?; - - // We clear the word pair proximity docids. word_pair_proximity_docids.clear(self.wtxn)?; - - // We clear the documents themselves. + facet_field_id_value_docids.clear(self.wtxn)?; documents.clear(self.wtxn)?; - Ok(documents_ids.len() as usize) + Ok(number_of_documents) } } diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index e3a3be15a..d68bca81c 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -76,6 +76,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_docids, docid_word_positions, word_pair_proximity_docids, + facet_field_id_value_docids, documents, } = self.index; @@ -158,7 +159,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We construct an FST set that contains the words to delete from the words FST. - let words_to_delete = words.iter().filter_map(|(w, d)| if *d { Some(w.as_ref()) } else { None }); + let words_to_delete = words.iter().filter_map(|(word, must_remove)| { + if *must_remove { Some(word.as_ref()) } else { None } + }); let words_to_delete = fst::Set::from_iter(words_to_delete)?; let new_words_fst = { @@ -191,6 +194,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } + drop(iter); + + // We delete the documents ids that are under the facet field id values. + let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else { + iter.put_current(bytes, &docids)?; + } + } + Ok(self.documents_ids.len() as usize) } } diff --git a/src/update/index_documents/merge_function.rs b/src/update/index_documents/merge_function.rs index 7f91b6716..fb785fd11 100644 --- a/src/update/index_documents/merge_function.rs +++ b/src/update/index_documents/merge_function.rs @@ -29,23 +29,13 @@ pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match"); Ok(values[0].to_vec()) }, - DOCUMENTS_IDS_KEY => word_docids_merge(&[], values), + DOCUMENTS_IDS_KEY => roaring_bitmap_merge(values), otherwise => bail!("wut {:?}", otherwise), } } pub fn word_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - let (head, tail) = values.split_first().unwrap(); - let mut head = RoaringBitmap::deserialize_from(&head[..])?; - - for value in tail { - let bitmap = RoaringBitmap::deserialize_from(&value[..])?; - head.union_with(&bitmap); - } - - let mut vec = Vec::with_capacity(head.serialized_size()); - head.serialize_into(&mut vec)?; - Ok(vec) + roaring_bitmap_merge(values) } pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { @@ -53,17 +43,11 @@ pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow:: } pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - let (head, tail) = values.split_first().unwrap(); - let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; + cbo_roaring_bitmap_merge(values) +} - for value in tail { - let bitmap = CboRoaringBitmapCodec::deserialize_from(&value[..])?; - head.union_with(&bitmap); - } - - let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec)?; - Ok(vec) +pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) } pub fn documents_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { @@ -85,3 +69,31 @@ pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mu writer.finish().unwrap(); } + +fn roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { + let (head, tail) = values.split_first().unwrap(); + let mut head = RoaringBitmap::deserialize_from(&head[..])?; + + for value in tail { + let bitmap = RoaringBitmap::deserialize_from(&value[..])?; + head.union_with(&bitmap); + } + + let mut vec = Vec::with_capacity(head.serialized_size()); + head.serialize_into(&mut vec)?; + Ok(vec) +} + +fn cbo_roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { + let (head, tail) = values.split_first().unwrap(); + let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; + + for value in tail { + let bitmap = CboRoaringBitmapCodec::deserialize_from(&value[..])?; + head.union_with(&bitmap); + } + + let mut vec = Vec::new(); + CboRoaringBitmapCodec::serialize_into(&head, &mut vec)?; + Ok(vec) +} diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index f078ec9ed..13b725e19 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -16,10 +16,10 @@ use rayon::ThreadPool; use crate::index::Index; use crate::update::UpdateIndexingStep; -use self::store::Store; +use self::store::{Store, Readers}; use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - docid_word_positions_merge, documents_merge, + docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -327,8 +327,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { enum DatabaseType { Main, WordDocids, + FacetValuesDocids, } + let faceted_fields = self.index.faceted_fields(self.wtxn)?; let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? { Some(fields) => fields.iter().copied().collect(), None => fields_ids_map.iter().map(|(id, _name)| id).collect(), @@ -362,6 +364,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .map(|(i, documents)| { let store = Store::new( searchable_fields.clone(), + faceted_fields.clone(), linked_hash_map_size, max_nb_chunks, max_memory_by_job, @@ -384,13 +387,23 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut word_docids_readers = Vec::with_capacity(readers.len()); let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); + let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); readers.into_iter().for_each(|readers| { - main_readers.push(readers.main); - word_docids_readers.push(readers.word_docids); - docid_word_positions_readers.push(readers.docid_word_positions); - words_pairs_proximities_docids_readers.push(readers.words_pairs_proximities_docids); - documents_readers.push(readers.documents); + let Readers { + main, + word_docids, + docid_word_positions, + words_pairs_proximities_docids, + facet_field_value_docids, + documents + } = readers; + main_readers.push(main); + word_docids_readers.push(word_docids); + docid_word_positions_readers.push(docid_word_positions); + words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); + facet_field_value_docids_readers.push(facet_field_value_docids); + documents_readers.push(documents); }); // This is the function that merge the readers @@ -413,6 +426,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { vec![ (DatabaseType::Main, main_readers, main_merge as MergeFn), (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), + ( + DatabaseType::FacetValuesDocids, + facet_field_value_docids_readers, + facet_field_value_docids_merge, + ), ] .into_par_iter() .for_each(|(dbtype, readers, merge)| { @@ -463,9 +481,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; + let total_databases = 6; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, - total_databases: 5, + total_databases, }); debug!("Writing the docid word positions into LMDB on disk..."); @@ -480,7 +500,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { database_count += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: database_count, - total_databases: 5, + total_databases, }); debug!("Writing the documents into LMDB on disk..."); @@ -495,7 +515,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { database_count += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: database_count, - total_databases: 5, + total_databases, }); debug!("Writing the words pairs proximities docids into LMDB on disk..."); @@ -510,7 +530,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { database_count += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: database_count, - total_databases: 5, + total_databases, }); for (db_type, result) in receiver { @@ -537,16 +557,27 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, + DatabaseType::FacetValuesDocids => { + debug!("Writing the facet values docids into LMDB on disk..."); + let db = *self.index.facet_field_id_value_docids.as_polymorph(); + write_into_lmdb_database( + self.wtxn, + db, + content, + facet_field_value_docids_merge, + write_method, + )?; + }, } database_count += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: database_count, - total_databases: 5, + total_databases, }); } - debug_assert_eq!(database_count, 5); + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 6d42f0119..9c75f10fe 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::{BTreeMap, HashMap, HashSet}; use std::convert::{TryFrom, TryInto}; use std::fs::File; @@ -5,22 +6,29 @@ use std::iter::FromIterator; use std::time::Instant; use std::{cmp, iter}; -use anyhow::Context; +use anyhow::{bail, Context}; use bstr::ByteSlice as _; +use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; use log::{debug, info}; -use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; +use ordered_float::OrderedFloat; use roaring::RoaringBitmap; +use serde_json::Value; use tempfile::tempfile; +use crate::facet::FacetType; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; +use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; use crate::tokenizer::{simple_tokenizer, only_token}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId}; +use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; -use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge}; +use super::merge_function::{ + main_merge, word_docids_merge, words_pairs_proximities_docids_merge, + facet_field_value_docids_merge, +}; const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_KILOBYTE: usize = 1024 * 1024; @@ -33,17 +41,21 @@ pub struct Readers { pub word_docids: Reader, pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, + pub facet_field_value_docids: Reader, pub documents: Reader, } pub struct Store { // Indexing parameters searchable_fields: HashSet, + faceted_fields: HashMap, // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, + facet_field_value_docids: LinkedHashMap<(u8, FacetValue), RoaringBitmap>, + facet_field_value_docids_limit: usize, // MTBL parameters chunk_compression_type: CompressionType, chunk_compression_level: Option, @@ -52,6 +64,7 @@ pub struct Store { main_sorter: Sorter, word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, + facet_field_value_docids_sorter: Sorter, // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, @@ -60,6 +73,7 @@ pub struct Store { impl Store { pub fn new( searchable_fields: HashSet, + faceted_fields: HashMap, linked_hash_map_size: Option, max_nb_chunks: Option, max_memory: Option, @@ -69,7 +83,7 @@ impl Store { ) -> anyhow::Result { // We divide the max memory by the number of sorter the Store have. - let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 3)); + let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let main_sorter = create_sorter( @@ -96,6 +110,14 @@ impl Store { max_nb_chunks, max_memory, ); + let facet_field_value_docids_sorter = create_sorter( + facet_field_value_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + max_memory, + ); let documents_writer = tempfile().and_then(|f| { create_writer(chunk_compression_type, chunk_compression_level, f) @@ -107,11 +129,14 @@ impl Store { Ok(Store { // Indexing parameters. searchable_fields, + faceted_fields, // Caches word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), word_docids_limit: linked_hash_map_size, words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), words_pairs_proximities_docids_limit: linked_hash_map_size, + facet_field_value_docids: LinkedHashMap::with_capacity(linked_hash_map_size), + facet_field_value_docids_limit: linked_hash_map_size, // MTBL parameters chunk_compression_type, chunk_compression_level, @@ -120,6 +145,7 @@ impl Store { main_sorter, word_docids_sorter, words_pairs_proximities_docids_sorter, + facet_field_value_docids_sorter, // MTBL writers docid_word_positions_writer, documents_writer, @@ -147,6 +173,35 @@ impl Store { Ok(()) } + // Save the documents ids under the facet field id and value we have seen it. + fn insert_facet_values_docid( + &mut self, + field_id: u8, + field_value: FacetValue, + id: DocumentId, + ) -> anyhow::Result<()> + { + let key = (field_id, field_value); + // if get_refresh finds the element it is assured to be at the end of the linked hash map. + match self.facet_field_value_docids.get_refresh(&key) { + Some(old) => { old.insert(id); }, + None => { + // A newly inserted element is append at the end of the linked hash map. + self.facet_field_value_docids.insert(key, RoaringBitmap::from_iter(Some(id))); + // If the word docids just reached it's capacity we must make sure to remove + // one element, this way next time we insert we doesn't grow the capacity. + if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit { + // Removing the front element is equivalent to removing the LRU element. + Self::write_docid_facet_field_values( + &mut self.facet_field_value_docids_sorter, + self.facet_field_value_docids.pop_front(), + )?; + } + } + } + Ok(()) + } + // Save the documents ids under the words pairs proximities that it contains. fn insert_words_pairs_proximities_docids<'a>( &mut self, @@ -187,7 +242,8 @@ impl Store { fn write_document( &mut self, document_id: DocumentId, - words_positions: &HashMap>, + words_positions: &mut HashMap>, + facet_values: &mut HashMap>, record: &[u8], ) -> anyhow::Result<()> { @@ -196,13 +252,20 @@ impl Store { self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; // We store document_id associated with all the words the record contains. - for (word, _) in words_positions { - self.insert_word_docid(word, document_id)?; + for (word, _) in words_positions.drain() { + self.insert_word_docid(&word, document_id)?; } self.documents_writer.insert(document_id.to_be_bytes(), record)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; + // We store document_id associated with all the field id and values. + for (field, values) in facet_values.drain() { + for value in values { + self.insert_facet_values_docid(field, value, document_id)?; + } + } + Ok(()) } @@ -263,6 +326,31 @@ impl Store { Ok(()) } + fn write_docid_facet_field_values( + sorter: &mut Sorter, + iter: I, + ) -> anyhow::Result<()> + where I: IntoIterator + { + use FacetValue::*; + + for ((field_id, value), docids) in iter { + let result = match value { + String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), + Float(f) => FacetValueF64Codec::bytes_encode(&(field_id, *f)).map(Cow::into_owned), + Integer(i) => FacetValueI64Codec::bytes_encode(&(field_id, i)).map(Cow::into_owned), + }; + let key = result.context("could not serialize facet key")?; + let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) + .context("could not serialize docids")?; + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &bytes)?; + } + } + + Ok(()) + } + fn write_word_docids(sorter: &mut Sorter, iter: I) -> anyhow::Result<()> where I: IntoIterator, RoaringBitmap)> { @@ -301,6 +389,7 @@ impl Store { let mut before = Instant::now(); let mut words_positions = HashMap::new(); + let mut facet_values = HashMap::new(); let mut count: usize = 0; while let Some((key, value)) = documents.next()? { @@ -320,27 +409,34 @@ impl Store { } for (attr, content) in document.iter() { - if !self.searchable_fields.contains(&attr) { - continue; - } + if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) { + let value = serde_json::from_slice(content)?; - let value = serde_json::from_slice(content)?; - let content = match json_to_string(value) { - Some(content) => content, - None => continue, - }; + if let Some(ftype) = self.faceted_fields.get(&attr) { + let mut values = parse_facet_value(*ftype, &value).with_context(|| { + format!("extracting facets from the value {}", value) + })?; + facet_values.entry(attr).or_insert_with(SmallVec8::new).extend(values.drain(..)); + } - let tokens = simple_tokenizer(&content).filter_map(only_token); - for (pos, token) in tokens.enumerate().take(MAX_POSITION) { - let word = token.to_lowercase(); - let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + if self.searchable_fields.contains(&attr) { + let content = match json_to_string(&value) { + Some(content) => content, + None => continue, + }; + + let tokens = simple_tokenizer(&content).filter_map(only_token); + for (pos, token) in tokens.enumerate().take(MAX_POSITION) { + let word = token.to_lowercase(); + let position = (attr as usize * MAX_POSITION + pos) as u32; + words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + } + } } } // We write the document in the documents store. - self.write_document(document_id, &words_positions, value)?; - words_positions.clear(); + self.write_document(document_id, &mut words_positions, &mut facet_values, value)?; } // Compute the document id of the next document. @@ -367,6 +463,10 @@ impl Store { &mut self.words_pairs_proximities_docids_sorter, self.words_pairs_proximities_docids, )?; + Self::write_docid_facet_field_values( + &mut self.facet_field_value_docids_sorter, + self.facet_field_value_docids, + )?; let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut builder = fst::SetBuilder::memory(); @@ -388,9 +488,13 @@ impl Store { let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; + let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; + let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; + let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?; @@ -399,6 +503,7 @@ impl Store { word_docids, docid_word_positions, words_pairs_proximities_docids, + facet_field_value_docids, documents, }) } @@ -444,3 +549,70 @@ fn format_count(n: usize) -> String { fn lmdb_key_valid_size(key: &[u8]) -> bool { !key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH } + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +enum FacetValue { + String(SmallString32), + Float(OrderedFloat), + Integer(i64), +} + +fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result> { + use FacetValue::*; + + fn inner_parse_facet_value( + ftype: FacetType, + value: &Value, + can_recurse: bool, + output: &mut SmallVec8, + ) -> anyhow::Result<()> + { + match value { + Value::Null => Ok(()), + Value::Bool(b) => Ok(output.push(Integer(*b as i64))), + Value::Number(number) => match ftype { + FacetType::String => bail!("invalid facet type, expecting {} found number", ftype), + FacetType::Float => match number.as_f64() { + Some(float) => Ok(output.push(Float(OrderedFloat(float)))), + None => bail!("invalid facet type, expecting {} found integer", ftype), + }, + FacetType::Integer => match number.as_i64() { + Some(integer) => Ok(output.push(Integer(integer))), + None => if number.is_f64() { + bail!("invalid facet type, expecting {} found float", ftype) + } else { + bail!("invalid facet type, expecting {} found out-of-bound integer (64bit)", ftype) + }, + }, + }, + Value::String(string) => { + let string = string.trim(); + if string.is_empty() { return Ok(()) } + match ftype { + FacetType::String => { + let string = SmallString32::from(string); + Ok(output.push(String(string))) + }, + FacetType::Float => match string.parse() { + Ok(float) => Ok(output.push(Float(OrderedFloat(float)))), + Err(_err) => bail!("invalid facet type, expecting {} found string", ftype), + }, + FacetType::Integer => match string.parse() { + Ok(integer) => Ok(output.push(Integer(integer))), + Err(_err) => bail!("invalid facet type, expecting {} found string", ftype), + }, + } + }, + Value::Array(values) => if can_recurse { + values.iter().map(|v| inner_parse_facet_value(ftype, v, false, output)).collect() + } else { + bail!("invalid facet type, expecting {} found sub-array ()", ftype) + }, + Value::Object(_) => bail!("invalid facet type, expecting {} found object", ftype), + } + } + + let mut facet_values = SmallVec8::new(); + inner_parse_facet_value(ftype, value, true, &mut facet_values)?; + Ok(facet_values) +} diff --git a/src/update/settings.rs b/src/update/settings.rs index ad191ceec..03f184ef6 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -1,9 +1,13 @@ -use anyhow::Context; +use std::collections::HashMap; +use std::str::FromStr; + +use anyhow::{ensure, Context}; use grenad::CompressionType; use rayon::ThreadPool; use crate::update::index_documents::{Transform, IndexDocumentsMethod}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; +use crate::facet::FacetType; use crate::{Index, FieldsIdsMap}; pub struct Settings<'a, 't, 'u, 'i> { @@ -22,6 +26,7 @@ pub struct Settings<'a, 't, 'u, 'i> { // however if it is `Some(None)` it means that the user forced a reset of the setting. searchable_fields: Option>>, displayed_fields: Option>>, + faceted_fields: Option>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -39,6 +44,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { thread_pool: None, searchable_fields: None, displayed_fields: None, + faceted_fields: None, } } @@ -58,71 +64,92 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.displayed_fields = Some(Some(names)); } + pub fn set_faceted_fields(&mut self, names_facet_types: HashMap) { + self.faceted_fields = Some(names_facet_types); + } + pub fn execute(self, progress_callback: F) -> anyhow::Result<()> where F: Fn(UpdateIndexingStep) + Sync { - // Check that the searchable attributes have been specified. - if let Some(value) = self.searchable_fields { - let current_displayed_fields = self.index.displayed_fields(self.wtxn)?; - let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let mut updated_searchable_fields = None; + let mut updated_faceted_fields = None; + let mut updated_displayed_fields = None; - let result = match value { - Some(fields_names) => { - let mut fields_ids_map = current_fields_ids_map.clone(); - let searchable_fields: Vec<_> = - fields_names.iter() - .map(|name| fields_ids_map.insert(name)) - .collect::>>() - .context("field id limit reached")?; + // Construct the new FieldsIdsMap based on the searchable fields order. + let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let mut fields_ids_map = match self.searchable_fields { + Some(Some(searchable_fields)) => { + let mut new_fields_ids_map = FieldsIdsMap::new(); + let mut new_searchable_fields = Vec::new(); - // If the searchable fields are ordered we don't have to generate a new `FieldsIdsMap`. - if searchable_fields.windows(2).all(|win| win[0] < win[1]) { - ( - fields_ids_map, - Some(searchable_fields), - current_displayed_fields.map(ToOwned::to_owned), - ) - } else { - // We create or generate the fields ids corresponding to those names. - let mut fields_ids_map = FieldsIdsMap::new(); - let mut searchable_fields = Vec::new(); - for name in fields_names { - let id = fields_ids_map.insert(&name).context("field id limit reached")?; - searchable_fields.push(id); - } + for name in searchable_fields { + let id = new_fields_ids_map.insert(&name).context("field id limit reached")?; + new_searchable_fields.push(id); + } - // We complete the new FieldsIdsMap with the previous names. - for (_id, name) in current_fields_ids_map.iter() { - fields_ids_map.insert(name).context("field id limit reached")?; - } + for (_, name) in fields_ids_map.iter() { + new_fields_ids_map.insert(name).context("field id limit reached")?; + } - // We must also update the displayed fields according to the new `FieldsIdsMap`. - let displayed_fields = match current_displayed_fields { - Some(fields) => { - let mut displayed_fields = Vec::new(); - for id in fields { - let name = current_fields_ids_map.name(*id).unwrap(); - let id = fields_ids_map.id(name).context("field id limit reached")?; - displayed_fields.push(id); - } - Some(displayed_fields) - }, - None => None, - }; + updated_searchable_fields = Some(Some(new_searchable_fields)); + new_fields_ids_map + }, + Some(None) => { + updated_searchable_fields = Some(None); + fields_ids_map + }, + None => fields_ids_map, + }; - (fields_ids_map, Some(searchable_fields), displayed_fields) + // We compute or generate the new primary key field id. + // TODO make the primary key settable. + let primary_key = match self.index.primary_key(&self.wtxn)? { + Some(id) => { + let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let name = current_fields_ids_map.name(id).unwrap(); + fields_ids_map.insert(name).context("field id limit reached")? + }, + None => fields_ids_map.insert("id").context("field id limit reached")?, + }; + + if let Some(fields_names_facet_types) = self.faceted_fields { + let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; + + let mut faceted_fields = HashMap::new(); + for (name, sftype) in fields_names_facet_types { + let ftype = FacetType::from_str(&sftype).with_context(|| format!("parsing facet type {:?}", sftype))?; + let id = fields_ids_map.insert(&name).context("field id limit reached")?; + match current_faceted_fields.get(&id) { + Some(pftype) => { + ensure!(ftype == *pftype, "{} facet type changed from {} to {}", name, ftype, pftype); + faceted_fields.insert(id, ftype) + }, + None => faceted_fields.insert(id, ftype), + }; + } + + updated_faceted_fields = Some(faceted_fields); + } + + // Check that the displayed attributes have been specified. + if let Some(value) = self.displayed_fields { + match value { + Some(names) => { + let mut new_displayed_fields = Vec::new(); + for name in names { + let id = fields_ids_map.insert(&name).context("field id limit reached")?; + new_displayed_fields.push(id); } - }, - None => ( - current_fields_ids_map.clone(), - None, - current_displayed_fields.map(ToOwned::to_owned), - ), - }; - - let (mut fields_ids_map, searchable_fields, displayed_fields) = result; + updated_displayed_fields = Some(Some(new_displayed_fields)); + } + None => updated_displayed_fields = Some(None), + } + } + // If any setting have modified any of the datastructures it means that we need + // to retrieve the documents and then reindex then with the new settings. + if updated_searchable_fields.is_some() || updated_faceted_fields.is_some() { let transform = Transform { rtxn: &self.wtxn, index: self.index, @@ -136,15 +163,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { autogenerate_docids: false, }; - // We compute or generate the new primary key field id. - let primary_key = match self.index.primary_key(&self.wtxn)? { - Some(id) => { - let name = current_fields_ids_map.name(id).unwrap(); - fields_ids_map.insert(name).context("field id limit reached")? - }, - None => fields_ids_map.insert("id").context("field id limit reached")?, - }; - // We remap the documents fields based on the new `FieldsIdsMap`. let output = transform.remap_index_documents(primary_key, fields_ids_map.clone())?; @@ -152,18 +170,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // this way next indexing methods will be based on that. self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; - // The new searchable fields are also written down to make sure - // that the IndexDocuments system takes only these ones into account. - match searchable_fields { - Some(fields) => self.index.put_searchable_fields(self.wtxn, &fields)?, - None => self.index.delete_searchable_fields(self.wtxn).map(drop)?, + if let Some(faceted_fields) = updated_faceted_fields { + // We write the faceted_fields fields into the database here. + self.index.put_faceted_fields(self.wtxn, &faceted_fields)?; } - // We write the displayed fields into the database here - // to make sure that the right fields are displayed. - match displayed_fields { - Some(fields) => self.index.put_displayed_fields(self.wtxn, &fields)?, - None => self.index.delete_displayed_fields(self.wtxn).map(drop)?, + if let Some(searchable_fields) = updated_searchable_fields { + // The new searchable fields are also written down to make sure + // that the IndexDocuments system takes only these ones into account. + match searchable_fields { + Some(fields) => self.index.put_searchable_fields(self.wtxn, &fields)?, + None => self.index.delete_searchable_fields(self.wtxn).map(drop)?, + } } // We clear the full database (words-fst, documents ids and documents content). @@ -180,33 +198,15 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.chunk_compression_level = self.chunk_compression_level; indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; indexing_builder.thread_pool = self.thread_pool; - indexing_builder.execute_raw(output, progress_callback)?; + indexing_builder.execute_raw(output, &progress_callback)?; } - // Check that the displayed attributes have been specified. - if let Some(value) = self.displayed_fields { - match value { - // If it has been set, and it was a list of fields names, we create - // or generate the fields ids corresponds to those names and store them - // in the database in the order they were specified. - Some(fields_names) => { - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - - // We create or generate the fields ids corresponding to those names. - let mut fields_ids = Vec::new(); - for name in fields_names { - let id = fields_ids_map.insert(&name).context("field id limit reached")?; - fields_ids.push(id); - } - - self.index.put_displayed_fields(self.wtxn, &fields_ids)?; - }, - // If it was set to `null` it means that the user wants to get the default behavior - // which is displaying all the attributes in no specific order (FieldsIdsMap order), - // we just have to delete the displayed fields. - None => { - self.index.delete_displayed_fields(self.wtxn)?; - }, + if let Some(displayed_fields) = updated_displayed_fields { + // We write the displayed fields into the database here + // to make sure that the right fields are displayed. + match displayed_fields { + Some(fields) => self.index.put_displayed_fields(self.wtxn, &fields)?, + None => self.index.delete_displayed_fields(self.wtxn).map(drop)?, } } @@ -219,6 +219,7 @@ mod tests { use super::*; use crate::update::{IndexDocuments, UpdateFormat}; use heed::EnvOpenOptions; + use maplit::hashmap; #[test] fn set_and_reset_searchable_fields() { @@ -386,4 +387,31 @@ mod tests { assert_eq!(fields_ids, None); drop(rtxn); } + + #[test] + fn set_faceted_fields() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the age. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_faceted_fields(hashmap!{ "age".into() => "integer".into() }); + builder.execute(|_| ()).unwrap(); + + // Then index some documents. + let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index); + builder.update_format(UpdateFormat::Csv); + builder.execute(content, |_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that the displayed fields are correctly set. + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer }); + drop(rtxn); + } }