From 466fb601d679cae10cf61c703beb3da7a957f0c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Nov 2020 17:33:05 +0100 Subject: [PATCH] Faceted fields settings must specify the facet type --- src/index.rs | 4 ++-- src/lib.rs | 8 +++---- src/update/index_documents/mod.rs | 2 ++ src/update/index_documents/store.rs | 35 ++++++++++++++++++----------- src/update/settings.rs | 22 +++++++++++------- 5 files changed, 44 insertions(+), 27 deletions(-) diff --git a/src/index.rs b/src/index.rs index 4946545e4..68d7dfe5f 100644 --- a/src/index.rs +++ b/src/index.rs @@ -193,7 +193,7 @@ impl Index { /// Writes the facet fields ids associated with their facet type or `None` if /// the facet type is currently unknown. - pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap>) -> heed::Result<()> { + pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) } @@ -203,7 +203,7 @@ impl Index { } /// Returns the facet fields ids associated with their facet type. - pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result>> { + pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result> { Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) } diff --git a/src/lib.rs b/src/lib.rs index 2e879fd03..0e9ee0ec0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -61,9 +61,9 @@ pub fn obkv_to_json( } /// Transform a JSON value into a string that can be indexed. -pub fn json_to_string(value: Value) -> Option { +pub fn json_to_string(value: &Value) -> Option { - fn inner(value: Value, output: &mut String) -> bool { + fn inner(value: &Value, output: &mut String) -> bool { use std::fmt::Write; match value { Value::Null => false, @@ -122,7 +122,7 @@ mod tests { "not_there": null, }); - let string = json_to_string(value).unwrap(); + let string = json_to_string(&value).unwrap(); assert_eq!(string, "name: John Doe. age: 43. "); } @@ -136,7 +136,7 @@ mod tests { null, ]); - let string = json_to_string(value).unwrap(); + let string = json_to_string(&value).unwrap(); // We don't care about having two point (.) after the other as // the distance of hard separators is clamped to 8 anyway. assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index f078ec9ed..c060b04a1 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -329,6 +329,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { WordDocids, } + let faceted_fields = self.index.faceted_fields(self.wtxn)?; let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? { Some(fields) => fields.iter().copied().collect(), None => fields_ids_map.iter().map(|(id, _name)| id).collect(), @@ -362,6 +363,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .map(|(i, documents)| { let store = Store::new( searchable_fields.clone(), + faceted_fields.clone(), linked_hash_map_size, max_nb_chunks, max_memory_by_job, diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 6d42f0119..e9548eec3 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -14,6 +14,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use roaring::RoaringBitmap; use tempfile::tempfile; +use crate::facet::FacetType; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::tokenizer::{simple_tokenizer, only_token}; use crate::update::UpdateIndexingStep; @@ -39,6 +40,7 @@ pub struct Readers { pub struct Store { // Indexing parameters searchable_fields: HashSet, + faceted_fields: HashMap, // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, @@ -60,6 +62,7 @@ pub struct Store { impl Store { pub fn new( searchable_fields: HashSet, + faceted_fields: HashMap, linked_hash_map_size: Option, max_nb_chunks: Option, max_memory: Option, @@ -107,6 +110,7 @@ impl Store { Ok(Store { // Indexing parameters. searchable_fields, + faceted_fields, // Caches word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), word_docids_limit: linked_hash_map_size, @@ -320,21 +324,26 @@ impl Store { } for (attr, content) in document.iter() { - if !self.searchable_fields.contains(&attr) { - continue; - } + if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) { + let value = serde_json::from_slice(content)?; - let value = serde_json::from_slice(content)?; - let content = match json_to_string(value) { - Some(content) => content, - None => continue, - }; + if let Some(ftype) = self.faceted_fields.get(&attr) { + todo!("parse facet field value") + } - let tokens = simple_tokenizer(&content).filter_map(only_token); - for (pos, token) in tokens.enumerate().take(MAX_POSITION) { - let word = token.to_lowercase(); - let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + if self.searchable_fields.contains(&attr) { + let content = match json_to_string(&value) { + Some(content) => content, + None => continue, + }; + + let tokens = simple_tokenizer(&content).filter_map(only_token); + for (pos, token) in tokens.enumerate().take(MAX_POSITION) { + let word = token.to_lowercase(); + let position = (attr as usize * MAX_POSITION + pos) as u32; + words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + } + } } } diff --git a/src/update/settings.rs b/src/update/settings.rs index f6528ed94..d5ffba23b 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -1,11 +1,13 @@ use std::collections::HashMap; +use std::str::FromStr; -use anyhow::Context; +use anyhow::{ensure, Context}; use grenad::CompressionType; use rayon::ThreadPool; use crate::update::index_documents::{Transform, IndexDocumentsMethod}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; +use crate::facet::FacetType; use crate::{Index, FieldsIdsMap}; pub struct Settings<'a, 't, 'u, 'i> { @@ -24,7 +26,7 @@ pub struct Settings<'a, 't, 'u, 'i> { // however if it is `Some(None)` it means that the user forced a reset of the setting. searchable_fields: Option>>, displayed_fields: Option>>, - faceted_fields: Option>, + faceted_fields: Option>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -62,25 +64,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.displayed_fields = Some(Some(names)); } - pub fn set_faceted_fields(&mut self, names: Vec) { - self.faceted_fields = Some(names); + pub fn set_faceted_fields(&mut self, names_facet_types: HashMap) { + self.faceted_fields = Some(names_facet_types); } pub fn execute(self, progress_callback: F) -> anyhow::Result<()> where F: Fn(UpdateIndexingStep) + Sync { - if let Some(fields_names) = self.faceted_fields { + if let Some(fields_names_facet_types) = self.faceted_fields { let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut fields_ids_map = current_fields_ids_map.clone(); let mut faceted_fields = HashMap::new(); - for name in fields_names { + for (name, sftype) in fields_names_facet_types { + let ftype = FacetType::from_str(&sftype).with_context(|| format!("parsing facet type {:?}", sftype))?; let id = fields_ids_map.insert(&name).context("field id limit reached")?; match current_faceted_fields.get(&id) { - Some(ftype) => faceted_fields.insert(id, ftype.clone()), - None => faceted_fields.insert(id, None), + Some(pftype) => { + ensure!(ftype == *pftype, "{} facet type changed from {} to {}", name, ftype, pftype); + faceted_fields.insert(id, ftype) + }, + None => faceted_fields.insert(id, ftype), }; }