Faceted fields settings must specify the facet type

This commit is contained in:
Clément Renault 2020-11-11 17:33:05 +01:00
parent ebe7087bff
commit 466fb601d6
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
5 changed files with 44 additions and 27 deletions

View File

@ -193,7 +193,7 @@ impl Index {
/// Writes the facet fields ids associated with their facet type or `None` if /// Writes the facet fields ids associated with their facet type or `None` if
/// the facet type is currently unknown. /// the facet type is currently unknown.
pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<u8, Option<FacetType>>) -> heed::Result<()> { pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<u8, FacetType>) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types)
} }
@ -203,7 +203,7 @@ impl Index {
} }
/// Returns the facet fields ids associated with their facet type. /// Returns the facet fields ids associated with their facet type.
pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result<HashMap<u8, Option<FacetType>>> { pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result<HashMap<u8, FacetType>> {
Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
} }

View File

@ -61,9 +61,9 @@ pub fn obkv_to_json(
} }
/// Transform a JSON value into a string that can be indexed. /// Transform a JSON value into a string that can be indexed.
pub fn json_to_string(value: Value) -> Option<String> { pub fn json_to_string(value: &Value) -> Option<String> {
fn inner(value: Value, output: &mut String) -> bool { fn inner(value: &Value, output: &mut String) -> bool {
use std::fmt::Write; use std::fmt::Write;
match value { match value {
Value::Null => false, Value::Null => false,
@ -122,7 +122,7 @@ mod tests {
"not_there": null, "not_there": null,
}); });
let string = json_to_string(value).unwrap(); let string = json_to_string(&value).unwrap();
assert_eq!(string, "name: John Doe. age: 43. "); assert_eq!(string, "name: John Doe. age: 43. ");
} }
@ -136,7 +136,7 @@ mod tests {
null, null,
]); ]);
let string = json_to_string(value).unwrap(); let string = json_to_string(&value).unwrap();
// We don't care about having two points (.) one after the other as // We don't care about having two points (.) one after the other as
// the distance of hard separators is clamped to 8 anyway. // the distance of hard separators is clamped to 8 anyway.
assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");

View File

@ -329,6 +329,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
WordDocids, WordDocids,
} }
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? { let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? {
Some(fields) => fields.iter().copied().collect(), Some(fields) => fields.iter().copied().collect(),
None => fields_ids_map.iter().map(|(id, _name)| id).collect(), None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
@ -362,6 +363,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
.map(|(i, documents)| { .map(|(i, documents)| {
let store = Store::new( let store = Store::new(
searchable_fields.clone(), searchable_fields.clone(),
faceted_fields.clone(),
linked_hash_map_size, linked_hash_map_size,
max_nb_chunks, max_nb_chunks,
max_memory_by_job, max_memory_by_job,

View File

@ -14,6 +14,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use tempfile::tempfile; use tempfile::tempfile;
use crate::facet::FacetType;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::tokenizer::{simple_tokenizer, only_token}; use crate::tokenizer::{simple_tokenizer, only_token};
use crate::update::UpdateIndexingStep; use crate::update::UpdateIndexingStep;
@ -39,6 +40,7 @@ pub struct Readers {
pub struct Store { pub struct Store {
// Indexing parameters // Indexing parameters
searchable_fields: HashSet<u8>, searchable_fields: HashSet<u8>,
faceted_fields: HashMap<u8, FacetType>,
// Caches // Caches
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize, word_docids_limit: usize,
@ -60,6 +62,7 @@ pub struct Store {
impl Store { impl Store {
pub fn new( pub fn new(
searchable_fields: HashSet<u8>, searchable_fields: HashSet<u8>,
faceted_fields: HashMap<u8, FacetType>,
linked_hash_map_size: Option<usize>, linked_hash_map_size: Option<usize>,
max_nb_chunks: Option<usize>, max_nb_chunks: Option<usize>,
max_memory: Option<usize>, max_memory: Option<usize>,
@ -107,6 +110,7 @@ impl Store {
Ok(Store { Ok(Store {
// Indexing parameters. // Indexing parameters.
searchable_fields, searchable_fields,
faceted_fields,
// Caches // Caches
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
word_docids_limit: linked_hash_map_size, word_docids_limit: linked_hash_map_size,
@ -320,21 +324,26 @@ impl Store {
} }
for (attr, content) in document.iter() { for (attr, content) in document.iter() {
if !self.searchable_fields.contains(&attr) { if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) {
continue; let value = serde_json::from_slice(content)?;
}
let value = serde_json::from_slice(content)?; if let Some(ftype) = self.faceted_fields.get(&attr) {
let content = match json_to_string(value) { todo!("parse facet field value")
Some(content) => content, }
None => continue,
};
let tokens = simple_tokenizer(&content).filter_map(only_token); if self.searchable_fields.contains(&attr) {
for (pos, token) in tokens.enumerate().take(MAX_POSITION) { let content = match json_to_string(&value) {
let word = token.to_lowercase(); Some(content) => content,
let position = (attr as usize * MAX_POSITION + pos) as u32; None => continue,
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); };
let tokens = simple_tokenizer(&content).filter_map(only_token);
for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
}
}
} }
} }

View File

@ -1,11 +1,13 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::str::FromStr;
use anyhow::Context; use anyhow::{ensure, Context};
use grenad::CompressionType; use grenad::CompressionType;
use rayon::ThreadPool; use rayon::ThreadPool;
use crate::update::index_documents::{Transform, IndexDocumentsMethod}; use crate::update::index_documents::{Transform, IndexDocumentsMethod};
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::facet::FacetType;
use crate::{Index, FieldsIdsMap}; use crate::{Index, FieldsIdsMap};
pub struct Settings<'a, 't, 'u, 'i> { pub struct Settings<'a, 't, 'u, 'i> {
@ -24,7 +26,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
// however if it is `Some(None)` it means that the user forced a reset of the setting. // however if it is `Some(None)` it means that the user forced a reset of the setting.
searchable_fields: Option<Option<Vec<String>>>, searchable_fields: Option<Option<Vec<String>>>,
displayed_fields: Option<Option<Vec<String>>>, displayed_fields: Option<Option<Vec<String>>>,
faceted_fields: Option<Vec<String>>, faceted_fields: Option<HashMap<String, String>>,
} }
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -62,25 +64,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.displayed_fields = Some(Some(names)); self.displayed_fields = Some(Some(names));
} }
pub fn set_faceted_fields(&mut self, names: Vec<String>) { pub fn set_faceted_fields(&mut self, names_facet_types: HashMap<String, String>) {
self.faceted_fields = Some(names); self.faceted_fields = Some(names_facet_types);
} }
pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()> pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()>
where where
F: Fn(UpdateIndexingStep) + Sync F: Fn(UpdateIndexingStep) + Sync
{ {
if let Some(fields_names) = self.faceted_fields { if let Some(fields_names_facet_types) = self.faceted_fields {
let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; let current_faceted_fields = self.index.faceted_fields(self.wtxn)?;
let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut fields_ids_map = current_fields_ids_map.clone(); let mut fields_ids_map = current_fields_ids_map.clone();
let mut faceted_fields = HashMap::new(); let mut faceted_fields = HashMap::new();
for name in fields_names { for (name, sftype) in fields_names_facet_types {
let ftype = FacetType::from_str(&sftype).with_context(|| format!("parsing facet type {:?}", sftype))?;
let id = fields_ids_map.insert(&name).context("field id limit reached")?; let id = fields_ids_map.insert(&name).context("field id limit reached")?;
match current_faceted_fields.get(&id) { match current_faceted_fields.get(&id) {
Some(ftype) => faceted_fields.insert(id, ftype.clone()), Some(pftype) => {
None => faceted_fields.insert(id, None), ensure!(ftype == *pftype, "{} facet type changed from {} to {}", name, ftype, pftype);
faceted_fields.insert(id, ftype)
},
None => faceted_fields.insert(id, ftype),
}; };
} }