Faceted fields settings must specify the facet type

This commit is contained in:
Clément Renault 2020-11-11 17:33:05 +01:00
parent ebe7087bff
commit 466fb601d6
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
5 changed files with 44 additions and 27 deletions

View File

@ -193,7 +193,7 @@ impl Index {
/// Writes the facet fields ids associated with their facet type or `None` if
/// the facet type is currently unknown.
pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<u8, Option<FacetType>>) -> heed::Result<()> {
pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<u8, FacetType>) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types)
}
@ -203,7 +203,7 @@ impl Index {
}
/// Returns the facet fields ids associated with their facet type.
pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result<HashMap<u8, Option<FacetType>>> {
pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result<HashMap<u8, FacetType>> {
Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
}

View File

@ -61,9 +61,9 @@ pub fn obkv_to_json(
}
/// Transform a JSON value into a string that can be indexed.
pub fn json_to_string(value: Value) -> Option<String> {
pub fn json_to_string(value: &Value) -> Option<String> {
fn inner(value: Value, output: &mut String) -> bool {
fn inner(value: &Value, output: &mut String) -> bool {
use std::fmt::Write;
match value {
Value::Null => false,
@ -122,7 +122,7 @@ mod tests {
"not_there": null,
});
let string = json_to_string(value).unwrap();
let string = json_to_string(&value).unwrap();
assert_eq!(string, "name: John Doe. age: 43. ");
}
@ -136,7 +136,7 @@ mod tests {
null,
]);
let string = json_to_string(value).unwrap();
let string = json_to_string(&value).unwrap();
// We don't care about having two point (.) after the other as
// the distance of hard separators is clamped to 8 anyway.
assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");

View File

@ -329,6 +329,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
WordDocids,
}
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? {
Some(fields) => fields.iter().copied().collect(),
None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
@ -362,6 +363,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
.map(|(i, documents)| {
let store = Store::new(
searchable_fields.clone(),
faceted_fields.clone(),
linked_hash_map_size,
max_nb_chunks,
max_memory_by_job,

View File

@ -14,6 +14,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use roaring::RoaringBitmap;
use tempfile::tempfile;
use crate::facet::FacetType;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::tokenizer::{simple_tokenizer, only_token};
use crate::update::UpdateIndexingStep;
@ -39,6 +40,7 @@ pub struct Readers {
pub struct Store {
// Indexing parameters
searchable_fields: HashSet<u8>,
faceted_fields: HashMap<u8, FacetType>,
// Caches
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize,
@ -60,6 +62,7 @@ pub struct Store {
impl Store {
pub fn new(
searchable_fields: HashSet<u8>,
faceted_fields: HashMap<u8, FacetType>,
linked_hash_map_size: Option<usize>,
max_nb_chunks: Option<usize>,
max_memory: Option<usize>,
@ -107,6 +110,7 @@ impl Store {
Ok(Store {
// Indexing parameters.
searchable_fields,
faceted_fields,
// Caches
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
word_docids_limit: linked_hash_map_size,
@ -320,21 +324,26 @@ impl Store {
}
for (attr, content) in document.iter() {
if !self.searchable_fields.contains(&attr) {
continue;
}
if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) {
let value = serde_json::from_slice(content)?;
let value = serde_json::from_slice(content)?;
let content = match json_to_string(value) {
Some(content) => content,
None => continue,
};
if let Some(ftype) = self.faceted_fields.get(&attr) {
todo!("parse facet field value")
}
let tokens = simple_tokenizer(&content).filter_map(only_token);
for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
if self.searchable_fields.contains(&attr) {
let content = match json_to_string(&value) {
Some(content) => content,
None => continue,
};
let tokens = simple_tokenizer(&content).filter_map(only_token);
for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
}
}
}
}

View File

@ -1,11 +1,13 @@
use std::collections::HashMap;
use std::str::FromStr;
use anyhow::Context;
use anyhow::{ensure, Context};
use grenad::CompressionType;
use rayon::ThreadPool;
use crate::update::index_documents::{Transform, IndexDocumentsMethod};
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::facet::FacetType;
use crate::{Index, FieldsIdsMap};
pub struct Settings<'a, 't, 'u, 'i> {
@ -24,7 +26,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
// however if it is `Some(None)` it means that the user forced a reset of the setting.
searchable_fields: Option<Option<Vec<String>>>,
displayed_fields: Option<Option<Vec<String>>>,
faceted_fields: Option<Vec<String>>,
faceted_fields: Option<HashMap<String, String>>,
}
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -62,25 +64,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.displayed_fields = Some(Some(names));
}
pub fn set_faceted_fields(&mut self, names: Vec<String>) {
self.faceted_fields = Some(names);
pub fn set_faceted_fields(&mut self, names_facet_types: HashMap<String, String>) {
self.faceted_fields = Some(names_facet_types);
}
pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()>
where
F: Fn(UpdateIndexingStep) + Sync
{
if let Some(fields_names) = self.faceted_fields {
if let Some(fields_names_facet_types) = self.faceted_fields {
let current_faceted_fields = self.index.faceted_fields(self.wtxn)?;
let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut fields_ids_map = current_fields_ids_map.clone();
let mut faceted_fields = HashMap::new();
for name in fields_names {
for (name, sftype) in fields_names_facet_types {
let ftype = FacetType::from_str(&sftype).with_context(|| format!("parsing facet type {:?}", sftype))?;
let id = fields_ids_map.insert(&name).context("field id limit reached")?;
match current_faceted_fields.get(&id) {
Some(ftype) => faceted_fields.insert(id, ftype.clone()),
None => faceted_fields.insert(id, None),
Some(pftype) => {
ensure!(ftype == *pftype, "{} facet type changed from {} to {}", name, ftype, pftype);
faceted_fields.insert(id, ftype)
},
None => faceted_fields.insert(id, ftype),
};
}