Make sure that we generate the faceted database when required

This commit is contained in:
Kerollmops 2021-06-01 16:29:14 +02:00
parent b0c0490e85
commit 3c304c89d4
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
5 changed files with 135 additions and 42 deletions

View File

@ -23,7 +23,7 @@ use crate::fields_ids_map::FieldsIdsMap;
pub const CRITERIA_KEY: &str = "criteria";
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key";
pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
@ -365,7 +365,7 @@ impl Index {
/// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields.
pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
let filterable_fields = self.filterable_fields(rtxn)?;
let distinct_field = self.distinct_attribute(rtxn)?;
let distinct_field = self.distinct_field(rtxn)?;
let asc_desc_fields = self.criteria(rtxn)?
.into_iter()
.filter_map(|criterion| match criterion {
@ -465,18 +465,18 @@ impl Index {
}
}
/* Distinct attribute */
/* distinct field */
pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> {
self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute)
pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> {
self.main.put::<_, Str, Str>(wtxn, DISTINCT_FIELD_KEY, distinct_field)
}
pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result<Option<&'a str>> {
self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY)
pub fn distinct_field<'a>(&self, rtxn: &'a RoTxn) -> heed::Result<Option<&'a str>> {
self.main.get::<_, Str, Str>(rtxn, DISTINCT_FIELD_KEY)
}
pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY)
pub(crate) fn delete_distinct_field(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, DISTINCT_FIELD_KEY)
}
/* criteria */

View File

@ -79,7 +79,7 @@ mod test {
// set distinct and faceted attributes for the index.
let builder = UpdateBuilder::new(0);
let mut update = builder.settings(&mut txn, &index);
update.set_distinct_attribute(distinct.to_string());
update.set_distinct_field(distinct.to_string());
update.execute(|_, _| ()).unwrap();
// add documents to the index

View File

@ -136,7 +136,7 @@ impl<'a> Search<'a> {
let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
let criteria = criteria_builder.build(query_tree, primitive_query, filtered_candidates)?;
match self.index.distinct_attribute(self.rtxn)? {
match self.index.distinct_field(self.rtxn)? {
None => self.perform_sort(NoopDistinct, matching_words, criteria),
Some(name) => {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;

View File

@ -57,14 +57,14 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
pub fn execute(self) -> anyhow::Result<()> {
self.index.set_updated_at(self.wtxn, &Utc::now())?;
// We get the filterable fields to be able to create the facet levels.
let filterable_fields = self.index.filterable_fields_ids(self.wtxn)?;
// We get the faceted fields to be able to create the facet levels.
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
for field_id in filterable_fields {
// Compute and store the filterable strings documents ids.
let string_documents_ids = compute_filterable_documents_ids(
for field_id in faceted_fields {
// Compute and store the faceted strings documents ids.
let string_documents_ids = compute_faceted_documents_ids(
self.wtxn,
self.index.facet_id_string_docids.remap_key_type::<ByteSlice>(),
field_id,
@ -77,8 +77,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
field_id,
)?;
// Compute and store the filterable numbers documents ids.
let number_documents_ids = compute_filterable_documents_ids(
// Compute and store the faceted numbers documents ids.
let number_documents_ids = compute_faceted_documents_ids(
self.wtxn,
self.index.facet_id_f64_docids.remap_key_type::<ByteSlice>(),
field_id,
@ -191,7 +191,7 @@ fn compute_facet_number_levels<'t>(
writer_into_reader(writer, shrink_size)
}
fn compute_filterable_documents_ids(
fn compute_faceted_documents_ids(
rtxn: &heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8,

View File

@ -68,7 +68,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
filterable_fields: Setting<HashSet<String>>,
criteria: Setting<Vec<String>>,
stop_words: Setting<BTreeSet<String>>,
distinct_attribute: Setting<String>,
distinct_field: Setting<String>,
synonyms: Setting<HashMap<String, Vec<String>>>,
}
@ -94,7 +94,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
filterable_fields: Setting::NotSet,
criteria: Setting::NotSet,
stop_words: Setting::NotSet,
distinct_attribute: Setting::NotSet,
distinct_field: Setting::NotSet,
synonyms: Setting::NotSet,
update_id,
}
@ -144,12 +144,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
}
pub fn reset_distinct_attribute(&mut self) {
self.distinct_attribute = Setting::Reset;
pub fn reset_distinct_field(&mut self) {
self.distinct_field = Setting::Reset;
}
pub fn set_distinct_attribute(&mut self, distinct_attribute: String) {
self.distinct_attribute = Setting::Set(distinct_attribute);
pub fn set_distinct_field(&mut self, distinct_field: String) {
self.distinct_field = Setting::Set(distinct_field);
}
pub fn reset_synonyms(&mut self) {
@ -165,8 +165,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
where
F: Fn(UpdateIndexingStep, u64) + Sync
where
F: Fn(UpdateIndexingStep, u64) + Sync
{
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let update_id = self.update_id;
@ -197,7 +197,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let output = transform.remap_index_documents(
primary_key.to_string(),
old_fields_ids_map,
fields_ids_map.clone())?;
fields_ids_map.clone(),
)?;
// We clear the full database (words-fst, documents ids and documents content).
ClearDocuments::new(self.wtxn, self.index, self.update_id).execute()?;
@ -214,6 +215,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
indexing_builder.thread_pool = self.thread_pool;
indexing_builder.execute_raw(output, &cb)?;
Ok(())
}
@ -242,18 +244,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
Ok(true)
}
fn update_distinct_attribute(&mut self) -> anyhow::Result<bool> {
match self.distinct_attribute {
fn update_distinct_field(&mut self) -> anyhow::Result<bool> {
match self.distinct_field {
Setting::Set(ref attr) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
fields_ids_map
.insert(attr)
.context("field id limit exceeded")?;
self.index.put_distinct_attribute(self.wtxn, &attr)?;
self.index.put_distinct_field(self.wtxn, &attr)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_distinct_attribute(self.wtxn)?; },
Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; },
Setting::NotSet => return Ok(false),
}
Ok(true)
@ -380,7 +382,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
}
fn update_facets(&mut self) -> anyhow::Result<bool> {
fn update_filterable(&mut self) -> anyhow::Result<()> {
match self.filterable_fields {
Setting::Set(ref fields) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
@ -393,9 +395,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; }
Setting::NotSet => return Ok(false)
Setting::NotSet => (),
}
Ok(true)
Ok(())
}
fn update_criteria(&mut self) -> anyhow::Result<()> {
@ -419,20 +421,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
F: Fn(UpdateIndexingStep, u64) + Sync
{
self.index.set_updated_at(self.wtxn, &Utc::now())?;
let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?;
let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
self.update_displayed()?;
let stop_words_updated = self.update_stop_words()?;
let facets_updated = self.update_facets()?;
self.update_distinct_attribute()?;
// update_criteria MUST be called after update_facets, since criterion fields must be set
// as facets.
self.update_filterable()?;
self.update_distinct_field()?;
self.update_criteria()?;
// If there is new faceted fields we indicate that we must reindex as we must
// index new fields as facets. It means that the distinct attribute,
// an Asc/Desc criterion or a filtered attribute as be added or removed.
let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?;
let faceted_updated = old_faceted_fields != new_faceted_fields;
let stop_words_updated = self.update_stop_words()?;
let synonyms_updated = self.update_synonyms()?;
let searchable_updated = self.update_searchable()?;
if stop_words_updated || facets_updated || synonyms_updated || searchable_updated {
if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated {
self.reindex(&progress_callback, old_fields_ids_map)?;
}
Ok(())
}
}
@ -444,7 +455,7 @@ mod tests {
use maplit::{btreeset, hashmap, hashset};
use big_s::S;
use crate::{Criterion, FilterCondition};
use crate::{Criterion, FilterCondition, SearchResult};
use crate::update::{IndexDocuments, UpdateFormat};
use super::*;
@ -669,6 +680,88 @@ mod tests {
assert_eq!(count, 4);
}
#[test]
fn set_asc_desc_field() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// Set the filterable fields to be the age.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
// Don't display the generated `id` field.
builder.set_displayed_fields(vec![S("name"), S("age")]);
builder.set_criteria(vec![S("asc(age)")]);
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &br#"[
{ "name": "kevin", "age": 23 },
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Run an empty query just to ensure that the search results are ordered.
let rtxn = index.read_txn().unwrap();
let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
let documents = index.documents(&rtxn, documents_ids).unwrap();
// Fetch the documents "age" field in the ordre in which the documents appear.
let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap();
let iter = documents.into_iter().map(|(_, doc)| {
let bytes = doc.get(age_field_id).unwrap();
let string = std::str::from_utf8(bytes).unwrap();
string.parse::<u32>().unwrap()
});
assert_eq!(iter.collect::<Vec<_>>(), vec![21, 23, 34]);
}
#[test]
fn set_distinct_field() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// Set the filterable fields to be the age.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
// Don't display the generated `id` field.
builder.set_displayed_fields(vec![S("name"), S("age")]);
builder.set_distinct_field(S("age"));
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &br#"[
{ "name": "kevin", "age": 23 },
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 },
{ "name": "bernard", "age": 34 },
{ "name": "bertrand", "age": 34 },
{ "name": "bernie", "age": 34 },
{ "name": "ben", "age": 34 }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Run an empty query just to ensure that the search results are ordered.
let rtxn = index.read_txn().unwrap();
let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
// There must be at least one document with a 34 as the age.
assert_eq!(documents_ids.len(), 3);
}
#[test]
fn default_stop_words() {
let path = tempfile::tempdir().unwrap();