Implement proximityPrecision setting on milli side

This commit is contained in:
ManyTheFish 2023-12-06 15:49:02 +01:00
parent 0c3fa8cbc4
commit 467b49153d
6 changed files with 224 additions and 66 deletions

View file

@ -32,6 +32,7 @@ use super::helpers::{
MergeFn, MergeableReader,
};
use super::{helpers, TypedChunk};
use crate::proximity::ProximityPrecision;
use crate::{FieldId, Result};
/// Extract data for each databases from obkv documents in parallel.
@ -52,7 +53,7 @@ pub(crate) fn data_from_obkv_documents(
dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>,
exact_attributes: HashSet<FieldId>,
// TODO: add a proximity database deactivation parameter.
proximity_precision: ProximityPrecision,
) -> Result<()> {
puffin::profile_function!();
@ -151,16 +152,17 @@ pub(crate) fn data_from_obkv_documents(
});
}
// TODO: Skip this part if deactivated
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
if proximity_precision == ProximityPrecision::WordScale {
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),

View file

@ -352,6 +352,7 @@ where
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type,
@ -392,6 +393,7 @@ where
dictionary.as_deref(),
max_positions_per_attributes,
exact_attributes,
proximity_precision,
)
});

View file

@ -12,6 +12,7 @@ use super::IndexerConfig;
use crate::criterion::Criterion;
use crate::error::UserError;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::{FieldsIdsMap, Index, OrderBy, Result};
@ -127,7 +128,7 @@ pub struct Settings<'a, 't, 'i> {
max_values_per_facet: Setting<usize>,
sort_facet_values_by: Setting<HashMap<String, OrderBy>>,
pagination_max_total_hits: Setting<usize>,
// TODO: add a proximity database deactivation attribute.
proximity_precision: Setting<ProximityPrecision>,
}
impl<'a, 't, 'i> Settings<'a, 't, 'i> {
@ -159,6 +160,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
max_values_per_facet: Setting::NotSet,
sort_facet_values_by: Setting::NotSet,
pagination_max_total_hits: Setting::NotSet,
proximity_precision: Setting::NotSet,
indexer_config,
}
}
@ -333,6 +335,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.pagination_max_total_hits = Setting::Reset;
}
pub fn set_proximity_precision(&mut self, value: ProximityPrecision) {
self.proximity_precision = Setting::Set(value);
}
pub fn reset_proximity_precision(&mut self) {
self.proximity_precision = Setting::Reset;
}
fn reindex<FP, FA>(
&mut self,
progress_callback: &FP,
@ -862,6 +872,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(())
}
fn update_proximity_precision(&mut self) -> Result<bool> {
let changed = match self.proximity_precision {
Setting::Set(new) => {
let old = self.index.proximity_precision(self.wtxn)?;
if old == Some(new) {
false
} else {
self.index.put_proximity_precision(self.wtxn, new)?;
true
}
}
Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?,
Setting::NotSet => false,
};
Ok(changed)
}
pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
where
FP: Fn(UpdateIndexingStep) + Sync,
@ -898,6 +926,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
let synonyms_updated = self.update_synonyms()?;
let searchable_updated = self.update_searchable()?;
let exact_attributes_updated = self.update_exact_attributes()?;
let proximity_precision = self.update_proximity_precision()?;
if stop_words_updated
|| non_separator_tokens_updated
@ -907,7 +936,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|| synonyms_updated
|| searchable_updated
|| exact_attributes_updated
// TODO: reindex if proximity database is activated
|| proximity_precision
{
self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?;
}
@ -1733,6 +1762,7 @@ mod tests {
max_values_per_facet,
sort_facet_values_by,
pagination_max_total_hits,
proximity_precision,
} = settings;
assert!(matches!(searchable_fields, Setting::NotSet));
assert!(matches!(displayed_fields, Setting::NotSet));
@ -1754,6 +1784,7 @@ mod tests {
assert!(matches!(max_values_per_facet, Setting::NotSet));
assert!(matches!(sort_facet_values_by, Setting::NotSet));
assert!(matches!(pagination_max_total_hits, Setting::NotSet));
assert!(matches!(proximity_precision, Setting::NotSet));
})
.unwrap();
}