1977 lines
74 KiB
Rust
Raw Normal View History

use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
2023-12-07 23:05:26 +01:00
use std::convert::TryInto;
2024-08-27 17:52:09 +02:00
use std::num::NonZeroUsize;
use std::result::Result as StdResult;
2023-12-07 23:05:26 +01:00
use std::sync::Arc;
2020-11-11 17:08:18 +01:00
2023-06-28 18:52:32 +02:00
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
2023-02-13 18:45:13 +01:00
use deserr::{DeserializeError, Deserr};
use itertools::{EitherOrBoth, Itertools};
use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
2022-02-15 11:41:55 +01:00
use time::OffsetDateTime;
use super::del_add::DelAddOperation;
use super::index_documents::{IndexDocumentsConfig, Transform};
use super::IndexerConfig;
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::criterion::Criterion;
use crate::error::UserError;
use crate::index::{
2024-11-19 15:57:56 +01:00
IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO,
DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
};
use crate::order_by_map::OrderByMap;
2024-08-27 17:52:09 +02:00
use crate::prompt::default_max_bytes;
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::settings::{
EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction,
SubEmbeddingSettings, WriteBackToDocuments,
};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldId, FieldsIdsMap, Index, LocalizedAttributesRule, LocalizedFieldIds, Result};
/// Tri-state wrapper used by partial settings updates: a field can be
/// explicitly set to a value, explicitly reset to its default, or left
/// untouched by the current update.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum Setting<T> {
    /// The user provided an explicit value.
    Set(T),
    /// The user asked to revert the setting to its default (sent `null` on the wire).
    Reset,
    /// The setting was absent from the update payload.
    NotSet,
}
2023-02-13 18:45:13 +01:00
// Deserialization through `deserr`: an explicit `null` value means "reset to
// default"; anything else is forwarded to `T`'s own deserialization and wrapped
// in `Set`. A field missing from the payload never reaches this point — it is
// obtained through `Default` (`NotSet`).
impl<T, E> Deserr<E> for Setting<T>
where
    T: Deserr<E>,
    E: DeserializeError,
{
    fn deserialize_from_value<V: deserr::IntoValue>(
        value: deserr::Value<V>,
        location: deserr::ValuePointerRef<'_>,
    ) -> std::result::Result<Self, E> {
        match value {
            // `null` is the wire representation of a reset.
            deserr::Value::Null => Ok(Setting::Reset),
            _ => T::deserialize_from_value(value, location).map(Setting::Set),
        }
    }
}
impl<T> Default for Setting<T> {
    /// A setting that was not mentioned in an update defaults to `NotSet`.
    fn default() -> Self {
        Self::NotSet
    }
}
impl<T> Setting<T> {
2021-10-13 13:05:07 +02:00
pub fn set(self) -> Option<T> {
match self {
Self::Set(value) => Some(value),
_ => None,
}
}
2024-07-22 12:40:04 +02:00
pub fn some_or_not_set(option: Option<T>) -> Self {
match option {
Some(value) => Setting::Set(value),
None => Setting::NotSet,
}
}
2021-10-13 13:05:07 +02:00
pub const fn as_ref(&self) -> Setting<&T> {
match *self {
Self::Set(ref value) => Setting::Set(value),
Self::Reset => Setting::Reset,
Self::NotSet => Setting::NotSet,
}
}
pub const fn is_not_set(&self) -> bool {
matches!(self, Self::NotSet)
}
/// If `Self` is `Reset`, then map self to `Set` with the provided `val`.
pub fn or_reset(self, val: T) -> Self {
match self {
Self::Reset => Self::Set(val),
otherwise => otherwise,
}
}
2024-11-19 17:06:00 +01:00
/// Returns other if self is not set.
pub fn or(self, other: Self) -> Self {
match self {
Setting::Set(_) | Setting::Reset => self,
Setting::NotSet => other,
}
}
/// Returns `true` if applying the new setting changed this setting
pub fn apply(&mut self, new: Self) -> bool
where
T: PartialEq + Eq,
{
if let Setting::NotSet = new {
return false;
}
if self == &new {
return false;
}
*self = new;
true
}
}
// Serialized as an `Option`: `Set(v)` becomes `v`, while both `Reset` and
// `NotSet` serialize to `null`.
impl<T: Serialize> Serialize for Setting<T> {
    fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error>
    where
        S: Serializer,
    {
        match self {
            Self::Set(value) => Some(value),
            // Usually not_set isn't serialized by setting skip_serializing_if field attribute
            Self::NotSet | Self::Reset => None,
        }
        .serialize(serializer)
    }
}
// Serde deserialization mirrors the `deserr` impl above: a present value maps
// to `Set`, an explicit `null` maps to `Reset`. `NotSet` can only come from the
// field being absent (handled by `Default` on the containing struct).
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
    fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        Deserialize::deserialize(deserializer).map(|x| match x {
            Some(x) => Self::Set(x),
            None => Self::Reset, // Reset is forced by sending null value
        })
    }
}
/// Builder collecting a batch of settings changes to apply to an index inside
/// a single write transaction.
///
/// Every configurable field is a [`Setting`]: fields left as `NotSet` are not
/// touched when the update is executed.
pub struct Settings<'a, 't, 'i> {
    // Write transaction the whole batch is applied in.
    wtxn: &'t mut heed::RwTxn<'i>,
    index: &'i Index,
    indexer_config: &'a IndexerConfig,
    searchable_fields: Setting<Vec<String>>,
    displayed_fields: Setting<Vec<String>>,
    filterable_fields: Setting<HashSet<String>>,
    sortable_fields: Setting<HashSet<String>>,
    criteria: Setting<Vec<Criterion>>,
    stop_words: Setting<BTreeSet<String>>,
    non_separator_tokens: Setting<BTreeSet<String>>,
    separator_tokens: Setting<BTreeSet<String>>,
    dictionary: Setting<BTreeSet<String>>,
    distinct_field: Setting<String>,
    synonyms: Setting<BTreeMap<String, Vec<String>>>,
    primary_key: Setting<String>,
    authorize_typos: Setting<bool>,
    min_word_len_two_typos: Setting<u8>,
    min_word_len_one_typo: Setting<u8>,
    exact_words: Setting<BTreeSet<String>>,
    /// Attributes on which typo tolerance is disabled.
    exact_attributes: Setting<HashSet<String>>,
    max_values_per_facet: Setting<usize>,
    sort_facet_values_by: Setting<OrderByMap>,
    pagination_max_total_hits: Setting<usize>,
    proximity_precision: Setting<ProximityPrecision>,
    // Keyed by embedder name; each entry is itself a `Setting` so a single
    // embedder can be reset without touching the others.
    embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>,
    // NOTE(review): presumably milliseconds — confirm against the cutoff consumers.
    search_cutoff: Setting<u64>,
    localized_attributes_rules: Setting<Vec<LocalizedAttributesRule>>,
    prefix_search: Setting<PrefixSearch>,
    facet_search: Setting<bool>,
}
impl<'a, 't, 'i> Settings<'a, 't, 'i> {
/// Creates a settings builder where every setting starts as `NotSet`,
/// i.e. executing it without calling any setter applies no change.
pub fn new(
    wtxn: &'t mut heed::RwTxn<'i>,
    index: &'i Index,
    indexer_config: &'a IndexerConfig,
) -> Settings<'a, 't, 'i> {
    Settings {
        wtxn,
        index,
        searchable_fields: Setting::NotSet,
        displayed_fields: Setting::NotSet,
        filterable_fields: Setting::NotSet,
        sortable_fields: Setting::NotSet,
        criteria: Setting::NotSet,
        stop_words: Setting::NotSet,
        non_separator_tokens: Setting::NotSet,
        separator_tokens: Setting::NotSet,
        dictionary: Setting::NotSet,
        distinct_field: Setting::NotSet,
        synonyms: Setting::NotSet,
        primary_key: Setting::NotSet,
        authorize_typos: Setting::NotSet,
        exact_words: Setting::NotSet,
        min_word_len_two_typos: Setting::NotSet,
        min_word_len_one_typo: Setting::NotSet,
        exact_attributes: Setting::NotSet,
        max_values_per_facet: Setting::NotSet,
        sort_facet_values_by: Setting::NotSet,
        pagination_max_total_hits: Setting::NotSet,
        proximity_precision: Setting::NotSet,
        embedder_settings: Setting::NotSet,
        search_cutoff: Setting::NotSet,
        localized_attributes_rules: Setting::NotSet,
        prefix_search: Setting::NotSet,
        facet_search: Setting::NotSet,
        indexer_config,
    }
}
// --- Fluent setters -------------------------------------------------------
// Each `set_*` records an explicit value and each `reset_*` records a reset;
// nothing is written to the index until the update is executed. For several
// collection-valued settings, setting an empty collection is treated as a
// reset (see `set_stop_words`, `set_synonyms`, ...).

pub fn reset_searchable_fields(&mut self) {
    self.searchable_fields = Setting::Reset;
}

pub fn set_searchable_fields(&mut self, names: Vec<String>) {
    self.searchable_fields = Setting::Set(names);
}

pub fn reset_displayed_fields(&mut self) {
    self.displayed_fields = Setting::Reset;
}

pub fn set_displayed_fields(&mut self, names: Vec<String>) {
    self.displayed_fields = Setting::Set(names);
}

pub fn reset_filterable_fields(&mut self) {
    self.filterable_fields = Setting::Reset;
}

pub fn set_filterable_fields(&mut self, names: HashSet<String>) {
    self.filterable_fields = Setting::Set(names);
}

pub fn set_sortable_fields(&mut self, names: HashSet<String>) {
    self.sortable_fields = Setting::Set(names);
}

pub fn reset_sortable_fields(&mut self) {
    self.sortable_fields = Setting::Reset;
}

pub fn reset_criteria(&mut self) {
    self.criteria = Setting::Reset;
}

pub fn set_criteria(&mut self, criteria: Vec<Criterion>) {
    self.criteria = Setting::Set(criteria);
}

pub fn reset_stop_words(&mut self) {
    self.stop_words = Setting::Reset;
}

// An empty stop-words list is equivalent to a reset.
pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
    self.stop_words =
        if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) }
}

pub fn reset_non_separator_tokens(&mut self) {
    self.non_separator_tokens = Setting::Reset;
}

pub fn set_non_separator_tokens(&mut self, non_separator_tokens: BTreeSet<String>) {
    self.non_separator_tokens = if non_separator_tokens.is_empty() {
        Setting::Reset
    } else {
        Setting::Set(non_separator_tokens)
    }
}

pub fn reset_separator_tokens(&mut self) {
    self.separator_tokens = Setting::Reset;
}

pub fn set_separator_tokens(&mut self, separator_tokens: BTreeSet<String>) {
    self.separator_tokens = if separator_tokens.is_empty() {
        Setting::Reset
    } else {
        Setting::Set(separator_tokens)
    }
}

pub fn reset_dictionary(&mut self) {
    self.dictionary = Setting::Reset;
}

pub fn set_dictionary(&mut self, dictionary: BTreeSet<String>) {
    self.dictionary =
        if dictionary.is_empty() { Setting::Reset } else { Setting::Set(dictionary) }
}

pub fn reset_distinct_field(&mut self) {
    self.distinct_field = Setting::Reset;
}

pub fn set_distinct_field(&mut self, distinct_field: String) {
    self.distinct_field = Setting::Set(distinct_field);
}

pub fn reset_synonyms(&mut self) {
    self.synonyms = Setting::Reset;
}

// An empty synonyms map is equivalent to a reset.
pub fn set_synonyms(&mut self, synonyms: BTreeMap<String, Vec<String>>) {
    self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
}

pub fn reset_primary_key(&mut self) {
    self.primary_key = Setting::Reset;
}

pub fn set_primary_key(&mut self, primary_key: String) {
    self.primary_key = Setting::Set(primary_key);
}
2022-03-16 10:03:18 +01:00
pub fn set_autorize_typos(&mut self, val: bool) {
self.authorize_typos = Setting::Set(val);
}
pub fn reset_authorize_typos(&mut self) {
self.authorize_typos = Setting::Reset;
}
pub fn set_min_word_len_two_typos(&mut self, val: u8) {
self.min_word_len_two_typos = Setting::Set(val);
2022-03-21 13:03:06 +01:00
}
pub fn reset_min_word_len_two_typos(&mut self) {
self.min_word_len_two_typos = Setting::Reset;
2022-03-21 13:03:06 +01:00
}
pub fn set_min_word_len_one_typo(&mut self, val: u8) {
self.min_word_len_one_typo = Setting::Set(val);
2022-03-21 13:03:06 +01:00
}
pub fn reset_min_word_len_one_typo(&mut self) {
self.min_word_len_one_typo = Setting::Reset;
2022-03-21 13:03:06 +01:00
}
2022-03-22 09:55:49 +01:00
pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
2022-03-21 14:03:31 +01:00
self.exact_words = Setting::Set(words);
}
pub fn reset_exact_words(&mut self) {
self.exact_words = Setting::Reset;
}
2022-03-22 19:07:59 +01:00
pub fn set_exact_attributes(&mut self, attrs: HashSet<String>) {
self.exact_attributes = Setting::Set(attrs);
}
pub fn reset_exact_attributes(&mut self) {
self.exact_attributes = Setting::Reset;
}
// --- Faceting, pagination, search-behavior & vector setters ----------------

pub fn set_max_values_per_facet(&mut self, value: usize) {
    self.max_values_per_facet = Setting::Set(value);
}

pub fn reset_max_values_per_facet(&mut self) {
    self.max_values_per_facet = Setting::Reset;
}

pub fn set_sort_facet_values_by(&mut self, value: OrderByMap) {
    self.sort_facet_values_by = Setting::Set(value);
}

pub fn reset_sort_facet_values_by(&mut self) {
    self.sort_facet_values_by = Setting::Reset;
}

pub fn set_pagination_max_total_hits(&mut self, value: usize) {
    self.pagination_max_total_hits = Setting::Set(value);
}

pub fn reset_pagination_max_total_hits(&mut self) {
    self.pagination_max_total_hits = Setting::Reset;
}

pub fn set_proximity_precision(&mut self, value: ProximityPrecision) {
    self.proximity_precision = Setting::Set(value);
}

pub fn reset_proximity_precision(&mut self) {
    self.proximity_precision = Setting::Reset;
}

// Per-embedder settings, keyed by embedder name.
pub fn set_embedder_settings(&mut self, value: BTreeMap<String, Setting<EmbeddingSettings>>) {
    self.embedder_settings = Setting::Set(value);
}

pub fn reset_embedder_settings(&mut self) {
    self.embedder_settings = Setting::Reset;
}

pub fn set_search_cutoff(&mut self, value: u64) {
    self.search_cutoff = Setting::Set(value);
}

pub fn reset_search_cutoff(&mut self) {
    self.search_cutoff = Setting::Reset;
}

pub fn set_localized_attributes_rules(&mut self, value: Vec<LocalizedAttributesRule>) {
    self.localized_attributes_rules = Setting::Set(value);
}

pub fn reset_localized_attributes_rules(&mut self) {
    self.localized_attributes_rules = Setting::Reset;
}

pub fn set_prefix_search(&mut self, value: PrefixSearch) {
    self.prefix_search = Setting::Set(value);
}

pub fn reset_prefix_search(&mut self) {
    self.prefix_search = Setting::Reset;
}

pub fn set_facet_search(&mut self, value: bool) {
    self.facet_search = Setting::Set(value);
}

pub fn reset_facet_search(&mut self) {
    self.facet_search = Setting::Reset;
}
2024-01-23 09:42:48 +01:00
#[tracing::instrument(
level = "trace"
skip(self, progress_callback, should_abort, settings_diff),
2024-01-23 09:42:48 +01:00
target = "indexing::documents"
)]
fn reindex<FP, FA>(
&mut self,
progress_callback: &FP,
should_abort: &FA,
settings_diff: InnerIndexSettingsDiff,
) -> Result<()>
where
FP: Fn(UpdateIndexingStep) + Sync,
FA: Fn() -> bool + Sync,
{
// if the settings are set before any document update, we don't need to do anything, and
// will set the primary key during the first document addition.
if self.index.number_of_documents(self.wtxn)? == 0 {
return Ok(());
}
let transform = Transform::new(
2022-03-23 17:28:41 +01:00
self.wtxn,
self.index,
self.indexer_config,
IndexDocumentsMethod::ReplaceDocuments,
false,
2022-03-23 17:28:41 +01:00
)?;
// We clear the databases and remap the documents fields based on the new `FieldsIdsMap`.
let output = transform.prepare_for_documents_reindexing(self.wtxn, settings_diff)?;
2023-12-07 23:05:26 +01:00
// We index the generated `TransformOutput` which must contain
// all the documents with fields in the newly defined searchable order.
let indexing_builder = IndexDocuments::new(
self.wtxn,
self.index,
self.indexer_config,
IndexDocumentsConfig::default(),
&progress_callback,
&should_abort,
2022-03-23 17:28:41 +01:00
)?;
2023-12-07 23:05:26 +01:00
indexing_builder.execute_raw(output)?;
Ok(())
}
2020-11-11 17:08:18 +01:00
/// Persists the displayed-fields list.
///
/// Returns `true` when a value was written or deleted, `false` when the
/// setting was `NotSet`.
fn update_displayed(&mut self) -> Result<bool> {
    match self.displayed_fields {
        Setting::Set(ref fields) => {
            // fields are deduplicated, only the first occurrence is taken into account
            let names: Vec<_> = fields.iter().unique().map(String::as_str).collect();
            self.index.put_displayed_fields(self.wtxn, &names)?;
        }
        Setting::Reset => {
            self.index.delete_displayed_fields(self.wtxn)?;
        }
        Setting::NotSet => return Ok(false),
    }
    Ok(true)
}
/// Persists the distinct field.
///
/// Returns `true` when a value was written or deleted, `false` when the
/// setting was `NotSet`.
fn update_distinct_field(&mut self) -> Result<bool> {
    match self.distinct_field {
        Setting::Set(ref attr) => {
            self.index.put_distinct_field(self.wtxn, attr)?;
        }
        Setting::Reset => {
            self.index.delete_distinct_field(self.wtxn)?;
        }
        Setting::NotSet => return Ok(false),
    }
    Ok(true)
}
/// Updates the index's searchable attributes.
///
/// Returns `true` when the searchable fields actually changed, signalling
/// the caller that documents must be re-indexed.
fn update_searchable(&mut self) -> Result<bool> {
    match self.searchable_fields {
        Setting::Set(ref fields) => {
            // Check to see if the searchable fields changed before doing anything else
            let old_fields = self.index.searchable_fields(self.wtxn)?;
            let did_change = {
                let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
                new_fields != old_fields
            };
            if !did_change {
                return Ok(false);
            }

            // Since we're updating the settings we can only add new fields at the end of the field id map
            let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
            // fields are deduplicated, only the first occurrence is taken into account
            let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>();

            // Add all the searchable attributes to the field map, and then add the
            // remaining fields from the old field map to the new one
            for name in names.iter() {
                // The fields ids map won't change the field id of already present elements thus only the
                // new fields will be inserted.
                fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
            }

            self.index.put_all_searchable_fields_from_fields_ids_map(
                self.wtxn,
                &names,
                &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME),
                &fields_ids_map,
            )?;
            self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
            Ok(true)
        }
        Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?),
        Setting::NotSet => Ok(false),
    }
}
/// Persists the stop-words list, stored as an FST.
///
/// Returns `true` when the stored FST changed.
fn update_stop_words(&mut self) -> Result<bool> {
    match self.stop_words {
        Setting::Set(ref stop_words) => {
            let current = self.index.stop_words(self.wtxn)?;

            // Apply an unlossy normalization on stop_words
            let stop_words: BTreeSet<String> = stop_words
                .iter()
                .map(|w| w.as_str().normalize(&Default::default()).into_owned())
                .collect();

            // since we can't compare a BTreeSet with an FST we are going to convert the
            // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
            let fst = fst::Set::from_iter(stop_words.into_iter())?;

            // Does the new FST differ from the previous one?
            if current
                .map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes())
            {
                // we want to re-create our FST.
                self.index.put_stop_words(self.wtxn, &fst)?;
                Ok(true)
            } else {
                Ok(false)
            }
        }
        Setting::Reset => Ok(self.index.delete_stop_words(self.wtxn)?),
        Setting::NotSet => Ok(false),
    }
}
2023-07-24 17:00:18 +02:00
/// Persists the user-defined non-separator tokens.
///
/// Returns `true` when the stored list changed.
fn update_non_separator_tokens(&mut self) -> Result<bool> {
    let changes = match self.non_separator_tokens {
        Setting::Set(ref non_separator_tokens) => {
            let current = self.index.non_separator_tokens(self.wtxn)?;

            // Does the new list differ from the previous one?
            if current.map_or(true, |current| &current != non_separator_tokens) {
                self.index.put_non_separator_tokens(self.wtxn, non_separator_tokens)?;
                true
            } else {
                false
            }
        }
        Setting::Reset => self.index.delete_non_separator_tokens(self.wtxn)?,
        Setting::NotSet => false,
    };

    // the synonyms must be updated if non separator tokens have been updated.
    // Re-setting the user-defined synonyms forces them to be re-normalized
    // with the new tokenizer configuration.
    if changes && self.synonyms == Setting::NotSet {
        self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
    }

    Ok(changes)
}
/// Persists the user-defined separator tokens.
///
/// Returns `true` when the stored list changed.
fn update_separator_tokens(&mut self) -> Result<bool> {
    let changes = match self.separator_tokens {
        Setting::Set(ref separator_tokens) => {
            let current = self.index.separator_tokens(self.wtxn)?;

            // Does the new list differ from the previous one?
            if current.map_or(true, |current| &current != separator_tokens) {
                self.index.put_separator_tokens(self.wtxn, separator_tokens)?;
                true
            } else {
                false
            }
        }
        Setting::Reset => self.index.delete_separator_tokens(self.wtxn)?,
        Setting::NotSet => false,
    };

    // the synonyms must be updated if separator tokens have been updated.
    // Re-setting the user-defined synonyms forces them to be re-normalized
    // with the new tokenizer configuration.
    if changes && self.synonyms == Setting::NotSet {
        self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
    }

    Ok(changes)
}
/// Persists the user-defined tokenizer dictionary.
///
/// Returns `true` when the stored list changed.
fn update_dictionary(&mut self) -> Result<bool> {
    let changes = match self.dictionary {
        Setting::Set(ref dictionary) => {
            let current = self.index.dictionary(self.wtxn)?;

            // Does the new list differ from the previous one?
            if current.map_or(true, |current| &current != dictionary) {
                self.index.put_dictionary(self.wtxn, dictionary)?;
                true
            } else {
                false
            }
        }
        Setting::Reset => self.index.delete_dictionary(self.wtxn)?,
        Setting::NotSet => false,
    };

    // the synonyms must be updated if dictionary has been updated.
    // Re-setting the user-defined synonyms forces them to be re-normalized
    // with the new tokenizer configuration.
    if changes && self.synonyms == Setting::NotSet {
        self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
    }

    Ok(changes)
}
/// Normalizes and persists the synonyms map.
///
/// Both keys and values are tokenized with the index's current tokenizer
/// configuration (stop words, separators, dictionary) before being stored.
/// Returns `true` when the normalized map differs from the stored one.
fn update_synonyms(&mut self) -> Result<bool> {
    match self.synonyms {
        Setting::Set(ref user_synonyms) => {
            // Tokenizes `text` and keeps only non-empty word lemmas.
            fn normalize(tokenizer: &Tokenizer<'_>, text: &str) -> Vec<String> {
                tokenizer
                    .tokenize(text)
                    .filter_map(|token| {
                        if token.is_word() && !token.lemma().is_empty() {
                            Some(token.lemma().to_string())
                        } else {
                            None
                        }
                    })
                    .collect::<Vec<_>>()
            }

            // Build a tokenizer configured exactly like the one used at
            // indexing time so normalization matches search behavior.
            let mut builder = TokenizerBuilder::new();
            let stop_words = self.index.stop_words(self.wtxn)?;
            if let Some(ref stop_words) = stop_words {
                builder.stop_words(stop_words);
            }
            let separators = self.index.allowed_separators(self.wtxn)?;
            let separators: Option<Vec<_>> =
                separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
            if let Some(ref separators) = separators {
                builder.separators(separators);
            }
            let dictionary = self.index.dictionary(self.wtxn)?;
            let dictionary: Option<Vec<_>> =
                dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
            if let Some(ref dictionary) = dictionary {
                builder.words_dict(dictionary);
            }
            let tokenizer = builder.build();

            let mut new_synonyms = HashMap::new();
            for (word, synonyms) in user_synonyms {
                // Normalize both the word and associated synonyms.
                let normalized_word = normalize(&tokenizer, word);
                let normalized_synonyms: Vec<_> = synonyms
                    .iter()
                    .map(|synonym| normalize(&tokenizer, synonym))
                    .filter(|synonym| !synonym.is_empty())
                    .collect();

                // Store the normalized synonyms under the normalized word,
                // merging the possible duplicate words.
                if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
                    let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
                    entry.extend(normalized_synonyms.into_iter());
                }
            }

            // Make sure that we don't have duplicate synonyms.
            new_synonyms.iter_mut().for_each(|(_, synonyms)| {
                synonyms.sort_unstable();
                synonyms.dedup();
            });

            let old_synonyms = self.index.synonyms(self.wtxn)?;

            if new_synonyms != old_synonyms {
                // Both the normalized map and the raw user input are stored.
                self.index.put_synonyms(self.wtxn, &new_synonyms, user_synonyms)?;
                Ok(true)
            } else {
                Ok(false)
            }
        }
        Setting::Reset => Ok(self.index.delete_synonyms(self.wtxn)?),
        Setting::NotSet => Ok(false),
    }
}
2022-03-22 19:07:59 +01:00
/// Persists the set of attributes on which typo tolerance is disabled.
///
/// Returns `true` when the stored set changed.
fn update_exact_attributes(&mut self) -> Result<bool> {
    match self.exact_attributes {
        Setting::Set(ref attrs) => {
            // Compare against the currently stored set before writing.
            let old_attrs: HashSet<String> = self
                .index
                .exact_attributes(self.wtxn)?
                .into_iter()
                .map(String::from)
                .collect();
            if attrs == &old_attrs {
                return Ok(false);
            }
            let attrs: Vec<&str> = attrs.iter().map(String::as_str).collect();
            self.index.put_exact_attributes(self.wtxn, &attrs)?;
            Ok(true)
        }
        Setting::Reset => Ok(self.index.delete_exact_attributes(self.wtxn)?),
        Setting::NotSet => Ok(false),
    }
}
/// Persists the filterable fields.
//
// FIX: the previous version copied the `HashSet` element by element into a
// fresh set before writing it; the caller's set can be persisted directly.
fn update_filterable(&mut self) -> Result<()> {
    match self.filterable_fields {
        Setting::Set(ref fields) => {
            self.index.put_filterable_fields(self.wtxn, fields)?;
        }
        Setting::Reset => {
            self.index.delete_filterable_fields(self.wtxn)?;
        }
        Setting::NotSet => (),
    }
    Ok(())
}
2021-08-23 11:37:18 +02:00
/// Persists the sortable fields.
//
// FIX: the previous version copied the `HashSet` element by element into a
// fresh set before writing it; the caller's set can be persisted directly.
fn update_sortable(&mut self) -> Result<()> {
    match self.sortable_fields {
        Setting::Set(ref fields) => {
            self.index.put_sortable_fields(self.wtxn, fields)?;
        }
        Setting::Reset => {
            self.index.delete_sortable_fields(self.wtxn)?;
        }
        Setting::NotSet => (),
    }
    Ok(())
}
fn update_criteria(&mut self) -> Result<()> {
2023-01-11 12:14:17 +01:00
match &self.criteria {
Setting::Set(criteria) => {
self.index.put_criteria(self.wtxn, criteria)?;
2020-12-04 12:02:22 +01:00
}
2021-06-16 18:33:33 +02:00
Setting::Reset => {
self.index.delete_criteria(self.wtxn)?;
}
Setting::NotSet => (),
2020-12-04 12:02:22 +01:00
}
Ok(())
}
/// Sets or resets the primary key.
///
/// Changing (or removing) the primary key is only allowed while the index is
/// empty; otherwise `PrimaryKeyCannotBeChanged` is returned.
fn update_primary_key(&mut self) -> Result<()> {
    match self.primary_key {
        Setting::Set(ref primary_key) => {
            if self.index.number_of_documents(self.wtxn)? == 0 {
                let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
                fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?;
                self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
                self.index.put_primary_key(self.wtxn, primary_key)?;
                Ok(())
            } else {
                // Invariant: an index with documents always has a primary key,
                // hence the `unwrap`.
                let curr_primary_key = self.index.primary_key(self.wtxn)?.unwrap().to_string();
                if primary_key == &curr_primary_key {
                    // Re-setting the same key is a no-op, not an error.
                    Ok(())
                } else {
                    Err(UserError::PrimaryKeyCannotBeChanged(curr_primary_key).into())
                }
            }
        }
        Setting::Reset => {
            if self.index.number_of_documents(self.wtxn)? == 0 {
                self.index.delete_primary_key(self.wtxn)?;
                Ok(())
            } else {
                // Invariant: an index with documents always has a primary key.
                let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
                Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
            }
        }
        Setting::NotSet => Ok(()),
    }
}
2022-03-16 10:03:18 +01:00
/// Persists the typo-tolerance flag.
fn update_authorize_typos(&mut self) -> Result<()> {
    // `Reset` restores the default, which is "typos allowed" (`true`).
    let flag = match self.authorize_typos {
        Setting::Set(flag) => flag,
        Setting::Reset => true,
        Setting::NotSet => return Ok(()),
    };
    self.index.put_authorize_typos(self.wtxn, flag)?;
    Ok(())
}
2022-03-21 13:03:06 +01:00
/// Persists the minimum word lengths for one and two typos.
///
/// Enforces the invariant `one <= two`: when only one of the pair is being
/// set, it is validated against the value currently stored in the index.
/// `Reset` is mapped to the corresponding default constant via `or_reset`.
fn update_min_typo_word_len(&mut self) -> Result<()> {
    let one = self.min_word_len_one_typo.or_reset(DEFAULT_MIN_WORD_LEN_ONE_TYPO);
    let two = self.min_word_len_two_typos.or_reset(DEFAULT_MIN_WORD_LEN_TWO_TYPOS);
    match (one, two) {
        (Setting::Set(one), Setting::Set(two)) => {
            if one > two {
                return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
            } else {
                self.index.put_min_word_len_one_typo(self.wtxn, one)?;
                self.index.put_min_word_len_two_typos(self.wtxn, two)?;
            }
        }
        (Setting::Set(one), _) => {
            // Only `one` changes: validate it against the stored `two`.
            let two = self.index.min_word_len_two_typos(self.wtxn)?;
            if one > two {
                return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
            } else {
                self.index.put_min_word_len_one_typo(self.wtxn, one)?;
            }
        }
        (_, Setting::Set(two)) => {
            // Only `two` changes: validate it against the stored `one`.
            let one = self.index.min_word_len_one_typo(self.wtxn)?;
            if one > two {
                return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
            } else {
                self.index.put_min_word_len_two_typos(self.wtxn, two)?;
            }
        }
        _ => (),
    }

    Ok(())
}
2022-03-21 14:03:31 +01:00
/// Normalizes and persists the exact-words list as an FST.
///
/// Words are normalized with a tokenizer configured with the index's stop
/// words so they match what is produced at indexing time. `Reset` stores an
/// empty FST rather than deleting the entry.
fn update_exact_words(&mut self) -> Result<()> {
    match self.exact_words {
        Setting::Set(ref mut words) => {
            // Concatenates the lemmas of every token of `text`.
            fn normalize(tokenizer: &Tokenizer<'_>, text: &str) -> String {
                tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
            }

            let mut builder = TokenizerBuilder::new();

            let stop_words = self.index.stop_words(self.wtxn)?;
            if let Some(ref stop_words) = stop_words {
                builder.stop_words(stop_words);
            }
            let tokenizer = builder.build();

            let mut words: Vec<_> =
                words.iter().map(|word| normalize(&tokenizer, word)).collect();

            // normalization could reorder words
            // (fst::Set::from_iter requires lexicographically sorted input)
            words.sort_unstable();

            let words = fst::Set::from_iter(words.iter())?;
            self.index.put_exact_words(self.wtxn, &words)?;
        }
        Setting::Reset => {
            self.index.put_exact_words(self.wtxn, &fst::Set::default())?;
        }
        Setting::NotSet => (),
    }

    Ok(())
}
/// Persists the maximum number of values returned per facet.
fn update_max_values_per_facet(&mut self) -> Result<()> {
    if let Setting::Set(max) = self.max_values_per_facet {
        // The index stores this limit as a `u64` regardless of `usize` width.
        self.index.put_max_values_per_facet(self.wtxn, max as u64)?;
    } else if let Setting::Reset = self.max_values_per_facet {
        self.index.delete_max_values_per_facet(self.wtxn)?;
    }
    Ok(())
}
/// Persists the per-facet sort order map.
fn update_sort_facet_values_by(&mut self) -> Result<()> {
    match self.sort_facet_values_by.as_ref() {
        Setting::Set(value) => {
            self.index.put_sort_facet_values_by(self.wtxn, value)?;
        }
        Setting::Reset => {
            self.index.delete_sort_facet_values_by(self.wtxn)?;
        }
        Setting::NotSet => (),
    }
    Ok(())
}
/// Persists the pagination `maxTotalHits` limit.
fn update_pagination_max_total_hits(&mut self) -> Result<()> {
    if let Setting::Set(max) = self.pagination_max_total_hits {
        // The index stores this limit as a `u64` regardless of `usize` width.
        self.index.put_pagination_max_total_hits(self.wtxn, max as u64)?;
    } else if let Setting::Reset = self.pagination_max_total_hits {
        self.index.delete_pagination_max_total_hits(self.wtxn)?;
    }
    Ok(())
}
/// Persists the proximity precision.
///
/// Returns `true` when the effective precision changed; writing the default
/// value over a previously-unset setting is NOT considered a change.
fn update_proximity_precision(&mut self) -> Result<bool> {
    let changed = match self.proximity_precision {
        Setting::Set(new) => {
            let old = self.index.proximity_precision(self.wtxn)?;
            if old == Some(new) {
                false
            } else {
                self.index.put_proximity_precision(self.wtxn, new)?;
                // First explicit write of the default value is a no-op change-wise.
                old.is_some() || new != ProximityPrecision::default()
            }
        }
        Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?,
        Setting::NotSet => false,
    };
    Ok(changed)
}
2024-11-19 15:57:56 +01:00
/// Persists the prefix-search mode.
///
/// Returns `true` when the effective mode changed; writing the default
/// value over a previously-unset setting is NOT considered a change.
fn update_prefix_search(&mut self) -> Result<bool> {
    let changed = match self.prefix_search {
        Setting::Set(new) => {
            let old = self.index.prefix_search(self.wtxn)?;
            if old == Some(new) {
                false
            } else {
                self.index.put_prefix_search(self.wtxn, new)?;
                // First explicit write of the default value is a no-op change-wise.
                old.is_some() || new != PrefixSearch::default()
            }
        }
        Setting::Reset => self.index.delete_prefix_search(self.wtxn)?,
        Setting::NotSet => false,
    };
    Ok(changed)
}
/// Persists the facet-search toggle.
///
/// Returns `true` when the stored value changed.
fn update_facet_search(&mut self) -> Result<bool> {
    match self.facet_search {
        Setting::Set(new) => {
            let old = self.index.facet_search(self.wtxn)?;
            if old == new {
                return Ok(false);
            }
            self.index.put_facet_search(self.wtxn, new)?;
            Ok(true)
        }
        Setting::Reset => Ok(self.index.delete_facet_search(self.wtxn)?),
        Setting::NotSet => Ok(false),
    }
}
/// Applies the embedder settings and returns, per embedder name, the action
/// the indexer must take (reindex, write vectors back to documents, ...).
///
/// The pending setting is `mem::take`n out of `self` because applying it
/// consumes the per-embedder sub-settings.
fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> {
    match std::mem::take(&mut self.embedder_settings) {
        Setting::Set(configs) => self.update_embedding_configs_set(configs),
        Setting::Reset => {
            // all vectors should be written back to documents
            let old_configs = self.index.embedding_configs(self.wtxn)?;
            let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs
                .into_iter()
                .map(|IndexEmbeddingConfig { name, config, user_provided }| -> Result<_> {
                    // An existing config without a category id is a DB inconsistency.
                    let embedder_id =
                        self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
                            crate::InternalError::DatabaseMissingEntry {
                                db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
                                key: None,
                            },
                        )?;
                    Ok((
                        name,
                        EmbedderAction::with_write_back(
                            WriteBackToDocuments { embedder_id, user_provided },
                            config.quantized(),
                        ),
                    ))
                })
                .collect();
            let remove_all = remove_all?;

            // Clear the id mapping and the stored configs only once every
            // write-back action has been successfully built.
            self.index.embedder_category_id.clear(self.wtxn)?;
            self.index.delete_embedding_configs(self.wtxn)?;
            Ok(remove_all)
        }
        Setting::NotSet => Ok(Default::default()),
    }
}
fn update_embedding_configs_set(
&mut self,
configs: BTreeMap<String, Setting<EmbeddingSettings>>,
) -> Result<BTreeMap<String, EmbedderAction>> {
use crate::vector::settings::SettingsDiff;
let old_configs = self.index.embedding_configs(self.wtxn)?;
let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs
.into_iter()
.map(|IndexEmbeddingConfig { name, config, user_provided }| {
(name, (config.into(), user_provided))
})
.collect();
let mut updated_configs = BTreeMap::new();
let mut embedder_actions = BTreeMap::new();
for joined in old_configs
.into_iter()
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
{
match joined {
// updated config
EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => {
let was_quantized = old.binary_quantized.set().unwrap_or_default();
let settings_diff = SettingsDiff::from_settings(&name, old, new)?;
match settings_diff {
SettingsDiff::Remove => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"removing embedder"
);
let embedder_id =
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
crate::InternalError::DatabaseMissingEntry {
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
key: None,
},
)?;
// free id immediately
self.index.embedder_category_id.delete(self.wtxn, &name)?;
embedder_actions.insert(
name,
EmbedderAction::with_write_back(
WriteBackToDocuments { embedder_id, user_provided },
was_quantized,
),
);
}
SettingsDiff::Reindex { action, updated_settings, quantize } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
?action,
"reindex embedder"
);
embedder_actions.insert(
name.clone(),
EmbedderAction::with_reindex(action, was_quantized)
.with_is_being_quantized(quantize),
);
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"update without reindex embedder"
);
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
if quantize {
embedder_actions.insert(
name.clone(),
EmbedderAction::default().with_is_being_quantized(true),
);
}
updated_configs.insert(name, (new, user_provided));
}
}
}
// unchanged config
EitherOrBoth::Left((name, (setting, user_provided))) => {
tracing::debug!(embedder = name, "unchanged embedder");
updated_configs.insert(name, (Setting::Set(setting), user_provided));
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
tracing::debug!(embedder = name, "new embedder");
// apply the default source in case the source was not set so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
embedder_actions.insert(
name.clone(),
EmbedderAction::with_reindex(ReindexAction::FullReindex, false),
);
updated_configs.insert(name, (setting, RoaringBitmap::new()));
}
}
}
let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];
for res in self.index.embedder_category_id.iter(self.wtxn)? {
let (_name, id) = res?;
free_indices[id as usize] = false;
}
let mut free_indices = free_indices.iter_mut().enumerate();
let mut find_free_index =
move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);
for (name, action) in embedder_actions.iter() {
// ignore actions that are not possible for a new embedder
if matches!(action.reindex(), Some(ReindexAction::FullReindex))
&& self.index.embedder_category_id.get(self.wtxn, name)?.is_none()
{
let id =
find_free_index().ok_or(UserError::TooManyEmbedders(updated_configs.len()))?;
tracing::debug!(embedder = name, id, "assigning free id to new embedder");
self.index.embedder_category_id.put(self.wtxn, name, &id)?;
}
2024-04-03 11:19:45 +02:00
}
let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs
.into_iter()
.filter_map(|(name, (config, user_provided))| match config {
Setting::Set(config) => {
Some(IndexEmbeddingConfig { name, config: config.into(), user_provided })
}
Setting::Reset => None,
Setting::NotSet => Some(IndexEmbeddingConfig {
name,
config: EmbeddingSettings::default().into(),
user_provided,
}),
})
.collect();
if updated_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, updated_configs)?;
}
Ok(embedder_actions)
}
2024-03-11 18:24:21 +01:00
fn update_search_cutoff(&mut self) -> Result<bool> {
let changed = match self.search_cutoff {
Setting::Set(new) => {
let old = self.index.search_cutoff(self.wtxn)?;
if old == Some(new) {
false
} else {
self.index.put_search_cutoff(self.wtxn, new)?;
true
}
}
Setting::Reset => self.index.delete_search_cutoff(self.wtxn)?,
Setting::NotSet => false,
};
Ok(changed)
}
2024-07-25 10:16:23 +02:00
fn update_localized_attributes_rules(&mut self) -> Result<()> {
match &self.localized_attributes_rules {
Setting::Set(new) => {
let old = self.index.localized_attributes_rules(self.wtxn)?;
2024-07-25 10:16:23 +02:00
if old.as_ref() != Some(new) {
self.index.put_localized_attributes_rules(self.wtxn, new.clone())?;
}
}
2024-07-25 10:16:23 +02:00
Setting::Reset => {
self.index.delete_localized_attributes_rules(self.wtxn)?;
}
Setting::NotSet => (),
}
2024-07-25 10:16:23 +02:00
Ok(())
}
pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
2021-06-16 18:33:33 +02:00
where
FP: Fn(UpdateIndexingStep) + Sync,
FA: Fn() -> bool + Sync,
{
2022-02-15 11:41:55 +01:00
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
// never trigger re-indexing
self.update_displayed()?;
self.update_distinct_field()?;
self.update_criteria()?;
self.update_primary_key()?;
2022-03-16 10:03:18 +01:00
self.update_authorize_typos()?;
2022-03-21 13:03:06 +01:00
self.update_min_typo_word_len()?;
2022-03-21 14:03:31 +01:00
self.update_exact_words()?;
self.update_max_values_per_facet()?;
self.update_sort_facet_values_by()?;
self.update_pagination_max_total_hits()?;
self.update_search_cutoff()?;
// could trigger re-indexing
self.update_filterable()?;
self.update_sortable()?;
self.update_stop_words()?;
self.update_non_separator_tokens()?;
self.update_separator_tokens()?;
self.update_dictionary()?;
self.update_synonyms()?;
self.update_searchable()?;
self.update_exact_attributes()?;
self.update_proximity_precision()?;
2024-11-19 15:57:56 +01:00
self.update_prefix_search()?;
self.update_facet_search()?;
self.update_localized_attributes_rules()?;
let embedding_config_updates = self.update_embedding_configs()?;
2022-03-22 19:07:59 +01:00
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
let primary_key_id = self
.index
.primary_key(self.wtxn)?
.and_then(|name| new_inner_settings.fields_ids_map.id(name));
let settings_update_only = true;
let inner_settings_diff = InnerIndexSettingsDiff::new(
old_inner_settings,
new_inner_settings,
primary_key_id,
embedding_config_updates,
settings_update_only,
);
2024-03-11 18:24:21 +01:00
if inner_settings_diff.any_reindexing_needed() {
self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
}
Ok(())
}
}
2024-04-03 11:19:45 +02:00
pub struct InnerIndexSettingsDiff {
pub(crate) old: InnerIndexSettings,
pub(crate) new: InnerIndexSettings,
pub(crate) primary_key_id: Option<FieldId>,
pub(crate) embedding_config_updates: BTreeMap<String, EmbedderAction>,
2024-04-03 11:19:45 +02:00
pub(crate) settings_update_only: bool,
/// The set of only the additional searchable fields.
/// If any other searchable field has been modified, is set to None.
pub(crate) only_additional_fields: Option<HashSet<String>>,
// Cache the check to see if all the stop_words, allowed_separators, dictionary,
// exact_attributes, proximity_precision are different.
pub(crate) cache_reindex_searchable_without_user_defined: bool,
// Cache the check to see if the user_defined_searchables are different.
pub(crate) cache_user_defined_searchables: bool,
// Cache the check to see if the exact_attributes are different.
pub(crate) cache_exact_attributes: bool,
}
impl InnerIndexSettingsDiff {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::settings")]
pub(crate) fn new(
old_settings: InnerIndexSettings,
new_settings: InnerIndexSettings,
primary_key_id: Option<FieldId>,
2024-09-02 12:58:09 +02:00
mut embedding_config_updates: BTreeMap<String, EmbedderAction>,
settings_update_only: bool,
) -> Self {
let only_additional_fields = match (
&old_settings.user_defined_searchable_fields,
&new_settings.user_defined_searchable_fields,
) {
(None, None) | (Some(_), None) | (None, Some(_)) => None, // None means *
(Some(old), Some(new)) => {
let old: HashSet<_> = old.iter().cloned().collect();
let new: HashSet<_> = new.iter().cloned().collect();
if old.difference(&new).next().is_none() {
// if no field has been removed return only the additional ones
Some(&new - &old).filter(|x| !x.is_empty())
} else {
None
}
}
};
let cache_reindex_searchable_without_user_defined = {
old_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
!= new_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|| old_settings.allowed_separators != new_settings.allowed_separators
|| old_settings.dictionary != new_settings.dictionary
|| old_settings.proximity_precision != new_settings.proximity_precision
2024-11-19 15:57:56 +01:00
|| old_settings.prefix_search != new_settings.prefix_search
|| old_settings.localized_searchable_fields_ids
!= new_settings.localized_searchable_fields_ids
};
let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
let cache_user_defined_searchables = old_settings.user_defined_searchable_fields
!= new_settings.user_defined_searchable_fields;
2024-09-02 12:58:09 +02:00
// if the user-defined searchables changed, then we need to reindex prompts.
if cache_user_defined_searchables {
for (embedder_name, (config, _, _quantized)) in
new_settings.embedding_configs.inner_as_ref()
{
let was_quantized =
2024-09-19 16:18:36 +02:00
old_settings.embedding_configs.get(embedder_name).map_or(false, |conf| conf.2);
2024-09-02 12:58:09 +02:00
// skip embedders that don't use document templates
if !config.uses_document_template() {
continue;
}
// note: this could currently be entry.or_insert(..), but we're future-proofing with an explicit match
// this always makes the code clearer by explicitly handling the cases
match embedding_config_updates.entry(embedder_name.clone()) {
std::collections::btree_map::Entry::Vacant(entry) => {
entry.insert(EmbedderAction::with_reindex(
ReindexAction::RegeneratePrompts,
was_quantized,
));
}
std::collections::btree_map::Entry::Occupied(entry) => {
let EmbedderAction {
was_quantized: _,
2024-09-19 16:18:36 +02:00
is_being_quantized: _,
write_back: _, // We are deleting this embedder, so no point in regeneration
reindex: _, // We are already fully reindexing
} = entry.get();
2024-09-02 12:58:09 +02:00
}
};
}
}
InnerIndexSettingsDiff {
old: old_settings,
new: new_settings,
primary_key_id,
embedding_config_updates,
settings_update_only,
only_additional_fields,
cache_reindex_searchable_without_user_defined,
cache_user_defined_searchables,
cache_exact_attributes,
}
}
2024-04-03 11:19:45 +02:00
pub fn any_reindexing_needed(&self) -> bool {
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
}
2024-04-03 11:19:45 +02:00
pub fn reindex_searchable(&self) -> bool {
self.cache_reindex_searchable_without_user_defined
|| self.cache_exact_attributes
|| self.cache_user_defined_searchables
}
pub fn reindex_proximities(&self) -> bool {
// if any searchable settings force the reindexing
(self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables)
// and if any settings needs the proximity database created
&& (self.old.proximity_precision == ProximityPrecision::ByAttribute
2024-06-05 16:00:24 +02:00
|| self.new.proximity_precision == ProximityPrecision::ByAttribute)
}
pub fn reindex_searchable_id(&self, id: FieldId) -> Option<DelAddOperation> {
if self.cache_reindex_searchable_without_user_defined || self.cache_exact_attributes {
Some(DelAddOperation::DeletionAndAddition)
} else if let Some(only_additional_fields) = &self.only_additional_fields {
let additional_field = self.new.fields_ids_map.name(id).unwrap();
if only_additional_fields.contains(additional_field) {
Some(DelAddOperation::Addition)
} else {
None
}
} else if self.cache_user_defined_searchables {
Some(DelAddOperation::DeletionAndAddition)
} else {
None
}
}
2024-11-19 15:57:56 +01:00
pub fn facet_fids_changed(&self) -> bool {
2024-04-03 11:19:45 +02:00
let existing_fields = &self.new.existing_fields;
if existing_fields.iter().any(|field| field.contains('.')) {
return true;
}
2024-04-03 11:19:45 +02:00
let old_faceted_fields = &self.old.user_defined_faceted_fields;
if old_faceted_fields.iter().any(|field| field.contains('.')) {
return true;
}
// If there is new faceted fields we indicate that we must reindex as we must
// index new fields as facets. It means that the distinct attribute,
// an Asc/Desc criterion or a filtered attribute as be added or removed.
2024-04-03 11:19:45 +02:00
let new_faceted_fields = &self.new.user_defined_faceted_fields;
if new_faceted_fields.iter().any(|field| field.contains('.')) {
return true;
}
(existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields)
2024-11-19 15:57:56 +01:00
}
pub fn global_facet_settings_changed(&self) -> bool {
self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids
|| self.old.facet_search != self.new.facet_search
}
pub fn reindex_facets(&self) -> bool {
self.facet_fids_changed() || self.global_facet_settings_changed()
}
2024-04-03 11:19:45 +02:00
pub fn reindex_vectors(&self) -> bool {
!self.embedding_config_updates.is_empty()
}
2024-04-03 11:19:45 +02:00
pub fn settings_update_only(&self) -> bool {
self.settings_update_only
}
pub fn run_geo_indexing(&self) -> bool {
self.old.geo_fields_ids != self.new.geo_fields_ids
|| (!self.settings_update_only && self.new.geo_fields_ids.is_some())
}
2024-05-29 11:26:07 +02:00
pub fn modified_faceted_fields(&self) -> HashSet<String> {
&self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields
}
}
#[derive(Clone)]
pub(crate) struct InnerIndexSettings {
pub stop_words: Option<fst::Set<Vec<u8>>>,
pub allowed_separators: Option<BTreeSet<String>>,
pub dictionary: Option<BTreeSet<String>>,
pub fields_ids_map: FieldsIdsMap,
pub user_defined_faceted_fields: HashSet<String>,
pub user_defined_searchable_fields: Option<Vec<String>>,
pub faceted_fields_ids: HashSet<FieldId>,
pub searchable_fields_ids: Vec<FieldId>,
pub exact_attributes: HashSet<FieldId>,
pub proximity_precision: ProximityPrecision,
pub embedding_configs: EmbeddingConfigs,
pub existing_fields: HashSet<String>,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
pub non_searchable_fields_ids: Vec<FieldId>,
pub non_faceted_fields_ids: Vec<FieldId>,
pub localized_searchable_fields_ids: LocalizedFieldIds,
pub localized_faceted_fields_ids: LocalizedFieldIds,
2024-11-19 15:57:56 +01:00
pub prefix_search: PrefixSearch,
pub facet_search: bool,
}
impl InnerIndexSettings {
pub fn from_index(
index: &Index,
rtxn: &heed::RoTxn<'_>,
embedding_configs: Option<EmbeddingConfigs>,
) -> Result<Self> {
let stop_words = index.stop_words(rtxn)?;
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
let allowed_separators = index.allowed_separators(rtxn)?;
let dictionary = index.dictionary(rtxn)?;
2024-05-21 13:39:46 +02:00
let mut fields_ids_map = index.fields_ids_map(rtxn)?;
let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
let user_defined_searchable_fields =
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
let exact_attributes = index.exact_attributes_ids(rtxn)?;
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
let embedding_configs = match embedding_configs {
Some(embedding_configs) => embedding_configs,
None => embedders(index.embedding_configs(rtxn)?)?,
};
2024-11-19 15:57:56 +01:00
let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default();
let facet_search = index.facet_search(rtxn)?;
let existing_fields: HashSet<_> = index
.field_distribution(rtxn)?
.into_iter()
.filter_map(|(field, count)| (count != 0).then_some(field))
.collect();
// index.fields_ids_map($a)? ==>> fields_ids_map
let geo_fields_ids = match fields_ids_map.id(RESERVED_GEO_FIELD_NAME) {
Some(gfid) => {
let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid);
let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = fields_ids_map
.insert("_geo.lat")
.zip(fields_ids_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
let localized_attributes_rules = index.localized_attributes_rules(rtxn)?;
let localized_searchable_fields_ids = LocalizedFieldIds::new(
&localized_attributes_rules,
&fields_ids_map,
searchable_fields_ids.iter().cloned(),
);
let localized_faceted_fields_ids = LocalizedFieldIds::new(
&localized_attributes_rules,
&fields_ids_map,
faceted_fields_ids.iter().cloned(),
);
let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME);
searchable_fields_ids.retain(|id| !vectors_fids.contains(id));
faceted_fields_ids.retain(|id| !vectors_fids.contains(id));
Ok(Self {
stop_words,
allowed_separators,
dictionary,
fields_ids_map,
user_defined_faceted_fields,
user_defined_searchable_fields,
faceted_fields_ids,
searchable_fields_ids,
exact_attributes,
proximity_precision,
embedding_configs,
existing_fields,
geo_fields_ids,
non_searchable_fields_ids: vectors_fids.clone(),
non_faceted_fields_ids: vectors_fids.clone(),
localized_searchable_fields_ids,
localized_faceted_fields_ids,
2024-11-19 15:57:56 +01:00
prefix_search,
facet_search,
})
}
// find and insert the new field ids
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn<'_>, index: &Index) -> Result<()> {
let new_facets = self
.fields_ids_map
.iter()
.filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid))
.filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields))
.map(|(_fid, field)| field.to_string())
.collect();
index.put_faceted_fields(wtxn, &new_facets)?;
self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?;
let localized_attributes_rules = index.localized_attributes_rules(wtxn)?;
self.localized_faceted_fields_ids = LocalizedFieldIds::new(
&localized_attributes_rules,
&self.fields_ids_map,
self.faceted_fields_ids.iter().cloned(),
);
Ok(())
}
// find and insert the new field ids
pub fn recompute_searchables(
&mut self,
wtxn: &mut heed::RwTxn<'_>,
index: &Index,
) -> Result<()> {
2024-05-07 17:56:40 +02:00
let searchable_fields = self
.user_defined_searchable_fields
.as_ref()
.map(|searchable| searchable.iter().map(|s| s.as_str()).collect::<Vec<_>>());
// in case new fields were introduced we're going to recreate the searchable fields.
if let Some(searchable_fields) = searchable_fields {
index.put_all_searchable_fields_from_fields_ids_map(
wtxn,
&searchable_fields,
&self.non_searchable_fields_ids,
&self.fields_ids_map,
)?;
}
self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
let localized_attributes_rules = index.localized_attributes_rules(wtxn)?;
self.localized_searchable_fields_ids = LocalizedFieldIds::new(
&localized_attributes_rules,
&self.fields_ids_map,
self.searchable_fields_ids.iter().cloned(),
);
Ok(())
}
}
/// Instantiates an embedder (and its prompt) for every embedding config
/// stored in the index, failing on the first config that cannot be built.
fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<EmbeddingConfigs> {
    embedding_configs
        .into_iter()
        .map(
            |IndexEmbeddingConfig {
                 name,
                 config: EmbeddingConfig { embedder_options, prompt, quantized },
                 ..
             }| {
                let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
                let embedder = Arc::new(
                    Embedder::new(embedder_options.clone())
                        .map_err(crate::vector::Error::from)
                        .map_err(crate::Error::from)?,
                );
                Ok((name, (embedder, prompt, quantized.unwrap_or_default())))
            },
        )
        .collect::<Result<_>>()
        .map(EmbeddingConfigs::new)
}
fn validate_prompt(
name: &str,
new_prompt: Setting<String>,
max_bytes: Setting<usize>,
) -> Result<Setting<String>> {
match new_prompt {
Setting::Set(template) => {
let max_bytes = match max_bytes.set() {
2024-08-27 17:52:09 +02:00
Some(max_bytes) => NonZeroUsize::new(max_bytes).ok_or_else(|| {
crate::error::UserError::InvalidSettingsDocumentTemplateMaxBytes {
embedder_name: name.to_owned(),
}
})?,
None => default_max_bytes(),
};
// validate
2024-08-27 17:52:09 +02:00
let template = crate::prompt::Prompt::new(
template,
// always specify a max_bytes
Some(max_bytes),
)
.map(|prompt| crate::prompt::PromptData::from(prompt).template)
.map_err(|inner| UserError::InvalidPromptForEmbeddings(name.to_owned(), inner))?;
Ok(Setting::Set(template))
}
new => Ok(new),
}
}
pub fn validate_embedding_settings(
settings: Setting<EmbeddingSettings>,
name: &str,
) -> Result<Setting<EmbeddingSettings>> {
let Setting::Set(settings) = settings else { return Ok(settings) };
2024-03-25 10:05:38 +01:00
let EmbeddingSettings {
source,
model,
revision,
2025-02-18 17:12:23 +01:00
pooling,
2024-03-25 10:05:38 +01:00
api_key,
dimensions,
document_template,
2024-08-27 17:52:09 +02:00
document_template_max_bytes,
2024-03-25 10:05:38 +01:00
url,
2024-07-16 16:01:26 +02:00
request,
response,
search_embedder,
mut indexing_embedder,
2024-03-27 11:50:22 +01:00
distribution,
2024-07-22 12:04:05 +02:00
headers,
binary_quantized: binary_quantize,
2024-03-25 10:05:38 +01:00
} = settings;
let document_template = validate_prompt(name, document_template, document_template_max_bytes)?;
if let Some(0) = dimensions.set() {
return Err(crate::error::UserError::InvalidSettingsDimensions {
embedder_name: name.to_owned(),
}
.into());
}
2024-03-25 10:05:58 +01:00
if let Some(url) = url.as_ref().set() {
url::Url::parse(url).map_err(|error| crate::error::UserError::InvalidUrl {
embedder_name: name.to_owned(),
inner_error: error,
url: url.to_owned(),
})?;
}
2024-07-16 16:01:26 +02:00
if let Some(request) = request.as_ref().set() {
let request = crate::vector::rest::Request::new(request.to_owned())
.map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?;
if let Some(response) = response.as_ref().set() {
crate::vector::rest::Response::new(response.to_owned(), &request)
.map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?;
}
}
let Some(inferred_source) = source.set() else {
// we are validating the fused settings, so we always have a source
return Ok(Setting::Set(EmbeddingSettings {
source,
model,
revision,
2025-02-18 17:12:23 +01:00
pooling,
api_key,
dimensions,
document_template,
2024-08-27 17:52:09 +02:00
document_template_max_bytes,
2024-03-25 10:05:38 +01:00
url,
2024-07-16 16:01:26 +02:00
request,
response,
search_embedder,
indexing_embedder,
2024-03-27 11:50:22 +01:00
distribution,
2024-07-22 12:04:05 +02:00
headers,
binary_quantized: binary_quantize,
}));
};
EmbeddingSettings::check_settings(
name,
inferred_source,
NestingContext::NotNested,
&model,
&revision,
&pooling,
&dimensions,
&api_key,
&url,
&request,
&response,
&document_template,
&document_template_max_bytes,
&headers,
&search_embedder,
&indexing_embedder,
&binary_quantize,
&distribution,
)?;
match inferred_source {
EmbedderSource::OpenAi => {
if let Setting::Set(model) = &model {
let model = crate::vector::openai::EmbeddingModel::from_name(model.as_str())
.ok_or(crate::error::UserError::InvalidOpenAiModel {
embedder_name: name.to_owned(),
model: model.clone(),
})?;
if let Setting::Set(dimensions) = dimensions {
if !model.supports_overriding_dimensions()
&& dimensions != model.default_dimensions()
{
return Err(crate::error::UserError::InvalidOpenAiModelDimensions {
embedder_name: name.to_owned(),
model: model.name(),
dimensions,
expected_dimensions: model.default_dimensions(),
}
.into());
}
if dimensions > model.default_dimensions() {
return Err(crate::error::UserError::InvalidOpenAiModelDimensionsMax {
embedder_name: name.to_owned(),
model: model.name(),
dimensions,
max_dimensions: model.default_dimensions(),
}
.into());
}
}
}
}
EmbedderSource::Ollama
| EmbedderSource::HuggingFace
| EmbedderSource::UserProvided
| EmbedderSource::Rest => {}
EmbedderSource::Composite => {
if let Setting::Set(embedder) = &search_embedder {
if let Some(source) = embedder.source.set() {
let search_embedder = match embedder.search_embedder.clone() {
Setting::Set(search_embedder) => Setting::Set(deserialize_sub_embedder(
search_embedder,
name,
NestingContext::Search,
)?),
Setting::Reset => Setting::Reset,
Setting::NotSet => Setting::NotSet,
};
let indexing_embedder = match embedder.indexing_embedder.clone() {
Setting::Set(indexing_embedder) => Setting::Set(deserialize_sub_embedder(
indexing_embedder,
name,
NestingContext::Search,
)?),
Setting::Reset => Setting::Reset,
Setting::NotSet => Setting::NotSet,
};
EmbeddingSettings::check_nested_source(name, source, NestingContext::Search)?;
EmbeddingSettings::check_settings(
name,
source,
NestingContext::Search,
&embedder.model,
&embedder.revision,
&embedder.pooling,
&embedder.dimensions,
&embedder.api_key,
&embedder.url,
&embedder.request,
&embedder.response,
&embedder.document_template,
&embedder.document_template_max_bytes,
&embedder.headers,
&search_embedder,
&indexing_embedder,
&embedder.binary_quantized,
&embedder.distribution,
)?;
} else {
return Err(UserError::MissingSourceForNested {
embedder_name: NestingContext::Search.embedder_name_with_context(name),
}
.into());
}
}
indexing_embedder = if let Setting::Set(mut embedder) = indexing_embedder {
embedder.document_template = validate_prompt(
name,
embedder.document_template,
embedder.document_template_max_bytes,
)?;
if let Some(source) = embedder.source.set() {
let search_embedder = match embedder.search_embedder.clone() {
Setting::Set(search_embedder) => Setting::Set(deserialize_sub_embedder(
search_embedder,
name,
NestingContext::Indexing,
)?),
Setting::Reset => Setting::Reset,
Setting::NotSet => Setting::NotSet,
};
let indexing_embedder = match embedder.indexing_embedder.clone() {
Setting::Set(indexing_embedder) => Setting::Set(deserialize_sub_embedder(
indexing_embedder,
name,
NestingContext::Indexing,
)?),
Setting::Reset => Setting::Reset,
Setting::NotSet => Setting::NotSet,
};
EmbeddingSettings::check_nested_source(name, source, NestingContext::Indexing)?;
EmbeddingSettings::check_settings(
name,
source,
NestingContext::Indexing,
&embedder.model,
&embedder.revision,
&embedder.pooling,
&embedder.dimensions,
&embedder.api_key,
&embedder.url,
&embedder.request,
&embedder.response,
&embedder.document_template,
&embedder.document_template_max_bytes,
&embedder.headers,
&search_embedder,
&indexing_embedder,
&embedder.binary_quantized,
&embedder.distribution,
)?;
} else {
return Err(UserError::MissingSourceForNested {
embedder_name: NestingContext::Indexing.embedder_name_with_context(name),
}
.into());
}
Setting::Set(embedder)
} else {
indexing_embedder
};
}
}
Ok(Setting::Set(EmbeddingSettings {
source,
model,
revision,
2025-02-18 17:12:23 +01:00
pooling,
api_key,
dimensions,
document_template,
2024-08-27 17:52:09 +02:00
document_template_max_bytes,
2024-03-25 10:05:38 +01:00
url,
2024-07-16 16:01:26 +02:00
request,
response,
2025-02-24 13:51:46 +01:00
search_embedder,
indexing_embedder,
2024-03-27 11:50:22 +01:00
distribution,
2024-07-22 12:04:05 +02:00
headers,
binary_quantized: binary_quantize,
}))
}
2025-02-24 13:51:46 +01:00
/// Deserializes the settings of a nested (search/indexing) embedder,
/// contextualizing any deserialization error with the nesting path so the
/// user sees which sub-embedder is invalid.
fn deserialize_sub_embedder(
    sub_embedder: serde_json::Value,
    embedder_name: &str,
    context: NestingContext,
) -> std::result::Result<SubEmbeddingSettings, UserError> {
    match deserr::deserialize::<_, _, deserr::errors::JsonError>(sub_embedder) {
        Ok(sub_embedder) => Ok(sub_embedder),
        Err(error) => {
            let message = format!("{error}{}", context.nesting_embedders());
            Err(UserError::InvalidSettingsEmbedder {
                embedder_name: context.embedder_name_with_context(embedder_name),
                message,
            })
        }
    }
}
2025-02-24 13:51:46 +01:00
#[cfg(test)]
#[path = "test_settings.rs"]
mod tests;