Merge branch 'main' into tmp-release-v1.7.4

This commit is contained in:
Louis Dureuil 2024-03-28 10:51:49 +01:00 committed by GitHub
commit 796213af9a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
68 changed files with 4625 additions and 1492 deletions

View file

@ -339,6 +339,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
prompt_reader: grenad::Reader<R>,
indexer: GrenadParameters,
embedder: Arc<Embedder>,
request_threads: &rayon::ThreadPool,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
@ -376,7 +377,10 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
if chunks.len() == chunks.capacity() {
let chunked_embeds = embedder
.embed_chunks(std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)))
.embed_chunks(
std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
request_threads,
)
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?;
@ -394,7 +398,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
// send last chunk
if !chunks.is_empty() {
let chunked_embeds = embedder
.embed_chunks(std::mem::take(&mut chunks))
.embed_chunks(std::mem::take(&mut chunks), request_threads)
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?;
for (docid, embeddings) in chunks_ids
@ -408,7 +412,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
if !current_chunk.is_empty() {
let embeds = embedder
.embed_chunks(vec![std::mem::take(&mut current_chunk)])
.embed_chunks(vec![std::mem::take(&mut current_chunk)], request_threads)
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?;

View file

@ -238,6 +238,12 @@ fn send_original_documents_data(
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let request_threads = rayon::ThreadPoolBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
rayon::spawn(move || {
for (name, (embedder, prompt)) in embedders {
let result = extract_vector_points(
@ -249,7 +255,12 @@ fn send_original_documents_data(
);
match result {
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
let embeddings = match extract_embeddings(prompts, indexer, embedder.clone()) {
let embeddings = match extract_embeddings(
prompts,
indexer,
embedder.clone(),
&request_threads,
) {
Ok(results) => Some(results),
Err(error) => {
let _ = lmdb_writer_sx_cloned.send(Err(error));

View file

@ -2646,6 +2646,12 @@ mod tests {
api_key: Setting::NotSet,
dimensions: Setting::Set(3),
document_template: Setting::NotSet,
url: Setting::NotSet,
query: Setting::NotSet,
input_field: Setting::NotSet,
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
}),
);
settings.set_embedder_settings(embedders);

View file

@ -14,12 +14,13 @@ use super::IndexerConfig;
use crate::criterion::Criterion;
use crate::error::UserError;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldsIdsMap, Index, OrderBy, Result};
use crate::{FieldsIdsMap, Index, Result};
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum Setting<T> {
@ -145,10 +146,11 @@ pub struct Settings<'a, 't, 'i> {
/// Attributes on which typo tolerance is disabled.
exact_attributes: Setting<HashSet<String>>,
max_values_per_facet: Setting<usize>,
sort_facet_values_by: Setting<HashMap<String, OrderBy>>,
sort_facet_values_by: Setting<OrderByMap>,
pagination_max_total_hits: Setting<usize>,
proximity_precision: Setting<ProximityPrecision>,
embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>,
search_cutoff: Setting<u64>,
}
impl<'a, 't, 'i> Settings<'a, 't, 'i> {
@ -182,6 +184,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
pagination_max_total_hits: Setting::NotSet,
proximity_precision: Setting::NotSet,
embedder_settings: Setting::NotSet,
search_cutoff: Setting::NotSet,
indexer_config,
}
}
@ -340,7 +343,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.max_values_per_facet = Setting::Reset;
}
pub fn set_sort_facet_values_by(&mut self, value: HashMap<String, OrderBy>) {
pub fn set_sort_facet_values_by(&mut self, value: OrderByMap) {
self.sort_facet_values_by = Setting::Set(value);
}
@ -372,6 +375,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.embedder_settings = Setting::Reset;
}
pub fn set_search_cutoff(&mut self, value: u64) {
self.search_cutoff = Setting::Set(value);
}
pub fn reset_search_cutoff(&mut self) {
self.search_cutoff = Setting::Reset;
}
#[tracing::instrument(
level = "trace"
skip(self, progress_callback, should_abort, old_fields_ids_map),
@ -1025,6 +1036,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(update)
}
fn update_search_cutoff(&mut self) -> Result<bool> {
let changed = match self.search_cutoff {
Setting::Set(new) => {
let old = self.index.search_cutoff(self.wtxn)?;
if old == Some(new) {
false
} else {
self.index.put_search_cutoff(self.wtxn, new)?;
true
}
}
Setting::Reset => self.index.delete_search_cutoff(self.wtxn)?,
Setting::NotSet => false,
};
Ok(changed)
}
pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
where
FP: Fn(UpdateIndexingStep) + Sync,
@ -1073,6 +1102,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
let embedding_configs_updated = self.update_embedding_configs()?;
// never trigger re-indexing
self.update_search_cutoff()?;
if stop_words_updated
|| non_separator_tokens_updated
|| separator_tokens_updated
@ -1131,6 +1163,12 @@ fn validate_prompt(
api_key,
dimensions,
document_template: Setting::Set(template),
url,
query,
input_field,
path_to_embeddings,
embedding_object,
input_type,
}) => {
// validate
let template = crate::prompt::Prompt::new(template)
@ -1144,6 +1182,12 @@ fn validate_prompt(
api_key,
dimensions,
document_template: Setting::Set(template),
url,
query,
input_field,
path_to_embeddings,
embedding_object,
input_type,
}))
}
new => Ok(new),
@ -1156,8 +1200,20 @@ pub fn validate_embedding_settings(
) -> Result<Setting<EmbeddingSettings>> {
let settings = validate_prompt(name, settings)?;
let Setting::Set(settings) = settings else { return Ok(settings) };
let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } =
settings;
let EmbeddingSettings {
source,
model,
revision,
api_key,
dimensions,
document_template,
url,
query,
input_field,
path_to_embeddings,
embedding_object,
input_type,
} = settings;
if let Some(0) = dimensions.set() {
return Err(crate::error::UserError::InvalidSettingsDimensions {
@ -1166,6 +1222,14 @@ pub fn validate_embedding_settings(
.into());
}
if let Some(url) = url.as_ref().set() {
url::Url::parse(url).map_err(|error| crate::error::UserError::InvalidUrl {
embedder_name: name.to_owned(),
inner_error: error,
url: url.to_owned(),
})?;
}
let Some(inferred_source) = source.set() else {
return Ok(Setting::Set(EmbeddingSettings {
source,
@ -1174,11 +1238,25 @@ pub fn validate_embedding_settings(
api_key,
dimensions,
document_template,
url,
query,
input_field,
path_to_embeddings,
embedding_object,
input_type,
}));
};
match inferred_source {
EmbedderSource::OpenAi => {
check_unset(&revision, "revision", inferred_source, name)?;
check_unset(&url, "url", inferred_source, name)?;
check_unset(&query, "query", inferred_source, name)?;
check_unset(&input_field, "inputField", inferred_source, name)?;
check_unset(&path_to_embeddings, "pathToEmbeddings", inferred_source, name)?;
check_unset(&embedding_object, "embeddingObject", inferred_source, name)?;
check_unset(&input_type, "inputType", inferred_source, name)?;
if let Setting::Set(model) = &model {
let model = crate::vector::openai::EmbeddingModel::from_name(model.as_str())
.ok_or(crate::error::UserError::InvalidOpenAiModel {
@ -1209,9 +1287,30 @@ pub fn validate_embedding_settings(
}
}
}
EmbedderSource::Ollama => {
// Dimensions get inferred, only model name is required
check_unset(&dimensions, "dimensions", inferred_source, name)?;
check_set(&model, "model", inferred_source, name)?;
check_unset(&api_key, "apiKey", inferred_source, name)?;
check_unset(&revision, "revision", inferred_source, name)?;
check_unset(&url, "url", inferred_source, name)?;
check_unset(&query, "query", inferred_source, name)?;
check_unset(&input_field, "inputField", inferred_source, name)?;
check_unset(&path_to_embeddings, "pathToEmbeddings", inferred_source, name)?;
check_unset(&embedding_object, "embeddingObject", inferred_source, name)?;
check_unset(&input_type, "inputType", inferred_source, name)?;
}
EmbedderSource::HuggingFace => {
check_unset(&api_key, "apiKey", inferred_source, name)?;
check_unset(&dimensions, "dimensions", inferred_source, name)?;
check_unset(&url, "url", inferred_source, name)?;
check_unset(&query, "query", inferred_source, name)?;
check_unset(&input_field, "inputField", inferred_source, name)?;
check_unset(&path_to_embeddings, "pathToEmbeddings", inferred_source, name)?;
check_unset(&embedding_object, "embeddingObject", inferred_source, name)?;
check_unset(&input_type, "inputType", inferred_source, name)?;
}
EmbedderSource::UserProvided => {
check_unset(&model, "model", inferred_source, name)?;
@ -1219,6 +1318,18 @@ pub fn validate_embedding_settings(
check_unset(&api_key, "apiKey", inferred_source, name)?;
check_unset(&document_template, "documentTemplate", inferred_source, name)?;
check_set(&dimensions, "dimensions", inferred_source, name)?;
check_unset(&url, "url", inferred_source, name)?;
check_unset(&query, "query", inferred_source, name)?;
check_unset(&input_field, "inputField", inferred_source, name)?;
check_unset(&path_to_embeddings, "pathToEmbeddings", inferred_source, name)?;
check_unset(&embedding_object, "embeddingObject", inferred_source, name)?;
check_unset(&input_type, "inputType", inferred_source, name)?;
}
EmbedderSource::Rest => {
check_unset(&model, "model", inferred_source, name)?;
check_unset(&revision, "revision", inferred_source, name)?;
check_set(&url, "url", inferred_source, name)?;
}
}
Ok(Setting::Set(EmbeddingSettings {
@ -1228,6 +1339,12 @@ pub fn validate_embedding_settings(
api_key,
dimensions,
document_template,
url,
query,
input_field,
path_to_embeddings,
embedding_object,
input_type,
}))
}
@ -2050,6 +2167,7 @@ mod tests {
pagination_max_total_hits,
proximity_precision,
embedder_settings,
search_cutoff,
} = settings;
assert!(matches!(searchable_fields, Setting::NotSet));
assert!(matches!(displayed_fields, Setting::NotSet));
@ -2073,6 +2191,7 @@ mod tests {
assert!(matches!(pagination_max_total_hits, Setting::NotSet));
assert!(matches!(proximity_precision, Setting::NotSet));
assert!(matches!(embedder_settings, Setting::NotSet));
assert!(matches!(search_cutoff, Setting::NotSet));
})
.unwrap();
}