Mirror of https://github.com/meilisearch/MeiliSearch
Synced 2025-07-04 20:37:15 +02:00
Small commit to add hybrid search and autoembedding
parent 21bcf32109
commit 13c2c6c16b
42 changed files with 4045 additions and 246 deletions
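A minimal sketch (not from the diff) of how the hybrid search API added here might be used from within the crate, assuming an open `Index`, a precomputed `query_embedding: Vec<f32>`, and that the existing `query` and `limit` setters behave as before:

// hypothetical in-crate usage of the hybrid search introduced by this commit
let rtxn = index.read_txn()?;
let mut search = Search::new(&rtxn, &index);
search.query("shazam superhero");   // keyword side (existing setter)
search.vector(query_embedding);     // Vec<f32> semantic side, added by this commit
search.limit(20);
let SearchResult { documents_ids, document_scores, .. } = search.execute_hybrid()?;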
@ -180,6 +180,14 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
UnknownInternalDocumentId { document_id: DocumentId },
#[error("`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater than or equal to `oneTypo`, but found `oneTypo: {0}` and `twoTypos: {1}`.")]
InvalidMinTypoWordLenSetting(u8, u8),
#[error(transparent)]
VectorEmbeddingError(#[from] crate::vector::Error),
#[error(transparent)]
MissingDocumentField(#[from] crate::prompt::error::RenderPromptError),
#[error(transparent)]
InvalidPrompt(#[from] crate::prompt::error::NewPromptError),
#[error("Invalid prompt for embeddings with name '{0}': {1}")]
InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError),
}

#[derive(Error, Debug)]

@ -336,6 +344,26 @@ impl From<HeedError> for Error {
}
}

#[derive(Debug, Clone, Copy)]
pub enum FaultSource {
User,
Runtime,
Bug,
Undecided,
}

impl std::fmt::Display for FaultSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s = match self {
FaultSource::User => "user error",
FaultSource::Runtime => "runtime error",
FaultSource::Bug => "coding error",
FaultSource::Undecided => "error",
};
f.write_str(s)
}
}
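A quick illustration (not from the diff) of the Display impl above, usable from within the crate:

// hypothetical: FaultSource renders as a short human-readable label via Display
assert_eq!(FaultSource::Bug.to_string(), "coding error");
assert_eq!(FaultSource::User.to_string(), "user error");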

#[test]
fn conditionally_lookup_for_error_message() {
let prefix = "Attribute `name` is not sortable.";

@ -23,6 +23,7 @@ use crate::heed_codec::{
};
use crate::proximity::ProximityPrecision;
use crate::readable_slices::ReadableSlices;
use crate::vector::EmbeddingConfig;
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,

@ -74,6 +75,7 @@ pub mod main_key {
pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by";
pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
pub const PROXIMITY_PRECISION: &str = "proximity-precision";
pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
}

pub mod db_name {

@ -1528,6 +1530,33 @@ impl Index {

Ok(script_language)
}

pub(crate) fn put_embedding_configs(
&self,
wtxn: &mut RwTxn<'_>,
configs: Vec<(String, EmbeddingConfig)>,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put(
wtxn,
main_key::EMBEDDING_CONFIGS,
&configs,
)
}

pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS)
}

pub fn embedding_configs(
&self,
rtxn: &RoTxn<'_>,
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> {
Ok(self
.main
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>()
.get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default())
}
}

#[cfg(test)]

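A minimal sketch (not from the diff) of how the new accessors above could be used from inside the crate; `write_txn`/`read_txn` are existing `Index` helpers, and the `EmbeddingConfig::default()` call is an assumption made for the example:

// hypothetical in-crate usage of the new embedding-config accessors
let mut wtxn = index.write_txn()?;
index.put_embedding_configs(
    &mut wtxn,
    vec![("default".to_string(), EmbeddingConfig::default())], // assumes a Default impl
)?;
wtxn.commit()?;

let rtxn = index.read_txn()?;
let configs = index.embedding_configs(&rtxn)?; // Vec<(String, EmbeddingConfig)>
assert_eq!(configs.len(), 1);

let mut wtxn = index.write_txn()?;
index.delete_embedding_configs(&mut wtxn)?;
wtxn.commit()?;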
@ -17,11 +17,13 @@ pub mod facet;
mod fields_ids_map;
pub mod heed_codec;
pub mod index;
pub mod prompt;
pub mod proximity;
mod readable_slices;
pub mod score_details;
mod search;
pub mod update;
pub mod vector;

#[cfg(test)]
#[macro_use]

@ -37,8 +39,8 @@ pub use filter_parser::{Condition, FilterCondition, Span, Token};
use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType;
pub use search::new::{
execute_search, DefaultSearchLogger, GeoSortStrategy, SearchContext, SearchLogger,
VisualSearchLogger,
execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, SearchContext,
SearchLogger, VisualSearchLogger,
};
use serde_json::Value;
pub use {charabia as tokenizer, heed};

@ -60,7 +62,7 @@ pub use self::index::Index;
pub use self::search::{
FacetDistribution, FacetValueHit, Filter, FormatOptions, MatchBounds, MatcherBuilder,
MatchingWords, OrderBy, Search, SearchForFacetValues, SearchResult, TermsMatchingStrategy,
DEFAULT_VALUES_PER_FACET,
VectorQuery, DEFAULT_VALUES_PER_FACET,
};

pub type Result<T> = std::result::Result<T, error::Error>;

97 milli/src/prompt/context.rs Normal file
@ -0,0 +1,97 @@
|
|||
use liquid::model::{
|
||||
ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
|
||||
};
|
||||
use liquid::{ObjectView, ValueView};
|
||||
|
||||
use super::document::Document;
|
||||
use super::fields::Fields;
|
||||
use crate::FieldsIdsMap;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Context<'a> {
|
||||
document: &'a Document<'a>,
|
||||
fields: Fields<'a>,
|
||||
}
|
||||
|
||||
impl<'a> Context<'a> {
|
||||
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self {
|
||||
Self { document, fields: Fields::new(document, field_id_map) }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ObjectView for Context<'a> {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
2
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(
|
||||
std::iter::once(self.document.as_value())
|
||||
.chain(std::iter::once(self.fields.as_value())),
|
||||
)
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(self.keys().zip(self.values()))
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: &str) -> bool {
|
||||
index == "doc" || index == "fields"
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
||||
match index {
|
||||
"doc" => Some(self.document.as_value()),
|
||||
"fields" => Some(self.fields.as_value()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ValueView for Context<'a> {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
|
||||
}
|
||||
|
||||
fn source(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: liquid::model::State) -> bool {
|
||||
match state {
|
||||
State::Truthy => true,
|
||||
State::DefaultValue | State::Empty | State::Blank => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
|
||||
let s = ObjectRender::new(self).to_string();
|
||||
KStringCow::from_string(s)
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Object(
|
||||
self.iter().map(|(k, x)| (k.to_string().into(), x.to_value())).collect(),
|
||||
)
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
131 milli/src/prompt/document.rs Normal file
@ -0,0 +1,131 @@
|
|||
use std::cell::OnceCell;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use liquid::model::{
|
||||
DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
|
||||
};
|
||||
use liquid::{ObjectView, ValueView};
|
||||
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::FieldsIdsMap;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Document<'a>(BTreeMap<&'a str, (&'a [u8], ParsedValue)>);
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ParsedValue(std::cell::OnceCell<LiquidValue>);
|
||||
|
||||
impl ParsedValue {
|
||||
fn empty() -> ParsedValue {
|
||||
ParsedValue(OnceCell::new())
|
||||
}
|
||||
|
||||
fn get(&self, raw: &[u8]) -> &LiquidValue {
|
||||
self.0.get_or_init(|| {
|
||||
let value: serde_json::Value = serde_json::from_slice(raw).unwrap();
|
||||
liquid::model::to_value(&value).unwrap()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Document<'a> {
|
||||
pub fn new(
|
||||
data: obkv::KvReaderU16<'a>,
|
||||
side: DelAdd,
|
||||
inverted_field_map: &'a FieldsIdsMap,
|
||||
) -> Self {
|
||||
let mut out_data = BTreeMap::new();
|
||||
for (fid, raw) in data {
|
||||
let obkv = KvReaderDelAdd::new(raw);
|
||||
let Some(raw) = obkv.get(side) else {
|
||||
continue;
|
||||
};
|
||||
let Some(name) = inverted_field_map.name(fid) else {
|
||||
continue;
|
||||
};
|
||||
out_data.insert(name, (raw, ParsedValue::empty()));
|
||||
}
|
||||
Self(out_data)
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.0.len()
|
||||
}
|
||||
|
||||
fn iter(&self) -> impl Iterator<Item = (KString, LiquidValue)> + '_ {
|
||||
self.0.iter().map(|(&k, (raw, data))| (k.to_owned().into(), data.get(raw).to_owned()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ObjectView for Document<'a> {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
self.len() as i64
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
let keys = BTreeMap::keys(&self.0).map(|&s| s.into());
|
||||
Box::new(keys)
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(self.0.values().map(|(raw, v)| v.get(raw) as &dyn ValueView))
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(self.0.iter().map(|(&k, (raw, data))| (k.into(), data.get(raw) as &dyn ValueView)))
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: &str) -> bool {
|
||||
self.0.contains_key(index)
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
||||
self.0.get(index).map(|(raw, v)| v.get(raw) as &dyn ValueView)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ValueView for Document<'a> {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
|
||||
}
|
||||
|
||||
fn source(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: liquid::model::State) -> bool {
|
||||
match state {
|
||||
State::Truthy => true,
|
||||
State::DefaultValue | State::Empty | State::Blank => self.is_empty(),
|
||||
}
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
|
||||
let s = ObjectRender::new(self).to_string();
|
||||
KStringCow::from_string(s)
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Object(self.iter().collect())
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
56 milli/src/prompt/error.rs Normal file
@ -0,0 +1,56 @@
|
|||
use crate::error::FaultSource;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("{fault}: {kind}")]
|
||||
pub struct NewPromptError {
|
||||
pub kind: NewPromptErrorKind,
|
||||
pub fault: FaultSource,
|
||||
}
|
||||
|
||||
impl From<NewPromptError> for crate::Error {
|
||||
fn from(value: NewPromptError) -> Self {
|
||||
crate::Error::UserError(crate::UserError::InvalidPrompt(value))
|
||||
}
|
||||
}
|
||||
|
||||
impl NewPromptError {
|
||||
pub(crate) fn cannot_parse_template(inner: liquid::Error) -> NewPromptError {
|
||||
Self { kind: NewPromptErrorKind::CannotParseTemplate(inner), fault: FaultSource::User }
|
||||
}
|
||||
|
||||
pub(crate) fn invalid_fields_in_template(inner: liquid::Error) -> NewPromptError {
|
||||
Self { kind: NewPromptErrorKind::InvalidFieldsInTemplate(inner), fault: FaultSource::User }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum NewPromptErrorKind {
|
||||
#[error("cannot parse template: {0}")]
|
||||
CannotParseTemplate(liquid::Error),
|
||||
#[error("template contains invalid fields: {0}. Only `doc.*`, `fields[i].name`, `fields[i].value` are supported")]
|
||||
InvalidFieldsInTemplate(liquid::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("{fault}: {kind}")]
|
||||
pub struct RenderPromptError {
|
||||
pub kind: RenderPromptErrorKind,
|
||||
pub fault: FaultSource,
|
||||
}
|
||||
impl RenderPromptError {
|
||||
pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError {
|
||||
Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RenderPromptErrorKind {
|
||||
#[error("missing field in document: {0}")]
|
||||
MissingContext(liquid::Error),
|
||||
}
|
||||
|
||||
impl From<RenderPromptError> for crate::Error {
|
||||
fn from(value: RenderPromptError) -> Self {
|
||||
crate::Error::UserError(crate::UserError::MissingDocumentField(value))
|
||||
}
|
||||
}
|
172 milli/src/prompt/fields.rs Normal file
@ -0,0 +1,172 @@
|
|||
use liquid::model::{
|
||||
ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
|
||||
};
|
||||
use liquid::{ObjectView, ValueView};
|
||||
|
||||
use super::document::Document;
|
||||
use crate::FieldsIdsMap;
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Fields<'a>(Vec<FieldValue<'a>>);
|
||||
|
||||
impl<'a> Fields<'a> {
|
||||
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self {
|
||||
Self(
|
||||
std::iter::repeat(document)
|
||||
.zip(field_id_map.iter())
|
||||
.map(|(document, (_fid, name))| FieldValue { document, name })
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct FieldValue<'a> {
|
||||
name: &'a str,
|
||||
document: &'a Document<'a>,
|
||||
}
|
||||
|
||||
impl<'a> ValueView for FieldValue<'a> {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
|
||||
}
|
||||
|
||||
fn source(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: liquid::model::State) -> bool {
|
||||
match state {
|
||||
State::Truthy => true,
|
||||
State::DefaultValue | State::Empty | State::Blank => self.is_empty(),
|
||||
}
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
|
||||
let s = ObjectRender::new(self).to_string();
|
||||
KStringCow::from_string(s)
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Object(
|
||||
self.iter().map(|(k, v)| (k.to_string().into(), v.to_value())).collect(),
|
||||
)
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FieldValue<'a> {
|
||||
pub fn name(&self) -> &&'a str {
|
||||
&self.name
|
||||
}
|
||||
|
||||
pub fn value(&self) -> &dyn ValueView {
|
||||
self.document.get(self.name).unwrap_or(&LiquidValue::Nil)
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.size() == 0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ObjectView for FieldValue<'a> {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
2
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
Box::new(["name", "value"].iter().map(|&x| KStringCow::from_static(x)))
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(
|
||||
std::iter::once(self.name() as &dyn ValueView).chain(std::iter::once(self.value())),
|
||||
)
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(self.keys().zip(self.values()))
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: &str) -> bool {
|
||||
index == "name" || index == "value"
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
||||
match index {
|
||||
"name" => Some(self.name()),
|
||||
"value" => Some(self.value()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ArrayView for Fields<'a> {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self.0.as_value()
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
self.0.len() as i64
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
self.0.values()
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: i64) -> bool {
|
||||
self.0.contains_key(index)
|
||||
}
|
||||
|
||||
fn get(&self, index: i64) -> Option<&dyn ValueView> {
|
||||
ArrayView::get(&self.0, index)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ValueView for Fields<'a> {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> liquid::model::DisplayCow<'_> {
|
||||
self.0.render()
|
||||
}
|
||||
|
||||
fn source(&self) -> liquid::model::DisplayCow<'_> {
|
||||
self.0.source()
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
self.0.type_name()
|
||||
}
|
||||
|
||||
fn query_state(&self, state: liquid::model::State) -> bool {
|
||||
self.0.query_state(state)
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
|
||||
self.0.to_kstr()
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
self.0.to_value()
|
||||
}
|
||||
|
||||
fn as_array(&self) -> Option<&dyn ArrayView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
144 milli/src/prompt/mod.rs Normal file
@ -0,0 +1,144 @@
|
|||
mod context;
|
||||
mod document;
|
||||
pub(crate) mod error;
|
||||
mod fields;
|
||||
mod template_checker;
|
||||
|
||||
use std::convert::TryFrom;
|
||||
|
||||
use error::{NewPromptError, RenderPromptError};
|
||||
|
||||
use self::context::Context;
|
||||
use self::document::Document;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::FieldsIdsMap;
|
||||
|
||||
pub struct Prompt {
|
||||
template: liquid::Template,
|
||||
template_text: String,
|
||||
strategy: PromptFallbackStrategy,
|
||||
fallback: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct PromptData {
|
||||
pub template: String,
|
||||
pub strategy: PromptFallbackStrategy,
|
||||
pub fallback: String,
|
||||
}
|
||||
|
||||
impl From<Prompt> for PromptData {
|
||||
fn from(value: Prompt) -> Self {
|
||||
Self { template: value.template_text, strategy: value.strategy, fallback: value.fallback }
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<PromptData> for Prompt {
|
||||
type Error = NewPromptError;
|
||||
|
||||
fn try_from(value: PromptData) -> Result<Self, Self::Error> {
|
||||
Prompt::new(value.template, Some(value.strategy), Some(value.fallback))
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for Prompt {
|
||||
fn clone(&self) -> Self {
|
||||
let template_text = self.template_text.clone();
|
||||
Self {
|
||||
template: new_template(&template_text).unwrap(),
|
||||
template_text,
|
||||
strategy: self.strategy,
|
||||
fallback: self.fallback.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn new_template(text: &str) -> Result<liquid::Template, liquid::Error> {
|
||||
liquid::ParserBuilder::with_stdlib().build().unwrap().parse(text)
|
||||
}
|
||||
|
||||
fn default_template() -> liquid::Template {
|
||||
new_template(default_template_text()).unwrap()
|
||||
}
|
||||
|
||||
fn default_template_text() -> &'static str {
|
||||
"{% for field in fields %} \
|
||||
{{ field.name }}: {{ field.value }}\n\
|
||||
{% endfor %}"
|
||||
}
|
||||
|
||||
fn default_fallback() -> &'static str {
|
||||
"<MISSING>"
|
||||
}
|
||||
|
||||
impl Default for Prompt {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
template: default_template(),
|
||||
template_text: default_template_text().into(),
|
||||
strategy: Default::default(),
|
||||
fallback: default_fallback().into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for PromptData {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
template: default_template_text().into(),
|
||||
strategy: Default::default(),
|
||||
fallback: default_fallback().into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Prompt {
|
||||
pub fn new(
|
||||
template: String,
|
||||
strategy: Option<PromptFallbackStrategy>,
|
||||
fallback: Option<String>,
|
||||
) -> Result<Self, NewPromptError> {
|
||||
let this = Self {
|
||||
template: liquid::ParserBuilder::with_stdlib()
|
||||
.build()
|
||||
.unwrap()
|
||||
.parse(&template)
|
||||
.map_err(NewPromptError::cannot_parse_template)?,
|
||||
template_text: template,
|
||||
strategy: strategy.unwrap_or_default(),
|
||||
fallback: fallback.unwrap_or_default(),
|
||||
};
|
||||
|
||||
// render the template against a dummy object that accepts any `doc.*` and `fields.*` access
// FIXME: doesn't work for nested objects, e.g. `doc.a.b`
|
||||
this.template
|
||||
.render(&template_checker::TemplateChecker)
|
||||
.map_err(NewPromptError::invalid_fields_in_template)?;
|
||||
|
||||
Ok(this)
|
||||
}
|
||||
|
||||
pub fn render(
|
||||
&self,
|
||||
document: obkv::KvReaderU16<'_>,
|
||||
side: DelAdd,
|
||||
field_id_map: &FieldsIdsMap,
|
||||
) -> Result<String, RenderPromptError> {
|
||||
let document = Document::new(document, side, field_id_map);
|
||||
let context = Context::new(&document, field_id_map);
|
||||
|
||||
self.template.render(&context).map_err(RenderPromptError::missing_context)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Debug, Default, Clone, PartialEq, Eq, Copy, serde::Serialize, serde::Deserialize, deserr::Deserr,
|
||||
)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(rename_all = camelCase, deny_unknown_fields)]
|
||||
pub enum PromptFallbackStrategy {
|
||||
Fallback,
|
||||
Skip,
|
||||
#[default]
|
||||
Error,
|
||||
}
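An illustration (not from the diff) of what the default prompt template above produces. This self-contained sketch drives the `liquid` crate directly, with a plain `liquid::object!` standing in for the crate's `Document`/`Fields` views:

use liquid::ParserBuilder;

fn main() -> Result<(), liquid::Error> {
    // the default template text from Prompt::default()
    let template = ParserBuilder::with_stdlib()
        .build()?
        .parse("{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}")?;

    // stand-in for the Fields view built from a document
    let globals = liquid::object!({
        "fields": [
            { "name": "title", "value": "Shazam!" },
            { "name": "overview", "value": "A boy can turn into an adult superhero." },
        ]
    });

    // prints " title: Shazam!\n overview: A boy can turn into an adult superhero.\n"
    print!("{}", template.render(&globals)?);
    Ok(())
}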
|
282 milli/src/prompt/template_checker.rs Normal file
@ -0,0 +1,282 @@
|
|||
use liquid::model::{
|
||||
ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
|
||||
};
|
||||
use liquid::{ObjectView, ValueView};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct TemplateChecker;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DummyDoc;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DummyFields;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DummyField;
|
||||
|
||||
const DUMMY_VALUE: &LiquidValue = &LiquidValue::Nil;
|
||||
|
||||
impl ObjectView for DummyField {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
2
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
Box::new(["name", "value"].iter().map(|s| KStringCow::from_static(s)))
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: &str) -> bool {
|
||||
index == "name" || index == "value"
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
||||
if self.contains_key(index) {
|
||||
Some(DUMMY_VALUE.as_view())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueView for DummyField {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.render()
|
||||
}
|
||||
|
||||
fn source(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.source()
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: State) -> bool {
|
||||
DUMMY_VALUE.query_state(state)
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> KStringCow<'_> {
|
||||
DUMMY_VALUE.to_kstr()
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Nil
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueView for DummyFields {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.render()
|
||||
}
|
||||
|
||||
fn source(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.source()
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"array"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: State) -> bool {
|
||||
DUMMY_VALUE.query_state(state)
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> KStringCow<'_> {
|
||||
DUMMY_VALUE.to_kstr()
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Nil
|
||||
}
|
||||
|
||||
fn as_array(&self) -> Option<&dyn ArrayView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl ArrayView for DummyFields {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
i64::MAX
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn contains_key(&self, _index: i64) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn get(&self, _index: i64) -> Option<&dyn ValueView> {
|
||||
Some(DummyField.as_value())
|
||||
}
|
||||
}
|
||||
|
||||
impl ObjectView for DummyDoc {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
1000
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn contains_key(&self, _index: &str) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, _index: &str) -> Option<&'s dyn ValueView> {
|
||||
Some(DUMMY_VALUE.as_view())
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueView for DummyDoc {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.render()
|
||||
}
|
||||
|
||||
fn source(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.source()
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: State) -> bool {
|
||||
DUMMY_VALUE.query_state(state)
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> KStringCow<'_> {
|
||||
DUMMY_VALUE.to_kstr()
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Nil
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl ObjectView for TemplateChecker {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
2
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(
|
||||
std::iter::once(DummyDoc.as_value()).chain(std::iter::once(DummyFields.as_value())),
|
||||
)
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(self.keys().zip(self.values()))
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: &str) -> bool {
|
||||
index == "doc" || index == "fields"
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
||||
match index {
|
||||
"doc" => Some(DummyDoc.as_value()),
|
||||
"fields" => Some(DummyFields.as_value()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueView for TemplateChecker {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
|
||||
}
|
||||
|
||||
fn source(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: liquid::model::State) -> bool {
|
||||
match state {
|
||||
State::Truthy => true,
|
||||
State::DefaultValue | State::Empty | State::Blank => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
|
||||
let s = ObjectRender::new(self).to_string();
|
||||
KStringCow::from_string(s)
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Object(
|
||||
self.iter().map(|(k, x)| (k.to_string().into(), x.to_value())).collect(),
|
||||
)
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
|
@ -1,3 +1,6 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use itertools::Itertools;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::distance_between_two_points;
|
||||
|
@ -12,9 +15,24 @@ pub enum ScoreDetails {
|
|||
ExactAttribute(ExactAttribute),
|
||||
ExactWords(ExactWords),
|
||||
Sort(Sort),
|
||||
Vector(Vector),
|
||||
GeoSort(GeoSort),
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ScoreValue<'a> {
|
||||
Score(f64),
|
||||
Sort(&'a Sort),
|
||||
GeoSort(&'a GeoSort),
|
||||
}
|
||||
|
||||
enum RankOrValue<'a> {
|
||||
Rank(Rank),
|
||||
Sort(&'a Sort),
|
||||
GeoSort(&'a GeoSort),
|
||||
Score(f64),
|
||||
}
|
||||
|
||||
impl ScoreDetails {
|
||||
pub fn local_score(&self) -> Option<f64> {
|
||||
self.rank().map(Rank::local_score)
|
||||
|
@ -31,11 +49,55 @@ impl ScoreDetails {
|
|||
ScoreDetails::ExactWords(details) => Some(details.rank()),
|
||||
ScoreDetails::Sort(_) => None,
|
||||
ScoreDetails::GeoSort(_) => None,
|
||||
ScoreDetails::Vector(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn global_score<'a>(details: impl Iterator<Item = &'a Self>) -> f64 {
|
||||
Rank::global_score(details.filter_map(Self::rank))
|
||||
pub fn global_score<'a>(details: impl Iterator<Item = &'a Self> + 'a) -> f64 {
|
||||
Self::score_values(details)
|
||||
.find_map(|x| {
|
||||
let ScoreValue::Score(score) = x else {
|
||||
return None;
|
||||
};
|
||||
Some(score)
|
||||
})
|
||||
.unwrap_or(1.0f64)
|
||||
}
|
||||
|
||||
pub fn score_values<'a>(
|
||||
details: impl Iterator<Item = &'a Self> + 'a,
|
||||
) -> impl Iterator<Item = ScoreValue<'a>> + 'a {
|
||||
details
|
||||
.map(ScoreDetails::rank_or_value)
|
||||
.coalesce(|left, right| match (left, right) {
|
||||
(RankOrValue::Rank(left), RankOrValue::Rank(right)) => {
|
||||
Ok(RankOrValue::Rank(Rank::merge(left, right)))
|
||||
}
|
||||
(left, right) => Err((left, right)),
|
||||
})
|
||||
.map(|rank_or_value| match rank_or_value {
|
||||
RankOrValue::Rank(r) => ScoreValue::Score(r.local_score()),
|
||||
RankOrValue::Sort(s) => ScoreValue::Sort(s),
|
||||
RankOrValue::GeoSort(g) => ScoreValue::GeoSort(g),
|
||||
RankOrValue::Score(s) => ScoreValue::Score(s),
|
||||
})
|
||||
}
|
||||
|
||||
fn rank_or_value(&self) -> RankOrValue<'_> {
|
||||
match self {
|
||||
ScoreDetails::Words(w) => RankOrValue::Rank(w.rank()),
|
||||
ScoreDetails::Typo(t) => RankOrValue::Rank(t.rank()),
|
||||
ScoreDetails::Proximity(p) => RankOrValue::Rank(*p),
|
||||
ScoreDetails::Fid(f) => RankOrValue::Rank(*f),
|
||||
ScoreDetails::Position(p) => RankOrValue::Rank(*p),
|
||||
ScoreDetails::ExactAttribute(e) => RankOrValue::Rank(e.rank()),
|
||||
ScoreDetails::ExactWords(e) => RankOrValue::Rank(e.rank()),
|
||||
ScoreDetails::Sort(sort) => RankOrValue::Sort(sort),
|
||||
ScoreDetails::GeoSort(geosort) => RankOrValue::GeoSort(geosort),
|
||||
ScoreDetails::Vector(vector) => RankOrValue::Score(
|
||||
vector.value_similarity.as_ref().map(|(_, s)| *s as f64).unwrap_or(0.0f64),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Panics
|
||||
|
@ -181,6 +243,19 @@ impl ScoreDetails {
|
|||
details_map.insert(sort, sort_details);
|
||||
order += 1;
|
||||
}
|
||||
ScoreDetails::Vector(s) => {
|
||||
let vector = format!("vectorSort({:?})", s.target_vector);
|
||||
let value = s.value_similarity.as_ref().map(|(v, _)| v);
|
||||
let similarity = s.value_similarity.as_ref().map(|(_, s)| s);
|
||||
|
||||
let details = serde_json::json!({
|
||||
"order": order,
|
||||
"value": value,
|
||||
"similarity": similarity,
|
||||
});
|
||||
details_map.insert(vector, details);
|
||||
order += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
details_map
|
||||
|
@ -297,15 +372,21 @@ impl Rank {
|
|||
pub fn global_score(details: impl Iterator<Item = Self>) -> f64 {
|
||||
let mut rank = Rank { rank: 1, max_rank: 1 };
|
||||
for inner_rank in details {
|
||||
rank.rank -= 1;
|
||||
|
||||
rank.rank *= inner_rank.max_rank;
|
||||
rank.max_rank *= inner_rank.max_rank;
|
||||
|
||||
rank.rank += inner_rank.rank;
|
||||
rank = Rank::merge(rank, inner_rank);
|
||||
}
|
||||
rank.local_score()
|
||||
}
|
||||
|
||||
pub fn merge(mut outer: Rank, inner: Rank) -> Rank {
|
||||
outer.rank = outer.rank.saturating_sub(1);
|
||||
|
||||
outer.rank *= inner.max_rank;
|
||||
outer.max_rank *= inner.max_rank;
|
||||
|
||||
outer.rank += inner.rank;
|
||||
|
||||
outer
|
||||
}
|
||||
}
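A worked example (not from the diff) of `Rank::merge` above: merging an outer rank of 2/3 with an inner rank of 1/4 gives ((2 - 1) * 4 + 1) / (3 * 4) = 5/12, so the combined local score is 5/12 ≈ 0.42. The struct-literal construction assumes the `Rank` fields are visible at the call site, as they are inside this module:

// hypothetical in-module check of Rank::merge
let outer = Rank { rank: 2, max_rank: 3 };
let inner = Rank { rank: 1, max_rank: 4 };
let merged = Rank::merge(outer, inner);
assert_eq!((merged.rank, merged.max_rank), (5, 12)); // (2-1)*4 + 1 = 5, 3*4 = 12
// local_score() then reports 5/12 for the merged rank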
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
|
||||
|
@ -335,13 +416,78 @@ pub struct Sort {
|
|||
pub value: serde_json::Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
|
||||
impl PartialOrd for Sort {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
if self.field_name != other.field_name {
|
||||
return None;
|
||||
}
|
||||
if self.ascending != other.ascending {
|
||||
return None;
|
||||
}
|
||||
match (&self.value, &other.value) {
|
||||
(serde_json::Value::Null, serde_json::Value::Null) => Some(Ordering::Equal),
|
||||
(serde_json::Value::Null, _) => Some(Ordering::Less),
|
||||
(_, serde_json::Value::Null) => Some(Ordering::Greater),
|
||||
// numbers are always before strings
|
||||
(serde_json::Value::Number(_), serde_json::Value::String(_)) => Some(Ordering::Greater),
|
||||
(serde_json::Value::String(_), serde_json::Value::Number(_)) => Some(Ordering::Less),
|
||||
(serde_json::Value::Number(left), serde_json::Value::Number(right)) => {
|
||||
// FIXME: unwrap permitted here?
|
||||
let order = left.as_f64().unwrap().partial_cmp(&right.as_f64().unwrap())?;
|
||||
// 12 < 42, and when ascending, we want to see 12 first, so the smallest.
|
||||
// Hence, when ascending, smaller is better
|
||||
Some(if self.ascending { order.reverse() } else { order })
|
||||
}
|
||||
(serde_json::Value::String(left), serde_json::Value::String(right)) => {
|
||||
let order = left.cmp(right);
|
||||
// Taking e.g. "a" and "z"
|
||||
// "a" < "z", and when ascending, we want to see "a" first, so the smallest.
|
||||
// Hence, when ascending, smaller is better
|
||||
Some(if self.ascending { order.reverse() } else { order })
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
pub struct GeoSort {
|
||||
pub target_point: [f64; 2],
|
||||
pub ascending: bool,
|
||||
pub value: Option<[f64; 2]>,
|
||||
}
|
||||
|
||||
impl PartialOrd for GeoSort {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
if self.target_point != other.target_point {
|
||||
return None;
|
||||
}
|
||||
if self.ascending != other.ascending {
|
||||
return None;
|
||||
}
|
||||
Some(match (self.distance(), other.distance()) {
|
||||
(None, None) => Ordering::Equal,
|
||||
(None, Some(_)) => Ordering::Less,
|
||||
(Some(_), None) => Ordering::Greater,
|
||||
(Some(left), Some(right)) => {
|
||||
let order = left.partial_cmp(&right)?;
|
||||
if self.ascending {
|
||||
// when ascending, the one with the smallest distance has the best score
|
||||
order.reverse()
|
||||
} else {
|
||||
order
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, PartialOrd)]
|
||||
pub struct Vector {
|
||||
pub target_vector: Vec<f32>,
|
||||
pub value_similarity: Option<(Vec<f32>, f32)>,
|
||||
}
|
||||
|
||||
impl GeoSort {
|
||||
pub fn distance(&self) -> Option<f64> {
|
||||
self.value.map(|value| distance_between_two_points(&self.target_point, &value))
|
||||
|
|
336 milli/src/search/hybrid.rs Normal file
@ -0,0 +1,336 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use itertools::Itertools;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::new::{execute_vector_search, PartialSearchResult};
|
||||
use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
|
||||
use crate::{
|
||||
execute_search, DefaultSearchLogger, MatchingWords, Result, Search, SearchContext, SearchResult,
|
||||
};
|
||||
|
||||
struct CombinedSearchResult {
|
||||
matching_words: MatchingWords,
|
||||
candidates: RoaringBitmap,
|
||||
document_scores: Vec<(u32, CombinedScore)>,
|
||||
}
|
||||
|
||||
type CombinedScore = (Vec<ScoreDetails>, Option<Vec<ScoreDetails>>);
|
||||
|
||||
fn compare_scores(left: &CombinedScore, right: &CombinedScore) -> Ordering {
|
||||
let mut left_main_it = ScoreDetails::score_values(left.0.iter());
|
||||
let mut left_sub_it =
|
||||
ScoreDetails::score_values(left.1.as_ref().map(|x| x.iter()).into_iter().flatten());
|
||||
|
||||
let mut right_main_it = ScoreDetails::score_values(right.0.iter());
|
||||
let mut right_sub_it =
|
||||
ScoreDetails::score_values(right.1.as_ref().map(|x| x.iter()).into_iter().flatten());
|
||||
|
||||
let mut left_main = left_main_it.next();
|
||||
let mut left_sub = left_sub_it.next();
|
||||
let mut right_main = right_main_it.next();
|
||||
let mut right_sub = right_sub_it.next();
|
||||
|
||||
loop {
|
||||
let left =
|
||||
take_best_score(&mut left_main, &mut left_sub, &mut left_main_it, &mut left_sub_it);
|
||||
|
||||
let right =
|
||||
take_best_score(&mut right_main, &mut right_sub, &mut right_main_it, &mut right_sub_it);
|
||||
|
||||
match (left, right) {
|
||||
(None, None) => return Ordering::Equal,
|
||||
(None, Some(_)) => return Ordering::Less,
|
||||
(Some(_), None) => return Ordering::Greater,
|
||||
(Some(ScoreValue::Score(left)), Some(ScoreValue::Score(right))) => {
|
||||
if (left - right).abs() <= f64::EPSILON {
|
||||
continue;
|
||||
}
|
||||
return left.partial_cmp(&right).unwrap();
|
||||
}
|
||||
(Some(ScoreValue::Sort(left)), Some(ScoreValue::Sort(right))) => {
|
||||
match left.partial_cmp(right).unwrap() {
|
||||
Ordering::Equal => continue,
|
||||
order => return order,
|
||||
}
|
||||
}
|
||||
(Some(ScoreValue::GeoSort(left)), Some(ScoreValue::GeoSort(right))) => {
|
||||
match left.partial_cmp(right).unwrap() {
|
||||
Ordering::Equal => continue,
|
||||
order => return order,
|
||||
}
|
||||
}
|
||||
(Some(ScoreValue::Score(_)), Some(_)) => return Ordering::Greater,
|
||||
(Some(_), Some(ScoreValue::Score(_))) => return Ordering::Less,
|
||||
// comparing a geo-sort score with a plain sort score should be impossible
|
||||
(Some(ScoreValue::GeoSort(_)), Some(ScoreValue::Sort(_)))
|
||||
| (Some(ScoreValue::Sort(_)), Some(ScoreValue::GeoSort(_))) => {
|
||||
unreachable!("Unexpected geo and sort comparison")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn take_best_score<'a>(
|
||||
main_score: &mut Option<ScoreValue<'a>>,
|
||||
sub_score: &mut Option<ScoreValue<'a>>,
|
||||
main_it: &mut impl Iterator<Item = ScoreValue<'a>>,
|
||||
sub_it: &mut impl Iterator<Item = ScoreValue<'a>>,
|
||||
) -> Option<ScoreValue<'a>> {
|
||||
match (*main_score, *sub_score) {
|
||||
(Some(main), None) => {
|
||||
*main_score = main_it.next();
|
||||
Some(main)
|
||||
}
|
||||
(None, Some(sub)) => {
|
||||
*sub_score = sub_it.next();
|
||||
Some(sub)
|
||||
}
|
||||
(main @ Some(ScoreValue::Score(main_f)), sub @ Some(ScoreValue::Score(sub_v))) => {
|
||||
// take the max score and advance both iterators
|
||||
*main_score = main_it.next();
|
||||
*sub_score = sub_it.next();
|
||||
if main_f >= sub_v {
|
||||
main
|
||||
} else {
|
||||
sub
|
||||
}
|
||||
}
|
||||
(main @ Some(ScoreValue::Score(_)), _) => {
|
||||
*main_score = main_it.next();
|
||||
main
|
||||
}
|
||||
(_, sub @ Some(ScoreValue::Score(_))) => {
|
||||
*sub_score = sub_it.next();
|
||||
sub
|
||||
}
|
||||
(main @ Some(ScoreValue::GeoSort(main_geo)), sub @ Some(ScoreValue::GeoSort(sub_geo))) => {
|
||||
// take the best score and advance both iterators
|
||||
*main_score = main_it.next();
|
||||
*sub_score = sub_it.next();
|
||||
if main_geo >= sub_geo {
|
||||
main
|
||||
} else {
|
||||
sub
|
||||
}
|
||||
}
|
||||
(main @ Some(ScoreValue::Sort(main_sort)), sub @ Some(ScoreValue::Sort(sub_sort))) => {
|
||||
// take the best score and advance both iterators
|
||||
*main_score = main_it.next();
|
||||
*sub_score = sub_it.next();
|
||||
if main_sort >= sub_sort {
|
||||
main
|
||||
} else {
|
||||
sub
|
||||
}
|
||||
}
|
||||
(
|
||||
Some(ScoreValue::GeoSort(_) | ScoreValue::Sort(_)),
|
||||
Some(ScoreValue::GeoSort(_) | ScoreValue::Sort(_)),
|
||||
) => None,
|
||||
|
||||
(None, None) => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl CombinedSearchResult {
|
||||
fn new(main_results: SearchResult, ancillary_results: PartialSearchResult) -> Self {
|
||||
let mut docid_scores = HashMap::new();
|
||||
for (docid, score) in
|
||||
main_results.documents_ids.iter().zip(main_results.document_scores.into_iter())
|
||||
{
|
||||
docid_scores.insert(*docid, (score, None));
|
||||
}
|
||||
|
||||
for (docid, score) in ancillary_results
|
||||
.documents_ids
|
||||
.iter()
|
||||
.zip(ancillary_results.document_scores.into_iter())
|
||||
{
|
||||
docid_scores
|
||||
.entry(*docid)
|
||||
.and_modify(|(_main_score, ancillary_score)| *ancillary_score = Some(score));
|
||||
}
|
||||
|
||||
let mut document_scores: Vec<_> = docid_scores.into_iter().collect();
|
||||
|
||||
document_scores.sort_by(|(_, left), (_, right)| compare_scores(left, right).reverse());
|
||||
|
||||
Self {
|
||||
matching_words: main_results.matching_words,
|
||||
candidates: main_results.candidates,
|
||||
document_scores,
|
||||
}
|
||||
}
|
||||
|
||||
fn merge(left: Self, right: Self, from: usize, length: usize) -> SearchResult {
|
||||
let mut documents_ids =
|
||||
Vec::with_capacity(left.document_scores.len() + right.document_scores.len());
|
||||
let mut document_scores =
|
||||
Vec::with_capacity(left.document_scores.len() + right.document_scores.len());
|
||||
|
||||
let mut documents_seen = RoaringBitmap::new();
|
||||
for (docid, (main_score, _sub_score)) in left
|
||||
.document_scores
|
||||
.into_iter()
|
||||
.merge_by(right.document_scores.into_iter(), |(_, left), (_, right)| {
|
||||
// the first value is the one with the greatest score
|
||||
compare_scores(left, right).is_ge()
|
||||
})
|
||||
// remove documents we already saw
|
||||
.filter(|(docid, _)| documents_seen.insert(*docid))
|
||||
// start skipping **after** the filter
|
||||
.skip(from)
|
||||
// take **after** skipping
|
||||
.take(length)
|
||||
{
|
||||
documents_ids.push(docid);
|
||||
// TODO: pass both scores to document_scores in some way?
|
||||
document_scores.push(main_score);
|
||||
}
|
||||
|
||||
SearchResult {
|
||||
matching_words: left.matching_words,
|
||||
candidates: left.candidates | right.candidates,
|
||||
documents_ids,
|
||||
document_scores,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Search<'a> {
|
||||
pub fn execute_hybrid(&self) -> Result<SearchResult> {
|
||||
// TODO: find a classier way to achieve this than resetting the vector and query params
|
||||
// create separate keyword and semantic searches
|
||||
let mut search = Search {
|
||||
query: self.query.clone(),
|
||||
vector: self.vector.clone(),
|
||||
filter: self.filter.clone(),
|
||||
offset: 0,
|
||||
limit: self.limit + self.offset,
|
||||
sort_criteria: self.sort_criteria.clone(),
|
||||
searchable_attributes: self.searchable_attributes,
|
||||
geo_strategy: self.geo_strategy,
|
||||
terms_matching_strategy: self.terms_matching_strategy,
|
||||
scoring_strategy: ScoringStrategy::Detailed,
|
||||
words_limit: self.words_limit,
|
||||
exhaustive_number_hits: self.exhaustive_number_hits,
|
||||
rtxn: self.rtxn,
|
||||
index: self.index,
|
||||
};
|
||||
|
||||
let vector_query = search.vector.take();
|
||||
let keyword_query = self.query.as_deref();
|
||||
|
||||
let keyword_results = search.execute()?;
|
||||
|
||||
// skip semantic search if we don't have a vector query (placeholder search)
|
||||
let Some(vector_query) = vector_query else {
|
||||
return Ok(keyword_results);
|
||||
};
|
||||
|
||||
// completely skip semantic search if the results of the keyword search are good enough
|
||||
if self.results_good_enough(&keyword_results) {
|
||||
return Ok(keyword_results);
|
||||
}
|
||||
|
||||
search.vector = Some(vector_query);
|
||||
search.query = None;
|
||||
|
||||
// TODO: would be better to have two distinct functions at this point
|
||||
let vector_results = search.execute()?;
|
||||
|
||||
// Compute keyword scores for vector_results
|
||||
let keyword_results_for_vector =
|
||||
self.keyword_results_for_vector(keyword_query, &vector_results)?;
|
||||
|
||||
// compute vector scores for keyword_results
|
||||
let vector_results_for_keyword =
|
||||
// can unwrap because we returned already if there was no vector query
|
||||
self.vector_results_for_keyword(search.vector.as_ref().unwrap(), &keyword_results)?;
|
||||
|
||||
let keyword_results =
|
||||
CombinedSearchResult::new(keyword_results, vector_results_for_keyword);
|
||||
let vector_results = CombinedSearchResult::new(vector_results, keyword_results_for_vector);
|
||||
|
||||
let merge_results =
|
||||
CombinedSearchResult::merge(vector_results, keyword_results, self.offset, self.limit);
|
||||
assert!(merge_results.documents_ids.len() <= self.limit);
|
||||
Ok(merge_results)
|
||||
}
|
||||
|
||||
fn vector_results_for_keyword(
|
||||
&self,
|
||||
vector: &[f32],
|
||||
keyword_results: &SearchResult,
|
||||
) -> Result<PartialSearchResult> {
|
||||
let mut ctx = SearchContext::new(self.index, self.rtxn);
|
||||
|
||||
if let Some(searchable_attributes) = self.searchable_attributes {
|
||||
ctx.searchable_attributes(searchable_attributes)?;
|
||||
}
|
||||
|
||||
let universe = keyword_results.documents_ids.iter().collect();
|
||||
|
||||
execute_vector_search(
|
||||
&mut ctx,
|
||||
vector,
|
||||
ScoringStrategy::Detailed,
|
||||
universe,
|
||||
&self.sort_criteria,
|
||||
self.geo_strategy,
|
||||
0,
|
||||
self.limit + self.offset,
|
||||
)
|
||||
}
|
||||
|
||||
fn keyword_results_for_vector(
|
||||
&self,
|
||||
query: Option<&str>,
|
||||
vector_results: &SearchResult,
|
||||
) -> Result<PartialSearchResult> {
|
||||
let mut ctx = SearchContext::new(self.index, self.rtxn);
|
||||
|
||||
if let Some(searchable_attributes) = self.searchable_attributes {
|
||||
ctx.searchable_attributes(searchable_attributes)?;
|
||||
}
|
||||
|
||||
let universe = vector_results.documents_ids.iter().collect();
|
||||
|
||||
execute_search(
|
||||
&mut ctx,
|
||||
query,
|
||||
self.terms_matching_strategy,
|
||||
ScoringStrategy::Detailed,
|
||||
self.exhaustive_number_hits,
|
||||
universe,
|
||||
&self.sort_criteria,
|
||||
self.geo_strategy,
|
||||
0,
|
||||
self.limit + self.offset,
|
||||
Some(self.words_limit),
|
||||
&mut DefaultSearchLogger,
|
||||
&mut DefaultSearchLogger,
|
||||
)
|
||||
}
|
||||
|
||||
fn results_good_enough(&self, keyword_results: &SearchResult) -> bool {
|
||||
const GOOD_ENOUGH_SCORE: f64 = 0.9;
|
||||
|
||||
// 1. we check that we got a sufficient number of results
|
||||
if keyword_results.document_scores.len() < self.limit + self.offset {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 2. and that all results have a good enough score.
|
||||
// we need to check all results because, due to sort-like rules, they're not necessarily in relevancy order
|
||||
for score in &keyword_results.document_scores {
|
||||
let score = ScoreDetails::global_score(score.iter());
|
||||
if score < GOOD_ENOUGH_SCORE {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
|
@ -3,6 +3,7 @@ use std::ops::ControlFlow;
|
|||
|
||||
use charabia::normalizer::NormalizerOption;
|
||||
use charabia::Normalize;
|
||||
use deserr::{DeserializeError, Deserr, Sequence};
|
||||
use fst::automaton::{Automaton, Str};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
|
||||
|
@ -12,12 +13,13 @@ use roaring::bitmap::RoaringBitmap;
|
|||
|
||||
pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
|
||||
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
|
||||
use self::new::PartialSearchResult;
|
||||
use self::new::{execute_vector_search, PartialSearchResult};
|
||||
use crate::error::UserError;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
|
||||
use crate::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use crate::{
|
||||
execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result, SearchContext,
|
||||
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index,
|
||||
Result, SearchContext,
|
||||
};
|
||||
|
||||
// Building these factories is not free.
|
||||
|
@ -30,6 +32,7 @@ const MAX_NUMBER_OF_FACETS: usize = 100;
|
|||
|
||||
pub mod facet;
|
||||
mod fst_utils;
|
||||
pub mod hybrid;
|
||||
pub mod new;
|
||||
|
||||
pub struct Search<'a> {
|
||||
|
@ -50,6 +53,53 @@ pub struct Search<'a> {
|
|||
index: &'a Index,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum VectorQuery {
|
||||
Vector(Vec<f32>),
|
||||
String(String),
|
||||
}
|
||||
|
||||
impl<E> Deserr<E> for VectorQuery
|
||||
where
|
||||
E: DeserializeError,
|
||||
{
|
||||
fn deserialize_from_value<V: deserr::IntoValue>(
|
||||
value: deserr::Value<V>,
|
||||
location: deserr::ValuePointerRef,
|
||||
) -> std::result::Result<Self, E> {
|
||||
match value {
|
||||
deserr::Value::String(s) => Ok(VectorQuery::String(s)),
|
||||
deserr::Value::Sequence(seq) => {
|
||||
let v: std::result::Result<Vec<f32>, _> = seq
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(index, v)| match v.into_value() {
|
||||
deserr::Value::Float(f) => Ok(f as f32),
|
||||
deserr::Value::Integer(i) => Ok(i as f32),
|
||||
v => Err(deserr::take_cf_content(E::error::<V>(
|
||||
None,
|
||||
deserr::ErrorKind::IncorrectValueKind {
|
||||
actual: v,
|
||||
accepted: &[deserr::ValueKind::Float, deserr::ValueKind::Integer],
|
||||
},
|
||||
location.push_index(index),
|
||||
))),
|
||||
})
|
||||
.collect();
|
||||
Ok(VectorQuery::Vector(v?))
|
||||
}
|
||||
_ => Err(deserr::take_cf_content(E::error::<V>(
|
||||
None,
|
||||
deserr::ErrorKind::IncorrectValueKind {
|
||||
actual: value,
|
||||
accepted: &[deserr::ValueKind::String, deserr::ValueKind::Sequence],
|
||||
},
|
||||
location,
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Search<'a> {
|
||||
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> {
|
||||
Search {
|
||||
|
@ -75,8 +125,8 @@ impl<'a> Search<'a> {
|
|||
self
|
||||
}
|
||||
|
||||
pub fn vector(&mut self, vector: impl Into<Vec<f32>>) -> &mut Search<'a> {
|
||||
self.vector = Some(vector.into());
|
||||
pub fn vector(&mut self, vector: Vec<f32>) -> &mut Search<'a> {
|
||||
self.vector = Some(vector);
|
||||
self
|
||||
}
|
||||
|
||||
|
@ -140,23 +190,35 @@ impl<'a> Search<'a> {
|
|||
ctx.searchable_attributes(searchable_attributes)?;
|
||||
}
|
||||
|
||||
let universe = filtered_universe(&ctx, &self.filter)?;
|
||||
let PartialSearchResult { located_query_terms, candidates, documents_ids, document_scores } =
|
||||
execute_search(
|
||||
&mut ctx,
|
||||
&self.query,
|
||||
&self.vector,
|
||||
self.terms_matching_strategy,
|
||||
self.scoring_strategy,
|
||||
self.exhaustive_number_hits,
|
||||
&self.filter,
|
||||
&self.sort_criteria,
|
||||
self.geo_strategy,
|
||||
self.offset,
|
||||
self.limit,
|
||||
Some(self.words_limit),
|
||||
&mut DefaultSearchLogger,
|
||||
&mut DefaultSearchLogger,
|
||||
)?;
|
||||
match self.vector.as_ref() {
|
||||
Some(vector) => execute_vector_search(
|
||||
&mut ctx,
|
||||
vector,
|
||||
self.scoring_strategy,
|
||||
universe,
|
||||
&self.sort_criteria,
|
||||
self.geo_strategy,
|
||||
self.offset,
|
||||
self.limit,
|
||||
)?,
|
||||
None => execute_search(
|
||||
&mut ctx,
|
||||
self.query.as_deref(),
|
||||
self.terms_matching_strategy,
|
||||
self.scoring_strategy,
|
||||
self.exhaustive_number_hits,
|
||||
universe,
|
||||
&self.sort_criteria,
|
||||
self.geo_strategy,
|
||||
self.offset,
|
||||
self.limit,
|
||||
Some(self.words_limit),
|
||||
&mut DefaultSearchLogger,
|
||||
&mut DefaultSearchLogger,
|
||||
)?,
|
||||
};
|
||||
|
||||
// consume context and located_query_terms to build MatchingWords.
|
||||
let matching_words = match located_query_terms {
|
||||
|
|
|
@ -498,19 +498,19 @@ mod tests {
|
|||
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{execute_search, SearchContext};
|
||||
use crate::{execute_search, filtered_universe, SearchContext};
|
||||
|
||||
impl<'a> MatcherBuilder<'a> {
|
||||
fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
|
||||
let mut ctx = SearchContext::new(index, rtxn);
|
||||
let universe = filtered_universe(&ctx, &None).unwrap();
|
||||
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
|
||||
&mut ctx,
|
||||
&Some(query.to_string()),
|
||||
&None,
|
||||
Some(query),
|
||||
crate::TermsMatchingStrategy::default(),
|
||||
crate::score_details::ScoringStrategy::Skip,
|
||||
false,
|
||||
&None,
|
||||
universe,
|
||||
&None,
|
||||
crate::search::new::GeoSortStrategy::default(),
|
||||
0,
|
||||
|
|
|
@ -16,6 +16,7 @@ mod small_bitmap;
|
|||
|
||||
mod exact_attribute;
|
||||
mod sort;
|
||||
mod vector_sort;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
@ -28,7 +29,6 @@ use db_cache::DatabaseCache;
|
|||
use exact_attribute::ExactAttribute;
|
||||
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
|
||||
use heed::RoTxn;
|
||||
use instant_distance::Search;
|
||||
use interner::{DedupInterner, Interner};
|
||||
pub use logger::visual::VisualSearchLogger;
|
||||
pub use logger::{DefaultSearchLogger, SearchLogger};
|
||||
|
@ -46,7 +46,7 @@ use self::geo_sort::GeoSort;
|
|||
pub use self::geo_sort::Strategy as GeoSortStrategy;
|
||||
use self::graph_based_ranking_rule::Words;
|
||||
use self::interner::Interned;
|
||||
use crate::distance::NDotProductPoint;
|
||||
use self::vector_sort::VectorSort;
|
||||
use crate::error::FieldIdMapMissingEntry;
|
||||
use crate::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use crate::search::new::distinct::apply_distinct_rule;
|
||||
|
@ -258,6 +258,70 @@ fn get_ranking_rules_for_placeholder_search<'ctx>(
|
|||
Ok(ranking_rules)
|
||||
}
|
||||
|
||||
fn get_ranking_rules_for_vector<'ctx>(
    ctx: &SearchContext<'ctx>,
    sort_criteria: &Option<Vec<AscDesc>>,
    geo_strategy: geo_sort::Strategy,
    target: &[f32],
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
    // query graph search

    let mut sort = false;
    let mut sorted_fields = HashSet::new();
    let mut geo_sorted = false;

    let mut vector = false;
    let mut ranking_rules: Vec<BoxRankingRule<PlaceholderQuery>> = vec![];

    let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
    for rr in settings_ranking_rules {
        match rr {
            crate::Criterion::Words
            | crate::Criterion::Typo
            | crate::Criterion::Proximity
            | crate::Criterion::Attribute
            | crate::Criterion::Exactness => {
                if !vector {
                    let vector_candidates = ctx.index.documents_ids(ctx.txn)?;
                    let vector_sort = VectorSort::new(ctx, target.to_vec(), vector_candidates)?;
                    ranking_rules.push(Box::new(vector_sort));
                    vector = true;
                }
            }
            crate::Criterion::Sort => {
                if sort {
                    continue;
                }
                resolve_sort_criteria(
                    sort_criteria,
                    ctx,
                    &mut ranking_rules,
                    &mut sorted_fields,
                    &mut geo_sorted,
                    geo_strategy,
                )?;
                sort = true;
            }
            crate::Criterion::Asc(field_name) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
            }
            crate::Criterion::Desc(field_name) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
            }
        }
    }

    Ok(ranking_rules)
}
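
In effect, every relevancy criterion (words, typo, proximity, attribute, exactness) collapses into a single `VectorSort` inserted at the position of the first one encountered, while sort/asc/desc keep their usual behaviour. As a hedged illustration (the ranking-rules setting shown is only an example):

// settings: ["words", "typo", "proximity", "attribute", "sort", "exactness"]
// resulting rules for a vector query:
//   [VectorSort(target), Sort(resolved user sort criteria)]
// the `vector` flag guarantees VectorSort is pushed at most once.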
|
||||
|
||||
/// Return the list of initialised ranking rules to be used for a query graph search.
|
||||
fn get_ranking_rules_for_query_graph_search<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
|
@ -422,15 +486,62 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub fn filtered_universe(ctx: &SearchContext, filters: &Option<Filter>) -> Result<RoaringBitmap> {
    Ok(if let Some(filters) = filters {
        filters.evaluate(ctx.txn, ctx.index)?
    } else {
        ctx.index.documents_ids(ctx.txn)?
    })
}
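
For illustration, a caller can evaluate the filter once and feed the resulting universe to either search path, mirroring the dispatch in `Search::execute` above. A minimal hedged sketch; the local variables (`ctx`, `filter`, `vector`, `query`, strategies) are assumed to be in scope:

let universe = filtered_universe(&ctx, &filter)?;
let result = match vector.as_deref() {
    Some(vector) => execute_vector_search(
        &mut ctx, vector, scoring_strategy, universe, &sort_criteria, geo_strategy, 0, 20,
    )?,
    None => execute_search(
        &mut ctx, query.as_deref(), terms_matching_strategy, scoring_strategy, false,
        universe, &sort_criteria, geo_strategy, 0, 20, Some(10),
        &mut DefaultSearchLogger, &mut DefaultSearchLogger,
    )?,
};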
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
pub fn execute_vector_search(
    ctx: &mut SearchContext,
    vector: &[f32],
    scoring_strategy: ScoringStrategy,
    universe: RoaringBitmap,
    sort_criteria: &Option<Vec<AscDesc>>,
    geo_strategy: geo_sort::Strategy,
    from: usize,
    length: usize,
) -> Result<PartialSearchResult> {
    check_sort_criteria(ctx, sort_criteria.as_ref())?;

    // FIXME: input universe = universe & documents_with_vectors
    // for now if we're computing embeddings for ALL documents, we can assume that this is just universe
    let ranking_rules = get_ranking_rules_for_vector(ctx, sort_criteria, geo_strategy, vector)?;

    let mut placeholder_search_logger = logger::DefaultSearchLogger;
    let placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery> =
        &mut placeholder_search_logger;

    let BucketSortOutput { docids, scores, all_candidates } = bucket_sort(
        ctx,
        ranking_rules,
        &PlaceholderQuery,
        &universe,
        from,
        length,
        scoring_strategy,
        placeholder_search_logger,
    )?;

    Ok(PartialSearchResult {
        candidates: all_candidates,
        document_scores: scores,
        documents_ids: docids,
        located_query_terms: None,
    })
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn execute_search(
|
||||
ctx: &mut SearchContext,
|
||||
query: &Option<String>,
|
||||
vector: &Option<Vec<f32>>,
|
||||
query: Option<&str>,
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
scoring_strategy: ScoringStrategy,
|
||||
exhaustive_number_hits: bool,
|
||||
filters: &Option<Filter>,
|
||||
mut universe: RoaringBitmap,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
from: usize,
|
||||
|
@ -439,60 +550,8 @@ pub fn execute_search(
|
|||
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
|
||||
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) -> Result<PartialSearchResult> {
|
||||
let mut universe = if let Some(filters) = filters {
|
||||
filters.evaluate(ctx.txn, ctx.index)?
|
||||
} else {
|
||||
ctx.index.documents_ids(ctx.txn)?
|
||||
};
|
||||
|
||||
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
||||
|
||||
if let Some(vector) = vector {
|
||||
let mut search = Search::default();
|
||||
let docids = match ctx.index.vector_hnsw(ctx.txn)? {
|
||||
Some(hnsw) => {
|
||||
if let Some(expected_size) = hnsw.iter().map(|(_, point)| point.len()).next() {
|
||||
if vector.len() != expected_size {
|
||||
return Err(UserError::InvalidVectorDimensions {
|
||||
expected: expected_size,
|
||||
found: vector.len(),
|
||||
}
|
||||
.into());
|
||||
}
|
||||
}
|
||||
|
||||
let vector = NDotProductPoint::new(vector.clone());
|
||||
|
||||
let neighbors = hnsw.search(&vector, &mut search);
|
||||
|
||||
let mut docids = Vec::new();
|
||||
let mut uniq_docids = RoaringBitmap::new();
|
||||
for instant_distance::Item { distance: _, pid, point: _ } in neighbors {
|
||||
let index = pid.into_inner();
|
||||
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap();
|
||||
if universe.contains(docid) && uniq_docids.insert(docid) {
|
||||
docids.push(docid);
|
||||
if docids.len() == (from + length) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// return the nearest documents that are also part of the candidates
|
||||
// along with a dummy list of scores that are useless in this context.
|
||||
docids.into_iter().skip(from).take(length).collect()
|
||||
}
|
||||
None => Vec::new(),
|
||||
};
|
||||
|
||||
return Ok(PartialSearchResult {
|
||||
candidates: universe,
|
||||
document_scores: vec![Vec::new(); docids.len()],
|
||||
documents_ids: docids,
|
||||
located_query_terms: None,
|
||||
});
|
||||
}
|
||||
|
||||
let mut located_query_terms = None;
|
||||
let query_terms = if let Some(query) = query {
|
||||
// We make sure that the analyzer is aware of the stop words
|
||||
|
@ -546,7 +605,7 @@ pub fn execute_search(
|
|||
terms_matching_strategy,
|
||||
)?;
|
||||
|
||||
universe =
|
||||
universe &=
|
||||
resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?;
|
||||
|
||||
bucket_sort(
|
||||
|
|
150 milli/src/search/new/vector_sort.rs (new file)
|
@ -0,0 +1,150 @@
|
|||
use std::future::Future;
|
||||
use std::iter::FromIterator;
|
||||
use std::pin::Pin;
|
||||
|
||||
use nolife::DynBoxScope;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
||||
use crate::distance::NDotProductPoint;
|
||||
use crate::index::Hnsw;
|
||||
use crate::score_details::{self, ScoreDetails};
|
||||
use crate::{Result, SearchContext, SearchLogger, UserError};
|
||||
|
||||
pub struct VectorSort<Q: RankingRuleQueryTrait> {
|
||||
query: Option<Q>,
|
||||
target: Vec<f32>,
|
||||
vector_candidates: RoaringBitmap,
|
||||
scope: nolife::DynBoxScope<SearchFamily>,
|
||||
}
|
||||
|
||||
type Item<'a> = instant_distance::Item<'a, NDotProductPoint>;
|
||||
type SearchFut = Pin<Box<dyn Future<Output = nolife::Never>>>;
|
||||
|
||||
struct SearchFamily;
|
||||
impl<'a> nolife::Family<'a> for SearchFamily {
|
||||
type Family = Box<dyn Iterator<Item = Item<'a>> + 'a>;
|
||||
}
|
||||
|
||||
async fn search_scope(
|
||||
mut time_capsule: nolife::TimeCapsule<SearchFamily>,
|
||||
hnsw: Hnsw,
|
||||
target: Vec<f32>,
|
||||
) -> nolife::Never {
|
||||
let mut search = instant_distance::Search::default();
|
||||
let it = Box::new(hnsw.search(&NDotProductPoint::new(target), &mut search));
|
||||
let mut it: Box<dyn Iterator<Item = Item>> = it;
|
||||
loop {
|
||||
time_capsule.freeze(&mut it).await;
|
||||
}
|
||||
}
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
|
||||
pub fn new(
|
||||
ctx: &SearchContext,
|
||||
target: Vec<f32>,
|
||||
vector_candidates: RoaringBitmap,
|
||||
) -> Result<Self> {
|
||||
let hnsw =
|
||||
ctx.index.vector_hnsw(ctx.txn)?.unwrap_or(Hnsw::builder().build_hnsw(Vec::default()).0);
|
||||
|
||||
if let Some(expected_size) = hnsw.iter().map(|(_, point)| point.len()).next() {
|
||||
if target.len() != expected_size {
|
||||
return Err(UserError::InvalidVectorDimensions {
|
||||
expected: expected_size,
|
||||
found: target.len(),
|
||||
}
|
||||
.into());
|
||||
}
|
||||
}
|
||||
|
||||
let target_clone = target.clone();
|
||||
let producer = move |time_capsule| -> SearchFut {
|
||||
Box::pin(search_scope(time_capsule, hnsw, target_clone))
|
||||
};
|
||||
let scope = DynBoxScope::new(producer);
|
||||
|
||||
Ok(Self { query: None, target, vector_candidates, scope })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<Q> {
|
||||
fn id(&self) -> String {
|
||||
"vector_sort".to_owned()
|
||||
}
|
||||
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<Q>,
|
||||
universe: &RoaringBitmap,
|
||||
query: &Q,
|
||||
) -> Result<()> {
|
||||
assert!(self.query.is_none());
|
||||
|
||||
self.query = Some(query.clone());
|
||||
self.vector_candidates &= universe;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<Q>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<Q>>> {
|
||||
let query = self.query.as_ref().unwrap().clone();
|
||||
self.vector_candidates &= universe;
|
||||
|
||||
if self.vector_candidates.is_empty() {
|
||||
return Ok(Some(RankingRuleOutput {
|
||||
query,
|
||||
candidates: universe.clone(),
|
||||
score: ScoreDetails::Vector(score_details::Vector {
|
||||
target_vector: self.target.clone(),
|
||||
value_similarity: None,
|
||||
}),
|
||||
}));
|
||||
}
|
||||
|
||||
let scope = &mut self.scope;
|
||||
let target = &self.target;
|
||||
let vector_candidates = &self.vector_candidates;
|
||||
|
||||
scope.enter(|it| {
|
||||
for item in it.by_ref() {
|
||||
let item: Item = item;
|
||||
let index = item.pid.into_inner();
|
||||
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap();
|
||||
|
||||
if vector_candidates.contains(docid) {
|
||||
return Ok(Some(RankingRuleOutput {
|
||||
query,
|
||||
candidates: RoaringBitmap::from_iter([docid]),
|
||||
score: ScoreDetails::Vector(score_details::Vector {
|
||||
target_vector: target.clone(),
|
||||
value_similarity: Some((
|
||||
item.point.clone().into_inner(),
|
||||
1.0 - item.distance,
|
||||
)),
|
||||
}),
|
||||
}));
|
||||
}
|
||||
}
|
||||
Ok(Some(RankingRuleOutput {
|
||||
query,
|
||||
candidates: universe.clone(),
|
||||
score: ScoreDetails::Vector(score_details::Vector {
|
||||
target_vector: target.clone(),
|
||||
value_similarity: None,
|
||||
}),
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
|
||||
self.query = None;
|
||||
}
|
||||
}
|
|
@ -1,9 +1,10 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::convert::TryFrom;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::mem::size_of;
|
||||
use std::str::from_utf8;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
|
@ -13,13 +14,56 @@ use serde_json::{from_slice, Value};
|
|||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::UserError;
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::try_split_at;
|
||||
use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
||||
use crate::vector::Embedder;
|
||||
use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
|
||||
pub struct ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
pub manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> ()
|
||||
pub remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> prompt
|
||||
pub prompts: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
|
||||
enum VectorStateDelta {
    NoChange,
    // Remove all vectors, generated or manual, from this document
    NowRemoved,

    // Add the manually specified vectors, passed in the other grenad
    // Remove any previously generated vectors
    // Note: changing the value of the manually specified vector **should not record** this delta
    WasGeneratedNowManual(Vec<Vec<f32>>),

    ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>),

    // Add the vector computed from the specified prompt
    // Remove any previous vector
    // Note: changing the value of the prompt **does require** recording this delta
    NowGenerated(String),
}

impl VectorStateDelta {
    fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) {
        match self {
            VectorStateDelta::NoChange => Default::default(),
            VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
            VectorStateDelta::WasGeneratedNowManual(add) => {
                (true, Default::default(), (Default::default(), add))
            }
            VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)),
            VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
        }
    }
}
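
A quick sanity check of `into_values`, which flattens a delta into `(must_remove, prompt_to_embed, (del_manual_vectors, add_manual_vectors))` (a hedged example; the values are made up):

let (must_remove, prompt, (del, add)) =
    VectorStateDelta::NowGenerated("title: Hello".to_string()).into_values();
assert!(must_remove && del.is_empty() && add.is_empty());
assert_eq!(prompt, "title: Hello");

let (must_remove, prompt, (del, add)) =
    VectorStateDelta::ManualDelta(vec![vec![0.0, 1.0]], vec![vec![1.0, 0.0]]).into_values();
assert!(!must_remove && prompt.is_empty());
assert_eq!((del.len(), add.len()), (1, 1));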
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||
|
@ -27,16 +71,34 @@ const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
|||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
vectors_fid: FieldId,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
field_id_map: FieldsIdsMap,
|
||||
prompt: Option<&Prompt>,
|
||||
) -> Result<ExtractedVectorPoints> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let mut writer = create_writer(
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let mut manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let mut prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let mut remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let vectors_fid = field_id_map.id("_vectors");
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
|
@ -53,43 +115,148 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
// first we retrieve the _vectors field
|
||||
if let Some(value) = obkv.get(vectors_fid) {
|
||||
let delta = if let Some(value) = vectors_fid.and_then(|vectors_fid| obkv.get(vectors_fid)) {
|
||||
let vectors_obkv = KvReaderDelAdd::new(value);
|
||||
match (vectors_obkv.get(DelAdd::Deletion), vectors_obkv.get(DelAdd::Addition)) {
|
||||
(Some(old), Some(new)) => {
|
||||
// no autogeneration
|
||||
let del_vectors = extract_vectors(old, document_id)?;
|
||||
let add_vectors = extract_vectors(new, document_id)?;
|
||||
|
||||
// then we extract the values
|
||||
let del_vectors = vectors_obkv
|
||||
.get(DelAdd::Deletion)
|
||||
.map(|vectors| extract_vectors(vectors, document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
let add_vectors = vectors_obkv
|
||||
.get(DelAdd::Addition)
|
||||
.map(|vectors| extract_vectors(vectors, document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
VectorStateDelta::ManualDelta(
|
||||
del_vectors.unwrap_or_default(),
|
||||
add_vectors.unwrap_or_default(),
|
||||
)
|
||||
}
|
||||
(None, Some(new)) => {
|
||||
// was possibly autogenerated, remove all vectors for that document
|
||||
let add_vectors = extract_vectors(new, document_id)?;
|
||||
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
&mut writer,
|
||||
&mut key_buffer,
|
||||
del_vectors.unwrap_or_default(),
|
||||
add_vectors.unwrap_or_default(),
|
||||
)?;
|
||||
}
|
||||
VectorStateDelta::WasGeneratedNowManual(add_vectors.unwrap_or_default())
|
||||
}
|
||||
(Some(_old), None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// becomes autogenerated
|
||||
match prompt {
|
||||
Some(prompt) => VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
&field_id_map,
|
||||
)?),
|
||||
None => VectorStateDelta::NowRemoved,
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
(None, None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
match prompt {
|
||||
Some(prompt) => {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt = prompt
|
||||
.render(obkv, DelAdd::Deletion, &field_id_map)
|
||||
.unwrap_or_default();
|
||||
let new_prompt =
|
||||
prompt.render(obkv, DelAdd::Addition, &field_id_map)?;
|
||||
if old_prompt != new_prompt {
|
||||
log::trace!(
|
||||
"Changing prompt from\n{old_prompt}\n===\nto\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
}
|
||||
// We no longer have a prompt, so we need to remove any existing vector
|
||||
None => VectorStateDelta::NowRemoved,
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
match prompt {
|
||||
Some(prompt) => {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt = prompt
|
||||
.render(obkv, DelAdd::Deletion, &field_id_map)
|
||||
.unwrap_or_default();
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, &field_id_map)?;
|
||||
if old_prompt != new_prompt {
|
||||
log::trace!(
|
||||
"Changing prompt from\n{old_prompt}\n===\nto\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
}
|
||||
None => VectorStateDelta::NowRemoved,
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
};
|
||||
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
&mut remove_vectors_writer,
|
||||
&mut prompts_writer,
|
||||
&mut manual_vectors_writer,
|
||||
&mut key_buffer,
|
||||
delta,
|
||||
)?;
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
Ok(ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||
// docid -> ()
|
||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||
// docid -> prompt
|
||||
prompts: writer_into_reader(prompts_writer)?,
|
||||
})
|
||||
}
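
Summarising the nested match above, the (old, new) state of the `_vectors` field maps to a delta roughly as follows (a condensed restatement of the logic, not additional behaviour):

// (Some(old), Some(new))         -> ManualDelta(old, new)          // manual vectors updated
// (None,      Some(new))         -> WasGeneratedNowManual(new)     // switch to manual vectors
// (Some(old), None)              -> NowGenerated(prompt) if a prompt is configured,
//                                    otherwise NowRemoved
// no _vectors change / no field  -> NowGenerated(new_prompt) if the rendered prompt changed,
//                                    NoChange if it did not, NowRemoved without a prompt
// document entirely deleted      -> NowRemoved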
|
||||
|
||||
/// Computes the diff between the Del and Add vectors and
/// writes only the parts that differ to the appropriate writer.
|
||||
fn push_vectors_diff(
|
||||
writer: &mut Writer<BufWriter<File>>,
|
||||
remove_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
prompts_writer: &mut Writer<BufWriter<File>>,
|
||||
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_vectors: Vec<Vec<f32>>,
|
||||
mut add_vectors: Vec<Vec<f32>>,
|
||||
delta: VectorStateDelta,
|
||||
) -> Result<()> {
|
||||
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
|
||||
if must_remove {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
remove_vectors_writer.insert(&key_buffer, [])?;
|
||||
}
|
||||
if !prompt.is_empty() {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
prompts_writer.insert(&key_buffer, prompt.as_bytes())?;
|
||||
}
|
||||
|
||||
// We sort and dedup the vectors
|
||||
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
|
@ -114,7 +281,7 @@ fn push_vectors_diff(
|
|||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(&key_buffer, bytes)?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
EitherOrBoth::Right(vector) => {
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
|
@ -122,7 +289,7 @@ fn push_vectors_diff(
|
|||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(&key_buffer, bytes)?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -146,3 +313,102 @@ fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Opti
|
|||
.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time]
|
||||
pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
// docid, prompt
|
||||
prompt_reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedder: Arc<Embedder>,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, Option<usize>)> {
|
||||
let rt = tokio::runtime::Builder::new_current_thread().enable_io().enable_time().build()?;
|
||||
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
|
||||
|
||||
// docid, state with embedding
|
||||
let mut state_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut chunks = Vec::with_capacity(n_chunks);
|
||||
let mut current_chunk = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut current_chunk_ids = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut chunks_ids = Vec::with_capacity(n_chunks);
|
||||
let mut cursor = prompt_reader.into_cursor()?;
|
||||
|
||||
let mut expected_dimension = None;
|
||||
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
// SAFETY: precondition, the grenad value was saved from a string
|
||||
let prompt = unsafe { std::str::from_utf8_unchecked(value) };
|
||||
if current_chunk.len() == current_chunk.capacity() {
|
||||
chunks.push(std::mem::replace(
|
||||
&mut current_chunk,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
chunks_ids.push(std::mem::replace(
|
||||
&mut current_chunk_ids,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
};
|
||||
current_chunk.push(prompt.to_owned());
|
||||
current_chunk_ids.push(docid);
|
||||
|
||||
if chunks.len() == chunks.capacity() {
|
||||
let chunked_embeds = rt
|
||||
.block_on(
|
||||
embedder
|
||||
.embed_chunks(std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks))),
|
||||
)
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::UserError::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?;
|
||||
expected_dimension = Some(embeddings.dimension());
|
||||
}
|
||||
chunks_ids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// send last chunk
|
||||
if !chunks.is_empty() {
|
||||
let chunked_embeds = rt
|
||||
.block_on(embedder.embed_chunks(std::mem::take(&mut chunks)))
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::UserError::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?;
|
||||
expected_dimension = Some(embeddings.dimension());
|
||||
}
|
||||
}
|
||||
|
||||
if !current_chunk.is_empty() {
|
||||
let embeds = rt
|
||||
.block_on(embedder.embed(std::mem::take(&mut current_chunk)))
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::UserError::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
|
||||
for (docid, embeddings) in current_chunk_ids.iter().zip(embeds.iter()) {
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?;
|
||||
expected_dimension = Some(embeddings.dimension());
|
||||
}
|
||||
}
|
||||
|
||||
Ok((writer_into_reader(state_writer)?, expected_dimension))
|
||||
}
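
The batching can be looked at in isolation: prompts are grouped into chunks of `prompt_count_in_chunk_hint()` prompts, and the embedder is invoked once per `chunk_count_hint()` full chunks, plus smaller trailing calls. A simplified, hedged sketch of that grouping only (the real function also streams from grenad and runs the calls on a tokio runtime):

fn group_prompts(prompts: Vec<String>, n_chunks: usize, per_chunk: usize) -> Vec<Vec<Vec<String>>> {
    // Each inner Vec<String> is one chunk; each Vec<Vec<String>> is one embedder call.
    let mut calls = Vec::new();
    let mut chunks: Vec<Vec<String>> = Vec::new();
    for chunk in prompts.chunks(per_chunk) {
        chunks.push(chunk.to_vec());
        if chunks.len() == n_chunks {
            calls.push(std::mem::take(&mut chunks));
        }
    }
    if !chunks.is_empty() {
        calls.push(chunks); // leftover full chunks and the final partial chunk
    }
    calls
}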
|
||||
|
|
|
@ -9,9 +9,10 @@ mod extract_word_docids;
|
|||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use log::debug;
|
||||
|
@ -23,7 +24,9 @@ use self::extract_facet_string_docids::extract_facet_string_docids;
|
|||
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_vector_points::extract_vector_points;
|
||||
use self::extract_vector_points::{
|
||||
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
|
||||
};
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
|
@ -32,8 +35,10 @@ use super::helpers::{
|
|||
MergeFn, MergeableReader,
|
||||
};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::prompt::Prompt;
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::{FieldId, Result};
|
||||
use crate::vector::Embedder;
|
||||
use crate::{FieldId, FieldsIdsMap, Result};
|
||||
|
||||
/// Extract data for each databases from obkv documents in parallel.
|
||||
/// Send data in grenad file over provided Sender.
|
||||
|
@ -47,13 +52,14 @@ pub(crate) fn data_from_obkv_documents(
|
|||
faceted_fields: HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
vectors_field_id: Option<FieldId>,
|
||||
field_id_map: FieldsIdsMap,
|
||||
stop_words: Option<fst::Set<&[u8]>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
exact_attributes: HashSet<FieldId>,
|
||||
proximity_precision: ProximityPrecision,
|
||||
embedders: HashMap<String, (Arc<Embedder>, Arc<Prompt>)>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -64,7 +70,8 @@ pub(crate) fn data_from_obkv_documents(
|
|||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
vectors_field_id,
|
||||
field_id_map.clone(),
|
||||
embedders.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()?;
|
||||
|
@ -276,24 +283,42 @@ fn send_original_documents_data(
|
|||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
vectors_field_id: Option<FieldId>,
|
||||
field_id_map: FieldsIdsMap,
|
||||
embedders: HashMap<String, (Arc<Embedder>, Arc<Prompt>)>,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
if let Some(vectors_field_id) = vectors_field_id {
|
||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id);
|
||||
let _ = match result {
|
||||
Ok(vector_points) => {
|
||||
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
||||
}
|
||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||
};
|
||||
});
|
||||
}
|
||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
let (embedder, prompt) = embedders.get("default").cloned().unzip();
|
||||
let result =
|
||||
extract_vector_points(documents_chunk_cloned, indexer, field_id_map, prompt.as_deref());
|
||||
let _ = match result {
|
||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||
// FIXME: support multiple embedders
|
||||
let results = embedder.and_then(|embedder| {
|
||||
match extract_embeddings(prompts, indexer, embedder.clone()) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
None
|
||||
}
|
||||
}
|
||||
});
|
||||
let (embeddings, expected_dimension) = results.unzip();
|
||||
let expected_dimension = expected_dimension.flatten();
|
||||
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
manual_vectors,
|
||||
}))
|
||||
}
|
||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||
};
|
||||
});
|
||||
|
||||
// TODO: create a custom internal error
|
||||
lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
|
||||
|
|
|
@ -4,11 +4,12 @@ mod helpers;
|
|||
mod transform;
|
||||
mod typed_chunk;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{Cursor, Read, Seek};
|
||||
use std::iter::FromIterator;
|
||||
use std::num::NonZeroU32;
|
||||
use std::result::Result as StdResult;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
use heed::types::Str;
|
||||
|
@ -32,10 +33,12 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
|||
pub use self::transform::{Transform, TransformOutput};
|
||||
use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::prompt::Prompt;
|
||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||
use crate::update::{
|
||||
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||
};
|
||||
use crate::vector::Embedder;
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
|
||||
static MERGED_DATABASE_COUNT: usize = 7;
|
||||
|
@ -78,6 +81,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
|
|||
should_abort: FA,
|
||||
added_documents: u64,
|
||||
deleted_documents: u64,
|
||||
embedders: HashMap<String, (Arc<Embedder>, Arc<Prompt>)>,
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone)]
|
||||
|
@ -121,6 +125,7 @@ where
|
|||
index,
|
||||
added_documents: 0,
|
||||
deleted_documents: 0,
|
||||
embedders: Default::default(),
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -167,6 +172,14 @@ where
|
|||
Ok((self, Ok(indexed_documents)))
|
||||
}
|
||||
|
||||
pub fn with_embedders(
|
||||
mut self,
|
||||
embedders: HashMap<String, (Arc<Embedder>, Arc<Prompt>)>,
|
||||
) -> Self {
|
||||
self.embedders = embedders;
|
||||
self
|
||||
}
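
A hedged usage sketch; `embedder` and `prompt` are assumed to have been built elsewhere from the index's embedding configs, and `builder` is an already constructed `IndexDocuments`:

let mut embedders = HashMap::new();
embedders.insert("default".to_string(), (Arc::new(embedder), Arc::new(prompt)));
let builder = builder.with_embedders(embedders);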
|
||||
|
||||
/// Remove a batch of documents from the current builder.
|
||||
///
|
||||
/// Returns the number of documents deleted from the builder.
|
||||
|
@ -322,17 +335,18 @@ where
|
|||
// get filterable fields for facet databases
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
// get the fid of the `_geo.lat` and `_geo.lng` fields.
|
||||
let geo_fields_ids = match self.index.fields_ids_map(self.wtxn)?.id("_geo") {
|
||||
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
|
||||
// self.index.fields_ids_map($a)? ==>> field_id_map
|
||||
let geo_fields_ids = match field_id_map.id("_geo") {
|
||||
Some(gfid) => {
|
||||
let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
|
||||
let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
|
||||
// if `_geo` is faceted then we get the `lat` and `lng`
|
||||
if is_sortable || is_filterable {
|
||||
let field_ids = self
|
||||
.index
|
||||
.fields_ids_map(self.wtxn)?
|
||||
let field_ids = field_id_map
|
||||
.insert("_geo.lat")
|
||||
.zip(self.index.fields_ids_map(self.wtxn)?.insert("_geo.lng"))
|
||||
.zip(field_id_map.insert("_geo.lng"))
|
||||
.ok_or(UserError::AttributeLimitReached)?;
|
||||
Some(field_ids)
|
||||
} else {
|
||||
|
@ -341,8 +355,6 @@ where
|
|||
}
|
||||
None => None,
|
||||
};
|
||||
// get the fid of the `_vectors` field.
|
||||
let vectors_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vectors");
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
let separators = self.index.allowed_separators(self.wtxn)?;
|
||||
|
@ -364,6 +376,8 @@ where
|
|||
self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB
|
||||
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
|
||||
|
||||
let cloned_embedder = self.embedders.clone();
|
||||
|
||||
// Run extraction pipeline in parallel.
|
||||
pool.install(|| {
|
||||
puffin::profile_scope!("extract_and_send_grenad_chunks");
|
||||
|
@ -387,13 +401,14 @@ where
|
|||
faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
vectors_field_id,
|
||||
field_id_map,
|
||||
stop_words,
|
||||
separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
max_positions_per_attributes,
|
||||
exact_attributes,
|
||||
proximity_precision,
|
||||
cloned_embedder,
|
||||
)
|
||||
});
|
||||
|
||||
|
@ -2505,7 +2520,7 @@ mod tests {
|
|||
.unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
|
||||
let res = index.search(&rtxn).vector([0.0, 1.0, 2.0].to_vec()).execute().unwrap();
|
||||
assert_eq!(res.documents_ids.len(), 3);
|
||||
}
|
||||
|
||||
|
|
|
@ -47,7 +47,12 @@ pub(crate) enum TypedChunk {
|
|||
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||
VectorPoints(grenad::Reader<BufReader<File>>),
|
||||
VectorPoints {
|
||||
remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
embeddings: Option<grenad::Reader<BufReader<File>>>,
|
||||
expected_dimension: Option<usize>,
|
||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
},
|
||||
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||
}
|
||||
|
||||
|
@ -100,8 +105,8 @@ impl TypedChunk {
|
|||
TypedChunk::GeoPoints(grenad) => {
|
||||
format!("GeoPoints {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::VectorPoints(grenad) => {
|
||||
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
||||
TypedChunk::VectorPoints{ remove_vectors, manual_vectors, embeddings, expected_dimension } => {
|
||||
format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension.unwrap_or_default())
|
||||
}
|
||||
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
|
||||
|
@ -355,19 +360,64 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
index.put_geo_rtree(wtxn, &rtree)?;
|
||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||
}
|
||||
TypedChunk::VectorPoints(vector_points) => {
|
||||
let mut vectors_set = HashSet::new();
|
||||
TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
manual_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
} => {
|
||||
if remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty())
|
||||
{
|
||||
return Ok((RoaringBitmap::new(), is_merged_database));
|
||||
}
|
||||
|
||||
let mut docid_vectors_map: HashMap<DocumentId, HashSet<Vec<OrderedFloat<f32>>>> =
|
||||
HashMap::new();
|
||||
|
||||
// We extract and store the previous vectors
|
||||
if let Some(hnsw) = index.vector_hnsw(wtxn)? {
|
||||
for (pid, point) in hnsw.iter() {
|
||||
let pid_key = pid.into_inner();
|
||||
let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap();
|
||||
let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect();
|
||||
vectors_set.insert((docid, vector));
|
||||
docid_vectors_map.entry(docid).or_default().insert(vector);
|
||||
}
|
||||
}
|
||||
|
||||
let mut cursor = vector_points.into_cursor()?;
|
||||
// remove vectors for docids we want them removed
|
||||
let mut cursor = remove_vectors.into_cursor()?;
|
||||
while let Some((key, _)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
docid_vectors_map.remove(&docid);
|
||||
}
|
||||
|
||||
// add generated embeddings
|
||||
if let Some((embeddings, expected_dimension)) = embeddings.zip(expected_dimension) {
|
||||
let mut cursor = embeddings.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
let data: Vec<OrderedFloat<_>> =
|
||||
pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
// it is a code error to have embeddings and not expected_dimension
|
||||
let embeddings =
|
||||
crate::vector::Embeddings::from_inner(data, expected_dimension)
|
||||
// code error if we somehow got the wrong dimension
|
||||
.unwrap();
|
||||
|
||||
let mut set = HashSet::new();
|
||||
for embedding in embeddings.iter() {
|
||||
set.insert(embedding.to_vec());
|
||||
}
|
||||
|
||||
docid_vectors_map.insert(docid, set);
|
||||
}
|
||||
}
|
||||
|
||||
// perform the manual diff
|
||||
let mut cursor = manual_vectors.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let (left, _index) = try_split_array_at(key).unwrap();
|
||||
|
@ -376,23 +426,30 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
let vector_deladd_obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||
// convert the vector back to a Vec<f32>
|
||||
let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
let key = (docid, vector);
|
||||
if !vectors_set.remove(&key) {
|
||||
error!("Unable to delete the vector: {:?}", key.1);
|
||||
}
|
||||
let vector: Vec<OrderedFloat<_>> =
|
||||
pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
docid_vectors_map.entry(docid).and_modify(|v| {
|
||||
if !v.remove(&vector) {
|
||||
error!("Unable to delete the vector: {:?}", vector);
|
||||
}
|
||||
});
|
||||
}
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||
// convert the vector back to a Vec<f32>
|
||||
let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
vectors_set.insert((docid, vector));
|
||||
docid_vectors_map.entry(docid).and_modify(|v| {
|
||||
v.insert(vector);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Extract the most common vector dimension
|
||||
let expected_dimension_size = {
|
||||
let mut dims = HashMap::new();
|
||||
vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1);
|
||||
docid_vectors_map
|
||||
.values()
|
||||
.flat_map(|v| v.iter())
|
||||
.for_each(|v| *dims.entry(v.len()).or_insert(0) += 1);
|
||||
dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len)
|
||||
};
|
||||
|
||||
|
@ -400,7 +457,10 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||
// prepare the vectors before inserting them in the HNSW.
|
||||
let mut points = Vec::new();
|
||||
let mut docids = Vec::new();
|
||||
for (docid, vector) in vectors_set {
|
||||
for (docid, vector) in docid_vectors_map
|
||||
.into_iter()
|
||||
.flat_map(|(docid, vectors)| std::iter::repeat(docid).zip(vectors))
|
||||
{
|
||||
if expected_dimension_size.map_or(false, |expected| expected != vector.len()) {
|
||||
return Err(UserError::InvalidVectorDimensions {
|
||||
expected: expected_dimension_size.unwrap_or(vector.len()),
|
||||
|
|
|
@ -3,7 +3,7 @@ use std::result::Result as StdResult;
|
|||
|
||||
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
|
||||
use deserr::{DeserializeError, Deserr};
|
||||
use itertools::Itertools;
|
||||
use itertools::{EitherOrBoth, Itertools};
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
|
@ -15,6 +15,8 @@ use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS
|
|||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::index_documents::IndexDocumentsMethod;
|
||||
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
||||
use crate::vector::settings::{EmbeddingSettings, PromptSettings};
|
||||
use crate::vector::EmbeddingConfig;
|
||||
use crate::{FieldsIdsMap, Index, OrderBy, Result};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
|
||||
|
@ -73,6 +75,13 @@ impl<T> Setting<T> {
|
|||
otherwise => otherwise,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply(&mut self, new: Self) {
|
||||
if let Setting::NotSet = new {
|
||||
return;
|
||||
}
|
||||
*self = new;
|
||||
}
|
||||
}
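
`apply` gives embedder settings PATCH-like merge semantics: an incoming `NotSet` keeps the previous value, while `Set` and `Reset` overwrite it. A small hedged example:

let mut template = Setting::Set("a template".to_string());
template.apply(Setting::NotSet); // left untouched
assert!(matches!(template, Setting::Set(_)));
template.apply(Setting::Reset); // explicitly cleared
assert!(matches!(template, Setting::Reset));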
|
||||
|
||||
impl<T: Serialize> Serialize for Setting<T> {
|
||||
|
@ -129,6 +138,7 @@ pub struct Settings<'a, 't, 'i> {
|
|||
sort_facet_values_by: Setting<HashMap<String, OrderBy>>,
|
||||
pagination_max_total_hits: Setting<usize>,
|
||||
proximity_precision: Setting<ProximityPrecision>,
|
||||
embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>,
|
||||
}
|
||||
|
||||
impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
|
@ -161,6 +171,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
sort_facet_values_by: Setting::NotSet,
|
||||
pagination_max_total_hits: Setting::NotSet,
|
||||
proximity_precision: Setting::NotSet,
|
||||
embedder_settings: Setting::NotSet,
|
||||
indexer_config,
|
||||
}
|
||||
}
|
||||
|
@ -343,6 +354,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
self.proximity_precision = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_embedder_settings(&mut self, value: BTreeMap<String, Setting<EmbeddingSettings>>) {
|
||||
self.embedder_settings = Setting::Set(value);
|
||||
}
|
||||
|
||||
pub fn reset_embedder_settings(&mut self) {
|
||||
self.embedder_settings = Setting::Reset;
|
||||
}
|
||||
|
||||
fn reindex<FP, FA>(
|
||||
&mut self,
|
||||
progress_callback: &FP,
|
||||
|
@ -890,6 +909,60 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
Ok(changed)
|
||||
}
|
||||
|
||||
fn update_embedding_configs(&mut self) -> Result<bool> {
|
||||
let update = match std::mem::take(&mut self.embedder_settings) {
|
||||
Setting::Set(configs) => {
|
||||
let mut changed = false;
|
||||
let old_configs = self.index.embedding_configs(self.wtxn)?;
|
||||
let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> =
|
||||
old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect();
|
||||
|
||||
let mut new_configs = BTreeMap::new();
|
||||
for joined in old_configs
|
||||
.into_iter()
|
||||
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
|
||||
{
|
||||
match joined {
|
||||
EitherOrBoth::Both((name, mut old), (_, new)) => {
|
||||
old.apply(new);
|
||||
let new = validate_prompt(&name, old)?;
|
||||
changed = true;
|
||||
new_configs.insert(name, new);
|
||||
}
|
||||
EitherOrBoth::Left((name, setting)) => {
|
||||
new_configs.insert(name, setting);
|
||||
}
|
||||
EitherOrBoth::Right((name, setting)) => {
|
||||
let setting = validate_prompt(&name, setting)?;
|
||||
changed = true;
|
||||
new_configs.insert(name, setting);
|
||||
}
|
||||
}
|
||||
}
|
||||
let new_configs: Vec<(String, EmbeddingConfig)> = new_configs
|
||||
.into_iter()
|
||||
.filter_map(|(name, setting)| match setting {
|
||||
Setting::Set(value) => Some((name, value.into())),
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => Some((name, EmbeddingSettings::default().into())),
|
||||
})
|
||||
.collect();
|
||||
if new_configs.is_empty() {
|
||||
self.index.delete_embedding_configs(self.wtxn)?;
|
||||
} else {
|
||||
self.index.put_embedding_configs(self.wtxn, new_configs)?;
|
||||
}
|
||||
changed
|
||||
}
|
||||
Setting::Reset => {
|
||||
self.index.delete_embedding_configs(self.wtxn)?;
|
||||
true
|
||||
}
|
||||
Setting::NotSet => false,
|
||||
};
|
||||
Ok(update)
|
||||
}
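
Concretely, the merge-join treats each embedder name as an independent PATCH target; a hedged trace assuming two pre-existing embedders:

// old:      { "default": Set(cfg_a), "titles": Set(cfg_b) }
// incoming: { "default": Set(patch), "summaries": Set(cfg_c), "titles": Reset }
//
// Both("default")    -> cfg_a.apply(patch), prompt re-validated, kept
// Left(name)         -> entries only in `old` are carried over unchanged
// Right("summaries") -> prompt validated, inserted
// "titles"           -> becomes Reset after apply, so it is filtered out and deleted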
|
||||
|
||||
pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
|
||||
where
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
|
@ -927,6 +1000,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
let searchable_updated = self.update_searchable()?;
|
||||
let exact_attributes_updated = self.update_exact_attributes()?;
|
||||
let proximity_precision = self.update_proximity_precision()?;
|
||||
// TODO: very rough approximation of the needs for reindexing where any change will result in
|
||||
// a full reindexing.
|
||||
// What can be done instead:
|
||||
// 1. Only change the distance on a distance change
|
||||
// 2. Only change the name -> embedder mapping on a name change
|
||||
// 3. Keep the old vectors but reattempt indexing on a prompt change: only documents whose prompt actually changed need embedding + storage
|
||||
let embedding_configs_updated = self.update_embedding_configs()?;
|
||||
|
||||
if stop_words_updated
|
||||
|| non_separator_tokens_updated
|
||||
|
@ -937,6 +1017,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
|| searchable_updated
|
||||
|| exact_attributes_updated
|
||||
|| proximity_precision
|
||||
|| embedding_configs_updated
|
||||
{
|
||||
self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?;
|
||||
}
|
||||
|
@ -945,6 +1026,34 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
}
|
||||
}
|
||||
|
||||
fn validate_prompt(
|
||||
name: &str,
|
||||
new: Setting<EmbeddingSettings>,
|
||||
) -> Result<Setting<EmbeddingSettings>> {
|
||||
match new {
|
||||
Setting::Set(EmbeddingSettings {
|
||||
embedder_options,
|
||||
prompt:
|
||||
Setting::Set(PromptSettings { template: Setting::Set(template), strategy, fallback }),
|
||||
}) => {
|
||||
// validate
|
||||
let template = crate::prompt::Prompt::new(template, None, None)
|
||||
.map(|prompt| crate::prompt::PromptData::from(prompt).template)
|
||||
.map_err(|inner| UserError::InvalidPromptForEmbeddings(name.to_owned(), inner))?;
|
||||
|
||||
Ok(Setting::Set(EmbeddingSettings {
|
||||
embedder_options,
|
||||
prompt: Setting::Set(PromptSettings {
|
||||
template: Setting::Set(template),
|
||||
strategy,
|
||||
fallback,
|
||||
}),
|
||||
}))
|
||||
}
|
||||
new => Ok(new),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
|
@ -1763,6 +1872,7 @@ mod tests {
|
|||
sort_facet_values_by,
|
||||
pagination_max_total_hits,
|
||||
proximity_precision,
|
||||
embedder_settings,
|
||||
} = settings;
|
||||
assert!(matches!(searchable_fields, Setting::NotSet));
|
||||
assert!(matches!(displayed_fields, Setting::NotSet));
|
||||
|
@ -1785,6 +1895,7 @@ mod tests {
|
|||
assert!(matches!(sort_facet_values_by, Setting::NotSet));
|
||||
assert!(matches!(pagination_max_total_hits, Setting::NotSet));
|
||||
assert!(matches!(proximity_precision, Setting::NotSet));
|
||||
assert!(matches!(embedder_settings, Setting::NotSet));
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
|
229 milli/src/vector/error.rs (new file)
|
@ -0,0 +1,229 @@
|
|||
use std::path::PathBuf;
|
||||
|
||||
use hf_hub::api::sync::ApiError;
|
||||
|
||||
use crate::error::FaultSource;
|
||||
use crate::vector::openai::OpenAiError;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("Error while generating embeddings: {inner}")]
|
||||
pub struct Error {
|
||||
pub inner: Box<ErrorKind>,
|
||||
}
|
||||
|
||||
impl<I: Into<ErrorKind>> From<I> for Error {
|
||||
fn from(value: I) -> Self {
|
||||
Self { inner: Box::new(value.into()) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Error {
|
||||
pub fn fault(&self) -> FaultSource {
|
||||
match &*self.inner {
|
||||
ErrorKind::NewEmbedderError(inner) => inner.fault,
|
||||
ErrorKind::EmbedError(inner) => inner.fault,
|
||||
}
|
||||
}
|
||||
}
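
Thanks to the blanket `From<I: Into<ErrorKind>>` impl, both `NewEmbedderError` and `EmbedError` convert into `Error` through `?` or `.into()`. A hedged illustration using only constructors defined in this file (`fail_to_tokenize` itself is made up):

fn fail_to_tokenize() -> Result<(), Error> {
    let inner: Box<dyn std::error::Error + Send + Sync> = "bad input".into();
    // EmbedError -> ErrorKind -> Error through the blanket From impl above.
    Err(EmbedError::tokenize(inner).into())
}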
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ErrorKind {
|
||||
#[error(transparent)]
|
||||
NewEmbedderError(#[from] NewEmbedderError),
|
||||
#[error(transparent)]
|
||||
EmbedError(#[from] EmbedError),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("{fault}: {kind}")]
|
||||
pub struct EmbedError {
|
||||
pub kind: EmbedErrorKind,
|
||||
pub fault: FaultSource,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum EmbedErrorKind {
|
||||
#[error("could not tokenize: {0}")]
|
||||
Tokenize(Box<dyn std::error::Error + Send + Sync>),
|
||||
#[error("unexpected tensor shape: {0}")]
|
||||
TensorShape(candle_core::Error),
|
||||
#[error("unexpected tensor value: {0}")]
|
||||
TensorValue(candle_core::Error),
|
||||
#[error("could not run model: {0}")]
|
||||
ModelForward(candle_core::Error),
|
||||
#[error("could not reach OpenAI: {0}")]
|
||||
OpenAiNetwork(reqwest::Error),
|
||||
#[error("unexpected response from OpenAI: {0}")]
|
||||
OpenAiUnexpected(reqwest::Error),
|
||||
#[error("could not authenticate against OpenAI: {0}")]
|
||||
OpenAiAuth(OpenAiError),
|
||||
#[error("sent too many requests to OpenAI: {0}")]
|
||||
OpenAiTooManyRequests(OpenAiError),
|
||||
#[error("received internal error from OpenAI: {0}")]
|
||||
OpenAiInternalServerError(OpenAiError),
|
||||
#[error("sent too many tokens in a request to OpenAI: {0}")]
|
||||
OpenAiTooManyTokens(OpenAiError),
|
||||
#[error("received unhandled HTTP status code {0} from OpenAI")]
|
||||
OpenAiUnhandledStatusCode(u16),
|
||||
}
|
||||
|
||||
impl EmbedError {
|
||||
pub fn tokenize(inner: Box<dyn std::error::Error + Send + Sync>) -> Self {
|
||||
Self { kind: EmbedErrorKind::Tokenize(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub fn tensor_shape(inner: candle_core::Error) -> Self {
|
||||
Self { kind: EmbedErrorKind::TensorShape(inner), fault: FaultSource::Bug }
|
||||
}
|
||||
|
||||
pub fn tensor_value(inner: candle_core::Error) -> Self {
|
||||
Self { kind: EmbedErrorKind::TensorValue(inner), fault: FaultSource::Bug }
|
||||
}
|
||||
|
||||
pub fn model_forward(inner: candle_core::Error) -> Self {
|
||||
Self { kind: EmbedErrorKind::ModelForward(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub fn openai_network(inner: reqwest::Error) -> Self {
|
||||
Self { kind: EmbedErrorKind::OpenAiNetwork(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub fn openai_unexpected(inner: reqwest::Error) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiUnexpected(inner), fault: FaultSource::Bug }
|
||||
}
|
||||
|
||||
pub(crate) fn openai_auth_error(inner: OpenAiError) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiAuth(inner), fault: FaultSource::User }
|
||||
}
|
||||
|
||||
pub(crate) fn openai_too_many_requests(inner: OpenAiError) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiTooManyRequests(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub(crate) fn openai_internal_server_error(inner: OpenAiError) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiInternalServerError(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub(crate) fn openai_too_many_tokens(inner: OpenAiError) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiTooManyTokens(inner), fault: FaultSource::Bug }
|
||||
}
|
||||
|
||||
pub(crate) fn openai_unhandled_status_code(code: u16) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiUnhandledStatusCode(code), fault: FaultSource::Bug }
|
||||
}
|
||||
}
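
Each constructor pins the fault source alongside the error kind, so callers can tell user mistakes apart from runtime failures and bugs when reporting. For example (a hedged check based only on the constructors above):

let err = EmbedError::openai_unhandled_status_code(418);
assert!(matches!(err.kind, EmbedErrorKind::OpenAiUnhandledStatusCode(418)));
assert!(matches!(err.fault, FaultSource::Bug));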
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("{fault}: {kind}")]
|
||||
pub struct NewEmbedderError {
|
||||
pub kind: NewEmbedderErrorKind,
|
||||
pub fault: FaultSource,
|
||||
}
|
||||
|
||||
impl NewEmbedderError {
|
||||
pub fn open_config(config_filename: PathBuf, inner: std::io::Error) -> NewEmbedderError {
|
||||
let open_config = OpenConfig { filename: config_filename, inner };
|
||||
|
||||
Self { kind: NewEmbedderErrorKind::OpenConfig(open_config), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub fn deserialize_config(
|
||||
config: String,
|
||||
config_filename: PathBuf,
|
||||
inner: serde_json::Error,
|
||||
) -> NewEmbedderError {
|
||||
let deserialize_config = DeserializeConfig { config, filename: config_filename, inner };
|
||||
Self {
|
||||
kind: NewEmbedderErrorKind::DeserializeConfig(deserialize_config),
|
||||
fault: FaultSource::Runtime,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open_tokenizer(
|
||||
tokenizer_filename: PathBuf,
|
||||
inner: Box<dyn std::error::Error + Send + Sync>,
|
||||
) -> NewEmbedderError {
|
||||
let open_tokenizer = OpenTokenizer { filename: tokenizer_filename, inner };
|
||||
Self {
|
||||
kind: NewEmbedderErrorKind::OpenTokenizer(open_tokenizer),
|
||||
fault: FaultSource::Runtime,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_api_fail(inner: ApiError) -> Self {
|
||||
Self { kind: NewEmbedderErrorKind::NewApiFail(inner), fault: FaultSource::Bug }
|
||||
}
|
||||
|
||||
pub fn api_get(inner: ApiError) -> Self {
|
||||
Self { kind: NewEmbedderErrorKind::ApiGet(inner), fault: FaultSource::Undecided }
|
||||
}
|
||||
|
||||
pub fn pytorch_weight(inner: candle_core::Error) -> Self {
|
||||
Self { kind: NewEmbedderErrorKind::PytorchWeight(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub fn safetensor_weight(inner: candle_core::Error) -> Self {
|
||||
Self { kind: NewEmbedderErrorKind::SafetensorWeight(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
    pub fn load_model(inner: candle_core::Error) -> Self {
        Self { kind: NewEmbedderErrorKind::LoadModel(inner), fault: FaultSource::Runtime }
    }

    pub fn openai_initialize_web_client(inner: reqwest::Error) -> Self {
        Self { kind: NewEmbedderErrorKind::InitWebClient(inner), fault: FaultSource::Runtime }
    }

    pub fn openai_invalid_api_key_format(inner: reqwest::header::InvalidHeaderValue) -> Self {
        Self { kind: NewEmbedderErrorKind::InvalidApiKeyFormat(inner), fault: FaultSource::User }
    }
}

#[derive(Debug, thiserror::Error)]
#[error("could not open config at {filename:?}: {inner}")]
pub struct OpenConfig {
    pub filename: PathBuf,
    pub inner: std::io::Error,
}

#[derive(Debug, thiserror::Error)]
#[error("could not deserialize config at {filename}: {inner}. Config follows:\n{config}")]
|
||||
pub struct DeserializeConfig {
    pub config: String,
    pub filename: PathBuf,
    pub inner: serde_json::Error,
}

#[derive(Debug, thiserror::Error)]
#[error("could not open tokenizer at {filename}: {inner}")]
|
||||
pub struct OpenTokenizer {
    pub filename: PathBuf,
    #[source]
    pub inner: Box<dyn std::error::Error + Send + Sync>,
}

#[derive(Debug, thiserror::Error)]
pub enum NewEmbedderErrorKind {
    // hf
    #[error(transparent)]
    OpenConfig(OpenConfig),
    #[error(transparent)]
    DeserializeConfig(DeserializeConfig),
    #[error(transparent)]
    OpenTokenizer(OpenTokenizer),
    #[error("could not build weights from Pytorch weights: {0}")]
    PytorchWeight(candle_core::Error),
    #[error("could not build weights from Safetensor weights: {0}")]
    SafetensorWeight(candle_core::Error),
#[error("could not spawn HG_HUB API client: {0}")]
|
||||
NewApiFail(ApiError),
|
||||
#[error("fetching file from HG_HUB failed: {0}")]
|
||||
    ApiGet(ApiError),
    #[error("loading model failed: {0}")]
    LoadModel(candle_core::Error),
    // openai
    #[error("initializing web client for sending embedding requests failed: {0}")]
    InitWebClient(reqwest::Error),
#[error("The API key passed to Authorization error was in an invalid format: {0}")]
|
||||
    InvalidApiKeyFormat(reqwest::header::InvalidHeaderValue),
}
192
milli/src/vector/hf.rs
Normal file
@ -0,0 +1,192 @@
use candle_core::Tensor;
use candle_nn::VarBuilder;
use candle_transformers::models::bert::{BertModel, Config, DTYPE};
// FIXME: currently we use the hub to retrieve the model; in the future we might want to embed it into Meilisearch itself
use hf_hub::api::sync::Api;
use hf_hub::{Repo, RepoType};
use tokenizers::{PaddingParams, Tokenizer};

pub use super::error::{EmbedError, Error, NewEmbedderError};
use super::{Embedding, Embeddings};

#[derive(
    Debug,
    Clone,
    Copy,
    Default,
    Hash,
    PartialEq,
    Eq,
    serde::Deserialize,
    serde::Serialize,
    deserr::Deserr,
)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(rename_all = camelCase, deny_unknown_fields)]
pub enum WeightSource {
    #[default]
    Safetensors,
    Pytorch,
}

#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
    pub model: String,
    pub revision: Option<String>,
    pub weight_source: WeightSource,
    pub normalize_embeddings: bool,
}

impl EmbedderOptions {
    pub fn new() -> Self {
        Self {
            //model: "sentence-transformers/all-MiniLM-L6-v2".to_string(),
            model: "BAAI/bge-base-en-v1.5".to_string(),
            //revision: Some("refs/pr/21".to_string()),
            revision: None,
            //weight_source: Default::default(),
            weight_source: WeightSource::Pytorch,
            normalize_embeddings: true,
        }
    }
}

impl Default for EmbedderOptions {
    fn default() -> Self {
        Self::new()
    }
}

/// Perform embedding of documents and queries
pub struct Embedder {
    model: BertModel,
    tokenizer: Tokenizer,
    options: EmbedderOptions,
}

impl std::fmt::Debug for Embedder {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Embedder")
            .field("model", &self.options.model)
            .field("tokenizer", &self.tokenizer)
            .field("options", &self.options)
            .finish()
    }
}

impl Embedder {
    pub fn new(options: EmbedderOptions) -> std::result::Result<Self, NewEmbedderError> {
        let device = candle_core::Device::Cpu;
        let repo = match options.revision.clone() {
            Some(revision) => Repo::with_revision(options.model.clone(), RepoType::Model, revision),
            None => Repo::model(options.model.clone()),
        };
        let (config_filename, tokenizer_filename, weights_filename) = {
            let api = Api::new().map_err(NewEmbedderError::new_api_fail)?;
            let api = api.repo(repo);
            let config = api.get("config.json").map_err(NewEmbedderError::api_get)?;
            let tokenizer = api.get("tokenizer.json").map_err(NewEmbedderError::api_get)?;
            let weights = match options.weight_source {
                WeightSource::Pytorch => {
                    api.get("pytorch_model.bin").map_err(NewEmbedderError::api_get)?
                }
                WeightSource::Safetensors => {
                    api.get("model.safetensors").map_err(NewEmbedderError::api_get)?
                }
            };
            (config, tokenizer, weights)
        };

        let config = std::fs::read_to_string(&config_filename)
            .map_err(|inner| NewEmbedderError::open_config(config_filename.clone(), inner))?;
        let config: Config = serde_json::from_str(&config).map_err(|inner| {
            NewEmbedderError::deserialize_config(config, config_filename, inner)
        })?;
        let mut tokenizer = Tokenizer::from_file(&tokenizer_filename)
            .map_err(|inner| NewEmbedderError::open_tokenizer(tokenizer_filename, inner))?;

        let vb = match options.weight_source {
            WeightSource::Pytorch => VarBuilder::from_pth(&weights_filename, DTYPE, &device)
                .map_err(NewEmbedderError::pytorch_weight)?,
            WeightSource::Safetensors => unsafe {
                VarBuilder::from_mmaped_safetensors(&[weights_filename], DTYPE, &device)
                    .map_err(NewEmbedderError::safetensor_weight)?
            },
        };

        let model = BertModel::load(vb, &config).map_err(NewEmbedderError::load_model)?;

        if let Some(pp) = tokenizer.get_padding_mut() {
            pp.strategy = tokenizers::PaddingStrategy::BatchLongest
        } else {
            let pp = PaddingParams {
                strategy: tokenizers::PaddingStrategy::BatchLongest,
                ..Default::default()
            };
            tokenizer.with_padding(Some(pp));
        }

        Ok(Self { model, tokenizer, options })
    }

    pub async fn embed(
        &self,
        mut texts: Vec<String>,
    ) -> std::result::Result<Vec<Embeddings<f32>>, EmbedError> {
        let tokens = match texts.len() {
            1 => vec![self
                .tokenizer
                .encode(texts.pop().unwrap(), true)
                .map_err(EmbedError::tokenize)?],
            _ => self.tokenizer.encode_batch(texts, true).map_err(EmbedError::tokenize)?,
        };
        let token_ids = tokens
            .iter()
            .map(|tokens| {
                let tokens = tokens.get_ids().to_vec();
                Tensor::new(tokens.as_slice(), &self.model.device).map_err(EmbedError::tensor_shape)
            })
            .collect::<Result<Vec<_>, EmbedError>>()?;

        let token_ids = Tensor::stack(&token_ids, 0).map_err(EmbedError::tensor_shape)?;
        let token_type_ids = token_ids.zeros_like().map_err(EmbedError::tensor_shape)?;
        let embeddings =
            self.model.forward(&token_ids, &token_type_ids).map_err(EmbedError::model_forward)?;

        // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding)
        let (_n_sentence, n_tokens, _hidden_size) =
            embeddings.dims3().map_err(EmbedError::tensor_shape)?;

        let embeddings = (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64))
            .map_err(EmbedError::tensor_shape)?;

        let embeddings: Tensor = if self.options.normalize_embeddings {
            normalize_l2(&embeddings).map_err(EmbedError::tensor_value)?
        } else {
            embeddings
        };

        let embeddings: Vec<Embedding> = embeddings.to_vec2().map_err(EmbedError::tensor_shape)?;
        Ok(embeddings.into_iter().map(Embeddings::from_single_embedding).collect())
    }
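
    // In other words: for the `n_tokens` hidden states h_1..h_T of one prompt, the pooled
    // embedding is e = (1/T) * sum_t h_t, and with `normalize_embeddings` it becomes
    // e / ||e||_2. Toy check: pooling (1, 3) and (3, 1) gives (2, 2), which normalizes to
    // about (0.707, 0.707).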

    pub async fn embed_chunks(
        &self,
        text_chunks: Vec<Vec<String>>,
    ) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
        futures::future::try_join_all(text_chunks.into_iter().map(|prompts| self.embed(prompts)))
            .await
    }

    pub fn chunk_count_hint(&self) -> usize {
        1
    }

    pub fn prompt_count_in_chunk_hint(&self) -> usize {
        std::thread::available_parallelism().map(|x| x.get()).unwrap_or(8)
    }
}

fn normalize_l2(v: &Tensor) -> Result<Tensor, candle_core::Error> {
    v.broadcast_div(&v.sqr()?.sum_keepdim(1)?.sqrt()?)
}
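A minimal usage sketch for the embedder above (hypothetical caller, not part of the diff; assumes a tokio runtime and network access to the Hugging Face Hub):

async fn example() {
    // Downloads the model files on first use, then embeds each prompt into one fixed-size vector.
    let embedder = Embedder::new(EmbedderOptions {
        model: "BAAI/bge-base-en-v1.5".to_string(),
        revision: None,
        weight_source: WeightSource::Safetensors,
        normalize_embeddings: true,
    })
    .unwrap();
    let embeddings =
        embedder.embed(vec!["a red car".to_string(), "a blue bike".to_string()]).await.unwrap();
    for embedding in &embeddings {
        assert_eq!(embedding.as_inner().len(), embedding.dimension());
    }
}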
142
milli/src/vector/mod.rs
Normal file
@ -0,0 +1,142 @@
use self::error::{EmbedError, NewEmbedderError};
use crate::prompt::PromptData;

pub mod error;
pub mod hf;
pub mod openai;
pub mod settings;

pub use self::error::Error;

pub type Embedding = Vec<f32>;

pub struct Embeddings<F> {
    data: Vec<F>,
    dimension: usize,
}

impl<F> Embeddings<F> {
    pub fn new(dimension: usize) -> Self {
        Self { data: Default::default(), dimension }
    }

    pub fn from_single_embedding(embedding: Vec<F>) -> Self {
        Self { dimension: embedding.len(), data: embedding }
    }

    pub fn from_inner(data: Vec<F>, dimension: usize) -> Result<Self, Vec<F>> {
        let mut this = Self::new(dimension);
        this.append(data)?;
        Ok(this)
    }

    pub fn dimension(&self) -> usize {
        self.dimension
    }

    pub fn into_inner(self) -> Vec<F> {
        self.data
    }

    pub fn as_inner(&self) -> &[F] {
        &self.data
    }

    pub fn iter(&self) -> impl Iterator<Item = &'_ [F]> + '_ {
        self.data.as_slice().chunks_exact(self.dimension)
    }

    pub fn push(&mut self, mut embedding: Vec<F>) -> Result<(), Vec<F>> {
        if embedding.len() != self.dimension {
            return Err(embedding);
        }
        self.data.append(&mut embedding);
        Ok(())
    }

    pub fn append(&mut self, mut embeddings: Vec<F>) -> Result<(), Vec<F>> {
        if embeddings.len() % self.dimension != 0 {
            return Err(embeddings);
        }
        self.data.append(&mut embeddings);
        Ok(())
    }
}
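
// Illustration (hypothetical test, not part of the diff): `push` and `append` only accept
// data whose length matches the configured dimension; mismatched vectors are handed back
// to the caller instead of silently corrupting the flat buffer.
#[cfg(test)]
mod dimension_invariant_example {
    use super::Embeddings;

    #[test]
    fn push_rejects_wrong_dimension() {
        let mut embeddings = Embeddings::<f32>::new(3);
        assert!(embeddings.push(vec![0.0, 1.0, 2.0]).is_ok());
        assert!(embeddings.push(vec![0.0, 1.0]).is_err());
        assert_eq!(embeddings.iter().count(), 1);
    }
}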

#[derive(Debug)]
pub enum Embedder {
    HuggingFace(hf::Embedder),
    OpenAi(openai::Embedder),
}

#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
pub struct EmbeddingConfig {
    pub embedder_options: EmbedderOptions,
    pub prompt: PromptData,
    // TODO: add metrics and anything needed
}

#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub enum EmbedderOptions {
    HuggingFace(hf::EmbedderOptions),
    OpenAi(openai::EmbedderOptions),
}

impl Default for EmbedderOptions {
    fn default() -> Self {
        Self::HuggingFace(Default::default())
    }
}

impl EmbedderOptions {
    pub fn huggingface() -> Self {
        Self::HuggingFace(hf::EmbedderOptions::new())
    }

    pub fn openai(api_key: String) -> Self {
        Self::OpenAi(openai::EmbedderOptions::with_default_model(api_key))
    }
}

impl Embedder {
    pub fn new(options: EmbedderOptions) -> std::result::Result<Self, NewEmbedderError> {
        Ok(match options {
            EmbedderOptions::HuggingFace(options) => Self::HuggingFace(hf::Embedder::new(options)?),
            EmbedderOptions::OpenAi(options) => Self::OpenAi(openai::Embedder::new(options)?),
        })
    }

    pub async fn embed(
        &self,
        texts: Vec<String>,
    ) -> std::result::Result<Vec<Embeddings<f32>>, EmbedError> {
        match self {
            Embedder::HuggingFace(embedder) => embedder.embed(texts).await,
            Embedder::OpenAi(embedder) => embedder.embed(texts).await,
        }
    }

    pub async fn embed_chunks(
        &self,
        text_chunks: Vec<Vec<String>>,
    ) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
        match self {
            Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks).await,
            Embedder::OpenAi(embedder) => embedder.embed_chunks(text_chunks).await,
        }
    }

    pub fn chunk_count_hint(&self) -> usize {
        match self {
            Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
            Embedder::OpenAi(embedder) => embedder.chunk_count_hint(),
        }
    }

    pub fn prompt_count_in_chunk_hint(&self) -> usize {
        match self {
            Embedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(),
            Embedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(),
        }
    }
}
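A sketch of driving the dispatching `Embedder` (hypothetical caller, not part of the diff; the API key handling is purely illustrative):

async fn example(openai_api_key: String) {
    // Same entry point for both backends; only the options variant differs.
    let local = Embedder::new(EmbedderOptions::huggingface()).unwrap();
    let remote = Embedder::new(EmbedderOptions::openai(openai_api_key)).unwrap();

    let prompts = vec!["A prompt rendered from a document".to_string()];
    let _local_vectors = local.embed(prompts.clone()).await.unwrap();
    let _remote_vectors = remote.embed(prompts).await.unwrap();
}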
416
milli/src/vector/openai.rs
Normal file
@ -0,0 +1,416 @@
use std::fmt::Display;

use reqwest::StatusCode;
use serde::{Deserialize, Serialize};

use super::error::{EmbedError, NewEmbedderError};
use super::{Embedding, Embeddings};

#[derive(Debug)]
pub struct Embedder {
    client: reqwest::Client,
    tokenizer: tiktoken_rs::CoreBPE,
    options: EmbedderOptions,
}

#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
    pub api_key: String,
    pub embedding_model: EmbeddingModel,
}

#[derive(
    Debug,
    Clone,
    Copy,
    Default,
    Hash,
    PartialEq,
    Eq,
    serde::Serialize,
    serde::Deserialize,
    deserr::Deserr,
)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(rename_all = camelCase, deny_unknown_fields)]
pub enum EmbeddingModel {
    #[default]
    TextEmbeddingAda002,
}

impl EmbeddingModel {
    pub fn max_token(&self) -> usize {
        match self {
            EmbeddingModel::TextEmbeddingAda002 => 8191,
        }
    }

    pub fn dimensions(&self) -> usize {
        match self {
            EmbeddingModel::TextEmbeddingAda002 => 1536,
        }
    }

    pub fn name(&self) -> &'static str {
        match self {
            EmbeddingModel::TextEmbeddingAda002 => "text-embedding-ada-002",
        }
    }

    pub fn from_name(name: &'static str) -> Option<Self> {
        match name {
            "text-embedding-ada-002" => Some(EmbeddingModel::TextEmbeddingAda002),
            _ => None,
        }
    }
}

pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings";

impl EmbedderOptions {
    pub fn with_default_model(api_key: String) -> Self {
        Self { api_key, embedding_model: Default::default() }
    }

    pub fn with_embedding_model(api_key: String, embedding_model: EmbeddingModel) -> Self {
        Self { api_key, embedding_model }
    }
}

impl Embedder {
    pub fn new(options: EmbedderOptions) -> Result<Self, NewEmbedderError> {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::AUTHORIZATION,
            reqwest::header::HeaderValue::from_str(&format!("Bearer {}", &options.api_key))
                .map_err(NewEmbedderError::openai_invalid_api_key_format)?,
        );
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            reqwest::header::HeaderValue::from_static("application/json"),
        );
        let client = reqwest::ClientBuilder::new()
            .default_headers(headers)
            .build()
            .map_err(NewEmbedderError::openai_initialize_web_client)?;

        // looking at the code it is very unclear that this can actually fail.
        let tokenizer = tiktoken_rs::cl100k_base().unwrap();

        Ok(Self { options, client, tokenizer })
    }

    pub async fn embed(&self, texts: Vec<String>) -> Result<Vec<Embeddings<f32>>, EmbedError> {
        let mut tokenized = false;

        for attempt in 0..7 {
            let result = if tokenized {
                self.try_embed_tokenized(&texts).await
            } else {
                self.try_embed(&texts).await
            };

            let retry_duration = match result {
                Ok(embeddings) => return Ok(embeddings),
                Err(retry) => {
                    log::warn!("Failed: {}", retry.error);
                    tokenized |= retry.must_tokenize();
                    retry.into_duration(attempt)
                }
            }?;
            log::warn!("Attempt #{}, retrying after {}ms.", attempt, retry_duration.as_millis());
            tokio::time::sleep(retry_duration).await;
        }

        let result = if tokenized {
            self.try_embed_tokenized(&texts).await
        } else {
            self.try_embed(&texts).await
        };

        result.map_err(Retry::into_error)
    }

    async fn check_response(response: reqwest::Response) -> Result<reqwest::Response, Retry> {
        if !response.status().is_success() {
            match response.status() {
                StatusCode::UNAUTHORIZED => {
                    let error_response: OpenAiErrorResponse = response
                        .json()
                        .await
                        .map_err(EmbedError::openai_unexpected)
                        .map_err(Retry::retry_later)?;

                    return Err(Retry::give_up(EmbedError::openai_auth_error(
                        error_response.error,
                    )));
                }
                StatusCode::TOO_MANY_REQUESTS => {
                    let error_response: OpenAiErrorResponse = response
                        .json()
                        .await
                        .map_err(EmbedError::openai_unexpected)
                        .map_err(Retry::retry_later)?;

                    return Err(Retry::rate_limited(EmbedError::openai_too_many_requests(
                        error_response.error,
                    )));
                }
                StatusCode::INTERNAL_SERVER_ERROR => {
                    let error_response: OpenAiErrorResponse = response
                        .json()
                        .await
                        .map_err(EmbedError::openai_unexpected)
                        .map_err(Retry::retry_later)?;
                    return Err(Retry::retry_later(EmbedError::openai_internal_server_error(
                        error_response.error,
                    )));
                }
                StatusCode::SERVICE_UNAVAILABLE => {
                    let error_response: OpenAiErrorResponse = response
                        .json()
                        .await
                        .map_err(EmbedError::openai_unexpected)
                        .map_err(Retry::retry_later)?;
                    return Err(Retry::retry_later(EmbedError::openai_internal_server_error(
                        error_response.error,
                    )));
                }
                StatusCode::BAD_REQUEST => {
                    // Most probably, one text contained too many tokens
                    let error_response: OpenAiErrorResponse = response
                        .json()
                        .await
                        .map_err(EmbedError::openai_unexpected)
                        .map_err(Retry::retry_later)?;

                    log::warn!("OpenAI: input was too long, retrying on tokenized version. For best performance, limit the size of your prompt.");

                    return Err(Retry::retry_tokenized(EmbedError::openai_too_many_tokens(
                        error_response.error,
                    )));
                }
                code => {
                    return Err(Retry::give_up(EmbedError::openai_unhandled_status_code(
                        code.as_u16(),
                    )));
                }
            }
        }
        Ok(response)
    }

    async fn try_embed<S: AsRef<str> + serde::Serialize>(
        &self,
        texts: &[S],
    ) -> Result<Vec<Embeddings<f32>>, Retry> {
        for text in texts {
            log::trace!("Received prompt: {}", text.as_ref())
        }
        let request = OpenAiRequest { model: self.options.embedding_model.name(), input: texts };
        let response = self
            .client
            .post(OPENAI_EMBEDDINGS_URL)
            .json(&request)
            .send()
            .await
            .map_err(EmbedError::openai_network)
            .map_err(Retry::retry_later)?;

        let response = Self::check_response(response).await?;

        let response: OpenAiResponse = response
            .json()
            .await
            .map_err(EmbedError::openai_unexpected)
            .map_err(Retry::retry_later)?;

        log::trace!("response: {:?}", response.data);

        Ok(response
            .data
            .into_iter()
            .map(|data| Embeddings::from_single_embedding(data.embedding))
            .collect())
    }

    async fn try_embed_tokenized(&self, text: &[String]) -> Result<Vec<Embeddings<f32>>, Retry> {
        pub const OVERLAP_SIZE: usize = 200;
        let mut all_embeddings = Vec::with_capacity(text.len());
        for text in text {
            let max_token_count = self.options.embedding_model.max_token();
            let encoded = self.tokenizer.encode_ordinary(text.as_str());
            let len = encoded.len();
            if len < max_token_count {
                all_embeddings.append(&mut self.try_embed(&[text]).await?);
                continue;
            }

            let mut tokens = encoded.as_slice();
            let mut embeddings_for_prompt =
                Embeddings::new(self.options.embedding_model.dimensions());
            while tokens.len() > max_token_count {
                let window = &tokens[..max_token_count];
                embeddings_for_prompt.push(self.embed_tokens(window).await?).unwrap();

                tokens = &tokens[max_token_count - OVERLAP_SIZE..];
            }

            // end of text
            embeddings_for_prompt.push(self.embed_tokens(tokens).await?).unwrap();

            all_embeddings.push(embeddings_for_prompt);
        }
        Ok(all_embeddings)
    }
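
    // Worked example of the sliding window above (not in the original diff): with
    // max_token_count = 5 and OVERLAP_SIZE = 2, a 12-token prompt is embedded as the
    // windows [0..5], [3..8] and [6..11], and the remaining tail [9..12] is embedded by
    // the final `embed_tokens` call after the loop; each full window shares OVERLAP_SIZE
    // tokens with the previous one so no boundary context is lost.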

    async fn embed_tokens(&self, tokens: &[usize]) -> Result<Embedding, Retry> {
        for attempt in 0..9 {
            let duration = match self.try_embed_tokens(tokens).await {
                Ok(embedding) => return Ok(embedding),
                Err(retry) => retry.into_duration(attempt),
            }
            .map_err(Retry::retry_later)?;

            tokio::time::sleep(duration).await;
        }

        self.try_embed_tokens(tokens).await.map_err(|retry| Retry::give_up(retry.into_error()))
    }

    async fn try_embed_tokens(&self, tokens: &[usize]) -> Result<Embedding, Retry> {
        let request =
            OpenAiTokensRequest { model: self.options.embedding_model.name(), input: tokens };
        let response = self
            .client
            .post(OPENAI_EMBEDDINGS_URL)
            .json(&request)
            .send()
            .await
            .map_err(EmbedError::openai_network)
            .map_err(Retry::retry_later)?;

        let response = Self::check_response(response).await?;

        let mut response: OpenAiResponse = response
            .json()
            .await
            .map_err(EmbedError::openai_unexpected)
            .map_err(Retry::retry_later)?;
        Ok(response.data.pop().map(|data| data.embedding).unwrap_or_default())
    }

    pub async fn embed_chunks(
        &self,
        text_chunks: Vec<Vec<String>>,
    ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
        futures::future::try_join_all(text_chunks.into_iter().map(|prompts| self.embed(prompts)))
            .await
    }

    pub fn chunk_count_hint(&self) -> usize {
        10
    }

    pub fn prompt_count_in_chunk_hint(&self) -> usize {
        10
    }
}

// retrying in case of failure

struct Retry {
    error: EmbedError,
    strategy: RetryStrategy,
}

enum RetryStrategy {
    GiveUp,
    Retry,
    RetryTokenized,
    RetryAfterRateLimit,
}

impl Retry {
    fn give_up(error: EmbedError) -> Self {
        Self { error, strategy: RetryStrategy::GiveUp }
    }

    fn retry_later(error: EmbedError) -> Self {
        Self { error, strategy: RetryStrategy::Retry }
    }

    fn retry_tokenized(error: EmbedError) -> Self {
        Self { error, strategy: RetryStrategy::RetryTokenized }
    }

    fn rate_limited(error: EmbedError) -> Self {
        Self { error, strategy: RetryStrategy::RetryAfterRateLimit }
    }

    fn into_duration(self, attempt: u32) -> Result<tokio::time::Duration, EmbedError> {
        match self.strategy {
            RetryStrategy::GiveUp => Err(self.error),
            RetryStrategy::Retry => Ok(tokio::time::Duration::from_millis((10u64).pow(attempt))),
            RetryStrategy::RetryTokenized => Ok(tokio::time::Duration::from_millis(1)),
            RetryStrategy::RetryAfterRateLimit => {
                Ok(tokio::time::Duration::from_millis(100 + 10u64.pow(attempt)))
            }
        }
    }

    fn must_tokenize(&self) -> bool {
        matches!(self.strategy, RetryStrategy::RetryTokenized)
    }

    fn into_error(self) -> EmbedError {
        self.error
    }
}
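
// Worked example of the backoff above (not in the original diff): `RetryStrategy::Retry`
// sleeps 10^attempt ms, so the 7 attempts in `embed` wait roughly 1 ms, 10 ms, 100 ms,
// 1 s, 10 s, 100 s and ~17 min before the final, non-retried call;
// `RetryStrategy::RetryAfterRateLimit` adds a flat 100 ms on top of the same curve.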

// openai api structs

#[derive(Debug, Serialize)]
struct OpenAiRequest<'a, S: AsRef<str> + serde::Serialize> {
    model: &'a str,
    input: &'a [S],
}

#[derive(Debug, Serialize)]
struct OpenAiTokensRequest<'a> {
    model: &'a str,
    input: &'a [usize],
}

#[derive(Debug, Deserialize)]
struct OpenAiResponse {
    data: Vec<OpenAiEmbedding>,
}

#[derive(Debug, Deserialize)]
struct OpenAiErrorResponse {
    error: OpenAiError,
}

#[derive(Debug, Deserialize)]
pub struct OpenAiError {
    message: String,
    // type: String,
    code: Option<String>,
}

impl Display for OpenAiError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match &self.code {
            Some(code) => write!(f, "{} ({})", self.message, code),
            None => write!(f, "{}", self.message),
        }
    }
}

#[derive(Debug, Deserialize)]
struct OpenAiEmbedding {
    embedding: Embedding,
    // object: String,
    // index: usize,
}
308
milli/src/vector/settings.rs
Normal file
@ -0,0 +1,308 @@
use deserr::Deserr;
use serde::{Deserialize, Serialize};

use crate::prompt::{PromptData, PromptFallbackStrategy};
use crate::update::Setting;
use crate::vector::hf::WeightSource;
use crate::vector::EmbeddingConfig;

#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(rename_all = camelCase, deny_unknown_fields)]
pub struct EmbeddingSettings {
    #[serde(default, skip_serializing_if = "Setting::is_not_set", rename = "source")]
    #[deserr(default, rename = "source")]
    pub embedder_options: Setting<EmbedderSettings>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub prompt: Setting<PromptSettings>,
}

impl EmbeddingSettings {
    pub fn apply(&mut self, new: Self) {
        let EmbeddingSettings { embedder_options, prompt } = new;
        self.embedder_options.apply(embedder_options);
        self.prompt.apply(prompt);
    }
}

impl From<EmbeddingConfig> for EmbeddingSettings {
    fn from(value: EmbeddingConfig) -> Self {
        Self {
            embedder_options: Setting::Set(value.embedder_options.into()),
            prompt: Setting::Set(value.prompt.into()),
        }
    }
}

impl From<EmbeddingSettings> for EmbeddingConfig {
    fn from(value: EmbeddingSettings) -> Self {
        let mut this = Self::default();
        let EmbeddingSettings { embedder_options, prompt } = value;
        if let Some(embedder_options) = embedder_options.set() {
            this.embedder_options = embedder_options.into();
        }
        if let Some(prompt) = prompt.set() {
            this.prompt = prompt.into();
        }
        this
    }
}

#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(rename_all = camelCase, deny_unknown_fields)]
pub struct PromptSettings {
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub template: Setting<String>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub strategy: Setting<PromptFallbackStrategy>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub fallback: Setting<String>,
}

impl PromptSettings {
    pub fn apply(&mut self, new: Self) {
        let PromptSettings { template, strategy, fallback } = new;
        self.template.apply(template);
        self.strategy.apply(strategy);
        self.fallback.apply(fallback);
    }
}

impl From<PromptData> for PromptSettings {
    fn from(value: PromptData) -> Self {
        Self {
            template: Setting::Set(value.template),
            strategy: Setting::Set(value.strategy),
            fallback: Setting::Set(value.fallback),
        }
    }
}

impl From<PromptSettings> for PromptData {
    fn from(value: PromptSettings) -> Self {
        let mut this = PromptData::default();
        let PromptSettings { template, strategy, fallback } = value;
        if let Some(template) = template.set() {
            this.template = template;
        }
        if let Some(strategy) = strategy.set() {
            this.strategy = strategy;
        }
        if let Some(fallback) = fallback.set() {
            this.fallback = fallback;
        }
        this
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
pub enum EmbedderSettings {
    HuggingFace(Setting<HfEmbedderSettings>),
    OpenAi(Setting<OpenAiEmbedderSettings>),
}

impl<E> Deserr<E> for EmbedderSettings
where
    E: deserr::DeserializeError,
{
    fn deserialize_from_value<V: deserr::IntoValue>(
        value: deserr::Value<V>,
        location: deserr::ValuePointerRef,
    ) -> Result<Self, E> {
        match value {
            deserr::Value::Map(map) => {
                if deserr::Map::len(&map) != 1 {
                    return Err(deserr::take_cf_content(E::error::<V>(
                        None,
                        deserr::ErrorKind::Unexpected {
                            msg: format!(
                                "Expected a single field, got {} fields",
                                deserr::Map::len(&map)
                            ),
                        },
                        location,
                    )));
                }
                let mut it = deserr::Map::into_iter(map);
                let (k, v) = it.next().unwrap();

                match k.as_str() {
                    "huggingFace" => Ok(EmbedderSettings::HuggingFace(Setting::Set(
                        HfEmbedderSettings::deserialize_from_value(
                            v.into_value(),
                            location.push_key(&k),
                        )?,
                    ))),
                    "openAi" => Ok(EmbedderSettings::OpenAi(Setting::Set(
                        OpenAiEmbedderSettings::deserialize_from_value(
                            v.into_value(),
                            location.push_key(&k),
                        )?,
                    ))),
                    other => Err(deserr::take_cf_content(E::error::<V>(
                        None,
                        deserr::ErrorKind::UnknownKey {
                            key: other,
                            accepted: &["huggingFace", "openAi"],
                        },
                        location,
                    ))),
                }
            }
            _ => Err(deserr::take_cf_content(E::error::<V>(
                None,
                deserr::ErrorKind::IncorrectValueKind {
                    actual: value,
                    accepted: &[deserr::ValueKind::Map],
                },
                location,
            ))),
        }
    }
}
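
// Illustration (hypothetical values, not part of the diff): the map accepted above has
// exactly one key naming the backend, so a settings payload looks like either
//   { "source": { "huggingFace": { "model": "...", "revision": "..." } }, "prompt": { ... } }
// or
//   { "source": { "openAi": { "apiKey": "...", "embeddingModel": "textEmbeddingAda002" } }, "prompt": { ... } }
// where "source" is the serde/deserr rename of `embedder_options` in `EmbeddingSettings`.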

impl Default for EmbedderSettings {
    fn default() -> Self {
        Self::HuggingFace(Default::default())
    }
}

impl From<crate::vector::EmbedderOptions> for EmbedderSettings {
    fn from(value: crate::vector::EmbedderOptions) -> Self {
        match value {
            crate::vector::EmbedderOptions::HuggingFace(hf) => {
                Self::HuggingFace(Setting::Set(hf.into()))
            }
            crate::vector::EmbedderOptions::OpenAi(openai) => {
                Self::OpenAi(Setting::Set(openai.into()))
            }
        }
    }
}

impl From<EmbedderSettings> for crate::vector::EmbedderOptions {
    fn from(value: EmbedderSettings) -> Self {
        match value {
            EmbedderSettings::HuggingFace(Setting::Set(hf)) => Self::HuggingFace(hf.into()),
            EmbedderSettings::HuggingFace(_setting) => Self::HuggingFace(Default::default()),
            EmbedderSettings::OpenAi(Setting::Set(ai)) => Self::OpenAi(ai.into()),
            EmbedderSettings::OpenAi(_setting) => Self::OpenAi(
                crate::vector::openai::EmbedderOptions::with_default_model(infer_api_key()),
            ),
        }
    }
}

#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(rename_all = camelCase, deny_unknown_fields)]
pub struct HfEmbedderSettings {
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub model: Setting<String>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub revision: Setting<String>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub weight_source: Setting<WeightSource>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub normalize_embeddings: Setting<bool>,
}

impl HfEmbedderSettings {
    pub fn apply(&mut self, new: Self) {
        let HfEmbedderSettings {
            model,
            revision,
            weight_source,
            normalize_embeddings: normalize_embedding,
        } = new;
        self.model.apply(model);
        self.revision.apply(revision);
        self.weight_source.apply(weight_source);
        self.normalize_embeddings.apply(normalize_embedding);
    }
}

impl From<crate::vector::hf::EmbedderOptions> for HfEmbedderSettings {
    fn from(value: crate::vector::hf::EmbedderOptions) -> Self {
        Self {
            model: Setting::Set(value.model),
            revision: value.revision.map(Setting::Set).unwrap_or(Setting::NotSet),
            weight_source: Setting::Set(value.weight_source),
            normalize_embeddings: Setting::Set(value.normalize_embeddings),
        }
    }
}

impl From<HfEmbedderSettings> for crate::vector::hf::EmbedderOptions {
    fn from(value: HfEmbedderSettings) -> Self {
        let HfEmbedderSettings { model, revision, weight_source, normalize_embeddings } = value;
        let mut this = Self::default();
        if let Some(model) = model.set() {
            this.model = model;
        }
        if let Some(revision) = revision.set() {
            this.revision = Some(revision);
        }
        if let Some(weight_source) = weight_source.set() {
            this.weight_source = weight_source;
        }
        if let Some(normalize_embeddings) = normalize_embeddings.set() {
            this.normalize_embeddings = normalize_embeddings;
        }
        this
    }
}

#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(rename_all = camelCase, deny_unknown_fields)]
pub struct OpenAiEmbedderSettings {
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub api_key: Setting<String>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    pub embedding_model: Setting<crate::vector::openai::EmbeddingModel>,
}

impl OpenAiEmbedderSettings {
    pub fn apply(&mut self, new: Self) {
        let Self { api_key, embedding_model: embedding_mode } = new;
        self.api_key.apply(api_key);
        self.embedding_model.apply(embedding_mode);
    }
}

impl From<crate::vector::openai::EmbedderOptions> for OpenAiEmbedderSettings {
    fn from(value: crate::vector::openai::EmbedderOptions) -> Self {
        Self {
            api_key: Setting::Set(value.api_key),
            embedding_model: Setting::Set(value.embedding_model),
        }
    }
}

impl From<OpenAiEmbedderSettings> for crate::vector::openai::EmbedderOptions {
    fn from(value: OpenAiEmbedderSettings) -> Self {
        let OpenAiEmbedderSettings { api_key, embedding_model } = value;
        Self {
            api_key: api_key.set().unwrap_or_else(infer_api_key),
            embedding_model: embedding_model.set().unwrap_or_default(),
        }
    }
}

fn infer_api_key() -> String {
    // FIXME: get key from instance options?
    std::env::var("MEILI_OPENAI_API_KEY").unwrap_or_default()
}