mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Merge pull request #5659 from meilisearch/tmp-release-v1.15.1
Bring back v1.15.0 and v1.15.1 changes
This commit is contained in:
commit
c3368e6859
201 changed files with 4015 additions and 1188 deletions
|
@ -1,9 +1,6 @@
|
|||
use std::mem;
|
||||
|
||||
use heed::Database;
|
||||
use heed::DatabaseStat;
|
||||
use heed::RoTxn;
|
||||
use heed::Unspecified;
|
||||
use heed::{Database, DatabaseStat, RoTxn, Unspecified};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::BEU32;
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
use heed::{
|
||||
types::{SerdeJson, Str},
|
||||
RoTxn, RwTxn,
|
||||
};
|
||||
use heed::types::{SerdeJson, Str};
|
||||
use heed::{RoTxn, RwTxn};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{index::main_key, Index};
|
||||
use crate::index::main_key;
|
||||
use crate::Index;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
|
@ -33,13 +32,6 @@ impl Index {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn delete_disabled_typos_terms(&self, txn: &mut RwTxn<'_>) -> heed::Result<()> {
|
||||
self.main
|
||||
.remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
|
||||
.delete(txn, main_key::DISABLED_TYPOS_TERMS)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl DisabledTyposTerms {
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::convert::Infallible;
|
||||
use std::fmt::Write;
|
||||
use std::{io, str};
|
||||
|
@ -387,6 +386,8 @@ and can not be more than 511 bytes.", .document_id.to_string()
|
|||
DocumentEditionRuntimeError(Box<EvalAltResult>),
|
||||
#[error("Document edition runtime error encountered while compiling the function: {0}")]
|
||||
DocumentEditionCompilationError(rhai::ParseError),
|
||||
#[error("`.chat.documentTemplateMaxBytes`: `documentTemplateMaxBytes` cannot be zero")]
|
||||
InvalidChatSettingsDocumentTemplateMaxBytes,
|
||||
#[error("{0}")]
|
||||
DocumentEmbeddingError(String),
|
||||
}
|
||||
|
|
|
@ -32,13 +32,13 @@ impl ExternalDocumentsIds {
|
|||
&self,
|
||||
rtxn: &RoTxn<'_>,
|
||||
external_id: A,
|
||||
) -> heed::Result<Option<u32>> {
|
||||
) -> heed::Result<Option<DocumentId>> {
|
||||
self.0.get(rtxn, external_id.as_ref())
|
||||
}
|
||||
|
||||
/// A helper function to debug this type, returns a `HashMap` of both,
|
||||
/// soft and hard fst maps, combined.
|
||||
pub fn to_hash_map(&self, rtxn: &RoTxn<'_>) -> heed::Result<HashMap<String, u32>> {
|
||||
pub fn to_hash_map(&self, rtxn: &RoTxn<'_>) -> heed::Result<HashMap<String, DocumentId>> {
|
||||
let mut map = HashMap::default();
|
||||
for result in self.0.iter(rtxn)? {
|
||||
let (external, internal) = result?;
|
||||
|
|
|
@ -7,6 +7,7 @@ use crate::FieldId;
|
|||
mod global;
|
||||
pub mod metadata;
|
||||
pub use global::GlobalFieldsIdsMap;
|
||||
pub use metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FieldsIdsMap {
|
||||
|
|
|
@ -1,13 +1,12 @@
|
|||
use std::collections::{BTreeSet, HashSet};
|
||||
|
||||
use deserr::{DeserializeError, Deserr, ValuePointerRef};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use utoipa::ToSchema;
|
||||
|
||||
use crate::{
|
||||
attribute_patterns::{match_distinct_field, match_field_legacy, PatternMatch},
|
||||
constants::RESERVED_GEO_FIELD_NAME,
|
||||
AttributePatterns,
|
||||
};
|
||||
use crate::attribute_patterns::{match_distinct_field, match_field_legacy, PatternMatch};
|
||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
||||
use crate::AttributePatterns;
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, ToSchema)]
|
||||
#[serde(untagged)]
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
|
||||
use deserr::Deserr;
|
||||
use heed::types::*;
|
||||
use heed::{CompactionOption, Database, DatabaseStat, RoTxn, RwTxn, Unspecified, WithoutTls};
|
||||
use indexmap::IndexMap;
|
||||
use roaring::RoaringBitmap;
|
||||
use rstar::RTree;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utoipa::ToSchema;
|
||||
|
||||
use crate::constants::{self, RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::database_stats::DatabaseStats;
|
||||
|
@ -23,7 +27,9 @@ use crate::heed_codec::facet::{
|
|||
use crate::heed_codec::version::VersionCodec;
|
||||
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
|
||||
use crate::order_by_map::OrderByMap;
|
||||
use crate::prompt::PromptData;
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::new::StdResult;
|
||||
use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
|
||||
use crate::{
|
||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||
|
@ -79,6 +85,7 @@ pub mod main_key {
|
|||
pub const PREFIX_SEARCH: &str = "prefix_search";
|
||||
pub const DOCUMENTS_STATS: &str = "documents_stats";
|
||||
pub const DISABLED_TYPOS_TERMS: &str = "disabled_typos_terms";
|
||||
pub const CHAT: &str = "chat";
|
||||
}
|
||||
|
||||
pub mod db_name {
|
||||
|
@ -1691,6 +1698,25 @@ impl Index {
|
|||
self.main.remap_key_type::<Str>().delete(txn, main_key::FACET_SEARCH)
|
||||
}
|
||||
|
||||
pub fn chat_config(&self, txn: &RoTxn<'_>) -> heed::Result<ChatConfig> {
|
||||
self.main
|
||||
.remap_types::<Str, SerdeJson<_>>()
|
||||
.get(txn, main_key::CHAT)
|
||||
.map(|o| o.unwrap_or_default())
|
||||
}
|
||||
|
||||
pub(crate) fn put_chat_config(
|
||||
&self,
|
||||
txn: &mut RwTxn<'_>,
|
||||
val: &ChatConfig,
|
||||
) -> heed::Result<()> {
|
||||
self.main.remap_types::<Str, SerdeJson<_>>().put(txn, main_key::CHAT, &val)
|
||||
}
|
||||
|
||||
pub(crate) fn delete_chat_config(&self, txn: &mut RwTxn<'_>) -> heed::Result<bool> {
|
||||
self.main.remap_key_type::<Str>().delete(txn, main_key::CHAT)
|
||||
}
|
||||
|
||||
pub fn localized_attributes_rules(
|
||||
&self,
|
||||
rtxn: &RoTxn<'_>,
|
||||
|
@ -1917,13 +1943,99 @@ pub struct IndexEmbeddingConfig {
|
|||
pub user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
/// Chat configuration of an index.
///
/// `Index::chat_config` and `Index::put_chat_config` read and write this value as JSON
/// under the `main_key::CHAT` key; when nothing is stored, the `Default` value is used.
#[derive(Debug, Default, Deserialize, Serialize)]
|
||||
pub struct ChatConfig {
|
||||
// NOTE(review): presumably a human-readable description of the index — confirm with callers.
pub description: String,
|
||||
/// Contains the document template and max template length.
|
||||
pub prompt: PromptData,
|
||||
// Optional search parameters kept alongside the prompt (see `SearchParameters`).
pub search_parameters: SearchParameters,
|
||||
}
|
||||
|
||||
/// Optional search parameters stored in a [`ChatConfig`].
///
/// Every field is optional; fields that are `None` are omitted when serializing,
/// and field names are serialized in camelCase.
#[derive(Debug, Default, Deserialize, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct SearchParameters {
|
||||
// Hybrid (semantic ratio + embedder) settings, if any.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub hybrid: Option<HybridQuery>,
|
||||
// NOTE(review): presumably the maximum number of returned documents — confirm with the search code.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub limit: Option<usize>,
|
||||
// NOTE(review): presumably sort expressions as accepted by the search API — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub sort: Option<Vec<String>>,
|
||||
// NOTE(review): presumably the name of the distinct attribute — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub distinct: Option<String>,
|
||||
// Strategy used to match query words (see `MatchingStrategy`).
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub matching_strategy: Option<MatchingStrategy>,
|
||||
// NOTE(review): presumably restricts the search to these attributes — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub attributes_to_search_on: Option<Vec<String>>,
|
||||
// Validated threshold wrapper (see `RankingScoreThreshold`).
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub ranking_score_threshold: Option<RankingScoreThreshold>,
|
||||
}
|
||||
|
||||
/// Newtype around an `f64` ranking score threshold.
///
/// Values built through `TryFrom<f64>` lie within `0.0..=1.0`; the `deserr`
/// deserialization is routed through that same conversion.
// NOTE(review): the derived serde `Deserialize` does not appear to go through
// the range validation — confirm whether that is intended.
#[derive(Debug, Clone, Copy, Default, Deserialize, Serialize, PartialEq, Deserr, ToSchema)]
|
||||
#[deserr(try_from(f64) = TryFrom::try_from -> InvalidSettingsRankingScoreThreshold)]
|
||||
pub struct RankingScoreThreshold(f64);
|
||||
|
||||
impl RankingScoreThreshold {
|
||||
pub fn as_f64(&self) -> f64 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<f64> for RankingScoreThreshold {
|
||||
type Error = InvalidSettingsRankingScoreThreshold;
|
||||
|
||||
fn try_from(value: f64) -> StdResult<Self, Self::Error> {
|
||||
if !(0.0..=1.0).contains(&value) {
|
||||
Err(InvalidSettingsRankingScoreThreshold)
|
||||
} else {
|
||||
Ok(RankingScoreThreshold(value))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error produced when a `rankingScoreThreshold` value is rejected.
#[derive(Debug)]
pub struct InvalidSettingsRankingScoreThreshold;

impl Error for InvalidSettingsRankingScoreThreshold {}

impl fmt::Display for InvalidSettingsRankingScoreThreshold {
    /// Writes the fixed, user-facing error message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        const MESSAGE: &str =
            "the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`.";
        f.write_str(MESSAGE)
    }
}
|
||||
|
||||
/// Hybrid search parameters stored in `SearchParameters::hybrid`.
///
/// Field names are serialized in camelCase.
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct HybridQuery {
|
||||
// NOTE(review): presumably the weight given to semantic results, in `0.0..=1.0` — confirm.
pub semantic_ratio: f32,
|
||||
// NOTE(review): presumably the name of the embedder to use — confirm.
pub embedder: String,
|
||||
}
|
||||
|
||||
/// Prefix-related settings; field names are serialized in camelCase.
#[derive(Debug, Deserialize, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct PrefixSettings {
|
||||
// NOTE(review): semantics not visible here; presumably a minimum word count per prefix — confirm.
pub prefix_count_threshold: usize,
|
||||
// NOTE(review): presumably the longest prefix length that is considered — confirm.
pub max_prefix_length: usize,
|
||||
// Prefix search mode (see `PrefixSearch`).
pub compute_prefixes: PrefixSearch,
|
||||
}
|
||||
|
||||
/// This is unfortunately a duplication of the struct in <meilisearch/src/search/mod.rs>.
|
||||
/// It is duplicated because milli cannot depend on meilisearch: that would create a cyclic dependency.
|
||||
#[derive(Default, Debug, Copy, Clone, PartialEq, Eq, Deserr, ToSchema, Serialize, Deserialize)]
|
||||
#[deserr(rename_all = camelCase)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum MatchingStrategy {
|
||||
/// Remove query words from last to first
|
||||
#[default]
|
||||
Last,
|
||||
/// All query words are mandatory
|
||||
All,
|
||||
/// Remove query words from the most frequent to the least
|
||||
Frequency,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum PrefixSearch {
|
||||
|
|
|
@ -52,18 +52,19 @@ pub use search::new::{
|
|||
};
|
||||
use serde_json::Value;
|
||||
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
pub use {charabia as tokenizer, heed, rhai};
|
||||
pub use {arroy, charabia as tokenizer, heed, rhai};
|
||||
|
||||
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
|
||||
pub use self::attribute_patterns::AttributePatterns;
|
||||
pub use self::attribute_patterns::PatternMatch;
|
||||
pub use self::attribute_patterns::{AttributePatterns, PatternMatch};
|
||||
pub use self::criterion::{default_criteria, Criterion, CriterionError};
|
||||
pub use self::error::{
|
||||
Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
|
||||
};
|
||||
pub use self::external_documents_ids::ExternalDocumentsIds;
|
||||
pub use self::fieldids_weights_map::FieldidsWeightsMap;
|
||||
pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
|
||||
pub use self::fields_ids_map::{
|
||||
FieldIdMapWithMetadata, FieldsIdsMap, GlobalFieldsIdsMap, MetadataBuilder,
|
||||
};
|
||||
pub use self::filterable_attributes_rules::{
|
||||
FilterFeatures, FilterableAttributesFeatures, FilterableAttributesPatterns,
|
||||
FilterableAttributesRule,
|
||||
|
@ -84,8 +85,6 @@ pub use self::search::{
|
|||
};
|
||||
pub use self::update::ChannelCongestion;
|
||||
|
||||
pub use arroy;
|
||||
|
||||
pub type Result<T> = std::result::Result<T, error::Error>;
|
||||
|
||||
pub type Attribute = u32;
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
use enum_iterator::Sequence;
|
||||
use std::any::TypeId;
|
||||
use std::borrow::Cow;
|
||||
use std::marker::PhantomData;
|
||||
|
@ -6,6 +5,7 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
|||
use std::sync::{Arc, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use enum_iterator::Sequence;
|
||||
use indexmap::IndexMap;
|
||||
use itertools::Itertools;
|
||||
use serde::Serialize;
|
||||
|
|
|
@ -18,6 +18,7 @@ impl NewPromptError {
|
|||
Self { kind: NewPromptErrorKind::CannotParseTemplate(inner), fault: FaultSource::User }
|
||||
}
|
||||
|
||||
#[allow(unused)] // See <https://github.com/meilisearch/meilisearch/pull/5593> for explanation
|
||||
pub(crate) fn invalid_fields_in_template(inner: liquid::Error) -> NewPromptError {
|
||||
Self { kind: NewPromptErrorKind::InvalidFieldsInTemplate(inner), fault: FaultSource::User }
|
||||
}
|
||||
|
@ -27,6 +28,7 @@ impl NewPromptError {
|
|||
pub enum NewPromptErrorKind {
|
||||
#[error("cannot parse template: {0}")]
|
||||
CannotParseTemplate(liquid::Error),
|
||||
#[allow(unused)] // See <https://github.com/meilisearch/meilisearch/pull/5593> for explanation
|
||||
#[error("template contains invalid fields: {0}. Only `doc.*`, `fields[i].name`, `fields[i].value` are supported")]
|
||||
InvalidFieldsInTemplate(liquid::Error),
|
||||
}
|
||||
|
|
|
@ -2,7 +2,6 @@ mod context;
|
|||
mod document;
|
||||
pub(crate) mod error;
|
||||
mod fields;
|
||||
mod template_checker;
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::convert::TryFrom;
|
||||
|
@ -65,7 +64,7 @@ fn default_template() -> liquid::Template {
|
|||
new_template(default_template_text()).unwrap()
|
||||
}
|
||||
|
||||
fn default_template_text() -> &'static str {
|
||||
pub fn default_template_text() -> &'static str {
|
||||
"{% for field in fields %}\
|
||||
{% if field.is_searchable and field.value != nil %}\
|
||||
{{ field.name }}: {{ field.value }}\n\
|
||||
|
@ -105,11 +104,6 @@ impl Prompt {
|
|||
max_bytes,
|
||||
};
|
||||
|
||||
// render template with special object that's OK with `doc.*` and `fields.*`
|
||||
this.template
|
||||
.render(&template_checker::TemplateChecker)
|
||||
.map_err(NewPromptError::invalid_fields_in_template)?;
|
||||
|
||||
Ok(this)
|
||||
}
|
||||
|
||||
|
@ -206,6 +200,7 @@ mod test {
|
|||
}
|
||||
|
||||
#[test]
|
||||
#[ignore] // See <https://github.com/meilisearch/meilisearch/pull/5593> for explanation
|
||||
fn template_missing_doc() {
|
||||
assert!(matches!(
|
||||
Prompt::new("{{title}}: {{overview}}".into(), None),
|
||||
|
@ -236,6 +231,7 @@ mod test {
|
|||
}
|
||||
|
||||
#[test]
|
||||
#[ignore] // See <https://github.com/meilisearch/meilisearch/pull/5593> for explanation
|
||||
fn template_fields_invalid() {
|
||||
assert!(matches!(
|
||||
// intentionally garbled field
|
||||
|
|
|
@ -1,301 +0,0 @@
|
|||
use liquid::model::{
|
||||
ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
|
||||
};
|
||||
use liquid::{Object, ObjectView, ValueView};
|
||||
|
||||
/// Root placeholder object handed to a template render: exposes exactly the
/// `doc` and `fields` keys.
#[derive(Debug)]
|
||||
pub struct TemplateChecker;
|
||||
|
||||
/// Placeholder for `doc`: reports every key as present and returns itself for any key.
#[derive(Debug)]
|
||||
pub struct DummyDoc;
|
||||
|
||||
/// Placeholder for the `fields` array: every index resolves to a `DummyField`.
#[derive(Debug)]
|
||||
pub struct DummyFields;
|
||||
|
||||
/// Placeholder for one `fields` entry: exposes only the `name` and `value` keys.
#[derive(Debug)]
|
||||
pub struct DummyField;
|
||||
|
||||
// Nil value that the placeholder types delegate to for rendering and for leaf lookups.
const DUMMY_VALUE: &LiquidValue = &LiquidValue::Nil;
|
||||
|
||||
impl ObjectView for DummyField {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
2
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
Box::new(["name", "value"].iter().map(|s| KStringCow::from_static(s)))
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(vec![DUMMY_VALUE.as_view(), DUMMY_VALUE.as_view()].into_iter())
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(self.keys().zip(self.values()))
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: &str) -> bool {
|
||||
index == "name" || index == "value"
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
||||
if self.contains_key(index) {
|
||||
Some(DUMMY_VALUE.as_view())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueView for DummyField {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.render()
|
||||
}
|
||||
|
||||
fn source(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.source()
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: State) -> bool {
|
||||
match state {
|
||||
State::Truthy => true,
|
||||
State::DefaultValue => false,
|
||||
State::Empty => false,
|
||||
State::Blank => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> KStringCow<'_> {
|
||||
DUMMY_VALUE.to_kstr()
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
let mut this = Object::new();
|
||||
this.insert("name".into(), LiquidValue::Nil);
|
||||
this.insert("value".into(), LiquidValue::Nil);
|
||||
LiquidValue::Object(this)
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueView for DummyFields {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.render()
|
||||
}
|
||||
|
||||
fn source(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.source()
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"array"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: State) -> bool {
|
||||
match state {
|
||||
State::Truthy => true,
|
||||
State::DefaultValue => false,
|
||||
State::Empty => false,
|
||||
State::Blank => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> KStringCow<'_> {
|
||||
DUMMY_VALUE.to_kstr()
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Array(vec![DummyField.to_value()])
|
||||
}
|
||||
|
||||
fn as_array(&self) -> Option<&dyn ArrayView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl ArrayView for DummyFields {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
u16::MAX as i64
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(std::iter::once(DummyField.as_value()))
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: i64) -> bool {
|
||||
index < self.size()
|
||||
}
|
||||
|
||||
fn get(&self, _index: i64) -> Option<&dyn ValueView> {
|
||||
Some(DummyField.as_value())
|
||||
}
|
||||
}
|
||||
|
||||
impl ObjectView for DummyDoc {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
1000
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(std::iter::empty())
|
||||
}
|
||||
|
||||
fn contains_key(&self, _index: &str) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, _index: &str) -> Option<&'s dyn ValueView> {
|
||||
// Recursively sends itself
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueView for DummyDoc {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.render()
|
||||
}
|
||||
|
||||
fn source(&self) -> DisplayCow<'_> {
|
||||
DUMMY_VALUE.source()
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: State) -> bool {
|
||||
match state {
|
||||
State::Truthy => true,
|
||||
State::DefaultValue => false,
|
||||
State::Empty => false,
|
||||
State::Blank => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> KStringCow<'_> {
|
||||
DUMMY_VALUE.to_kstr()
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Nil
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl ObjectView for TemplateChecker {
|
||||
fn as_value(&self) -> &dyn ValueView {
|
||||
self
|
||||
}
|
||||
|
||||
fn size(&self) -> i64 {
|
||||
2
|
||||
}
|
||||
|
||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||
Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
|
||||
}
|
||||
|
||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||
Box::new(
|
||||
std::iter::once(DummyDoc.as_value()).chain(std::iter::once(DummyFields.as_value())),
|
||||
)
|
||||
}
|
||||
|
||||
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
|
||||
Box::new(self.keys().zip(self.values()))
|
||||
}
|
||||
|
||||
fn contains_key(&self, index: &str) -> bool {
|
||||
index == "doc" || index == "fields"
|
||||
}
|
||||
|
||||
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
||||
match index {
|
||||
"doc" => Some(DummyDoc.as_value()),
|
||||
"fields" => Some(DummyFields.as_value()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueView for TemplateChecker {
|
||||
fn as_debug(&self) -> &dyn std::fmt::Debug {
|
||||
self
|
||||
}
|
||||
|
||||
fn render(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
|
||||
}
|
||||
|
||||
fn source(&self) -> liquid::model::DisplayCow<'_> {
|
||||
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
|
||||
}
|
||||
|
||||
fn type_name(&self) -> &'static str {
|
||||
"object"
|
||||
}
|
||||
|
||||
fn query_state(&self, state: liquid::model::State) -> bool {
|
||||
match state {
|
||||
State::Truthy => true,
|
||||
State::DefaultValue | State::Empty | State::Blank => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
|
||||
let s = ObjectRender::new(self).to_string();
|
||||
KStringCow::from_string(s)
|
||||
}
|
||||
|
||||
fn to_value(&self) -> LiquidValue {
|
||||
LiquidValue::Object(
|
||||
self.iter().map(|(k, x)| (k.to_string().into(), x.to_value())).collect(),
|
||||
)
|
||||
}
|
||||
|
||||
fn as_object(&self) -> Option<&dyn ObjectView> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
|
@ -10,6 +10,7 @@ pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FAC
|
|||
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
|
||||
use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats};
|
||||
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
|
||||
use crate::index::MatchingStrategy;
|
||||
use crate::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use crate::vector::Embedder;
|
||||
use crate::{
|
||||
|
@ -364,6 +365,16 @@ impl Default for TermsMatchingStrategy {
|
|||
}
|
||||
}
|
||||
|
||||
impl From<MatchingStrategy> for TermsMatchingStrategy {
|
||||
fn from(other: MatchingStrategy) -> Self {
|
||||
match other {
|
||||
MatchingStrategy::Last => Self::Last,
|
||||
MatchingStrategy::All => Self::All,
|
||||
MatchingStrategy::Frequency => Self::Frequency,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_first(s: &str) -> &str {
|
||||
match s.chars().next() {
|
||||
Some(c) => &s[..c.len_utf8()],
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
use std::collections::VecDeque;
|
||||
|
||||
use heed::types::{Bytes, Unit};
|
||||
use heed::{RoPrefix, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
use rstar::RTree;
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use super::facet_string_values;
|
||||
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use fst::automaton::Str;
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
|
@ -16,12 +15,6 @@ use crate::search::new::{limits, SearchContext};
|
|||
use crate::search::{build_dfa, get_first};
|
||||
use crate::{Result, MAX_WORD_LENGTH};
|
||||
|
||||
/// Number of typos attributed to a derived word; passed to the `visit`
/// callback of `find_one_two_typo_derivations`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum NumberOfTypos {
|
||||
// The derived word is one typo away from the original word.
One,
|
||||
// The derived word is two typos away from the original word.
Two,
|
||||
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
||||
let s = ctx.term_interner.get_mut(self);
|
||||
|
@ -45,7 +38,7 @@ impl Interned<QueryTerm> {
|
|||
fn find_zero_typo_prefix_derivations(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
word_interned: Interned<String>,
|
||||
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
||||
prefix_of: &mut BTreeSet<Interned<String>>,
|
||||
) -> Result<()> {
|
||||
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||
let word = word.as_str();
|
||||
|
@ -65,8 +58,8 @@ fn find_zero_typo_prefix_derivations(
|
|||
let derived_word = derived_word.to_string();
|
||||
let derived_word_interned = ctx.word_interner.insert(derived_word);
|
||||
if derived_word_interned != word_interned {
|
||||
let cf = visit(derived_word_interned)?;
|
||||
if cf.is_break() {
|
||||
prefix_of.insert(derived_word_interned);
|
||||
if prefix_of.len() >= limits::MAX_PREFIX_COUNT {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -81,7 +74,7 @@ fn find_one_typo_derivations(
|
|||
ctx: &mut SearchContext<'_>,
|
||||
word_interned: Interned<String>,
|
||||
is_prefix: bool,
|
||||
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
||||
one_typo_words: &mut BTreeSet<Interned<String>>,
|
||||
) -> Result<()> {
|
||||
let fst = ctx.get_words_fst()?;
|
||||
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||
|
@ -98,8 +91,8 @@ fn find_one_typo_derivations(
|
|||
1 => {
|
||||
let derived_word = std::str::from_utf8(derived_word)?;
|
||||
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
||||
let cf = visit(derived_word)?;
|
||||
if cf.is_break() {
|
||||
one_typo_words.insert(derived_word);
|
||||
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -116,7 +109,8 @@ fn find_one_two_typo_derivations(
|
|||
is_prefix: bool,
|
||||
fst: fst::Set<Cow<'_, [u8]>>,
|
||||
word_interner: &mut DedupInterner<String>,
|
||||
mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
|
||||
one_typo_words: &mut BTreeSet<Interned<String>>,
|
||||
two_typo_words: &mut BTreeSet<Interned<String>>,
|
||||
) -> Result<()> {
|
||||
let word = word_interner.get(word_interned).to_owned();
|
||||
let word = word.as_str();
|
||||
|
@ -130,15 +124,20 @@ fn find_one_two_typo_derivations(
|
|||
let mut stream = fst.search_with_state(automaton).into_stream();
|
||||
|
||||
while let Some((derived_word, state)) = stream.next() {
|
||||
let finished_one_typo_words = one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT;
|
||||
let finished_two_typo_words = two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT;
|
||||
if finished_one_typo_words && finished_two_typo_words {
|
||||
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
||||
break;
|
||||
}
|
||||
let derived_word = std::str::from_utf8(derived_word)?;
|
||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||
// No need to intern here
|
||||
// in the case the typo is on the first letter, we know the number of typo
|
||||
// is two
|
||||
if get_first(derived_word) != get_first(word) {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
if get_first(derived_word) != get_first(word) && !finished_two_typo_words {
|
||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||
two_typo_words.insert(derived_word_interned);
|
||||
continue;
|
||||
} else {
|
||||
// Else, we know that it is the second dfa that matched and compute the
|
||||
// correct distance
|
||||
|
@ -146,16 +145,18 @@ fn find_one_two_typo_derivations(
|
|||
match d.to_u8() {
|
||||
0 => (),
|
||||
1 => {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
if finished_one_typo_words {
|
||||
continue;
|
||||
}
|
||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||
one_typo_words.insert(derived_word_interned);
|
||||
}
|
||||
2 => {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
if finished_two_typo_words {
|
||||
continue;
|
||||
}
|
||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||
two_typo_words.insert(derived_word_interned);
|
||||
}
|
||||
_ => unreachable!("2 typos DFA produced a distance greater than 2"),
|
||||
}
|
||||
|
@ -211,14 +212,7 @@ pub fn partially_initialized_term_from_word(
|
|||
}
|
||||
|
||||
if is_prefix && use_prefix_db.is_none() {
|
||||
find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| {
|
||||
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
|
||||
prefix_of.insert(derived_word);
|
||||
Ok(ControlFlow::Continue(()))
|
||||
} else {
|
||||
Ok(ControlFlow::Break(()))
|
||||
}
|
||||
})?;
|
||||
find_zero_typo_prefix_derivations(ctx, word_interned, &mut prefix_of)?;
|
||||
}
|
||||
let synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||
let mut synonym_word_count = 0;
|
||||
|
@ -281,14 +275,7 @@ impl Interned<QueryTerm> {
|
|||
let mut one_typo_words = BTreeSet::new();
|
||||
|
||||
if *max_nbr_typos > 0 {
|
||||
find_one_typo_derivations(ctx, original, is_prefix, |derived_word| {
|
||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||
one_typo_words.insert(derived_word);
|
||||
Ok(ControlFlow::Continue(()))
|
||||
} else {
|
||||
Ok(ControlFlow::Break(()))
|
||||
}
|
||||
})?;
|
||||
find_one_typo_derivations(ctx, original, is_prefix, &mut one_typo_words)?;
|
||||
}
|
||||
|
||||
let split_words = if allows_split_words {
|
||||
|
@ -343,27 +330,8 @@ impl Interned<QueryTerm> {
|
|||
*is_prefix,
|
||||
ctx.index.words_fst(ctx.txn)?,
|
||||
&mut ctx.word_interner,
|
||||
|derived_word, nbr_typos| {
|
||||
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
||||
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
||||
{
|
||||
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
match nbr_typos {
|
||||
NumberOfTypos::One => {
|
||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||
one_typo_words.insert(derived_word);
|
||||
}
|
||||
}
|
||||
NumberOfTypos::Two => {
|
||||
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
|
||||
two_typo_words.insert(derived_word);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ControlFlow::Continue(()))
|
||||
},
|
||||
&mut one_typo_words,
|
||||
&mut two_typo_words,
|
||||
)?;
|
||||
}
|
||||
|
||||
|
|
182
crates/milli/src/update/chat.rs
Normal file
182
crates/milli/src/update/chat.rs
Normal file
|
@ -0,0 +1,182 @@
|
|||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use deserr::errors::JsonError;
|
||||
use deserr::Deserr;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utoipa::ToSchema;
|
||||
|
||||
use crate::index::{self, ChatConfig, MatchingStrategy, RankingScoreThreshold, SearchParameters};
|
||||
use crate::prompt::{default_max_bytes, PromptData};
|
||||
use crate::update::Setting;
|
||||
|
||||
/// Chat settings of an index as exchanged in camelCase JSON.
///
/// Every field is a `Setting`; fields that are not set are skipped when
/// serializing, and unknown fields are rejected when deserializing.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Deserr, ToSchema)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(error = JsonError, deny_unknown_fields, rename_all = camelCase)]
|
||||
pub struct ChatSettings {
|
||||
// Maps to `ChatConfig::description`.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<String>)]
|
||||
pub description: Setting<String>,
|
||||
|
||||
/// A liquid template used to render documents to a text that can be embedded.
|
||||
///
|
||||
/// Meilisearch interpolates the template for each document and sends the resulting text to the embedder.
|
||||
/// The embedder then generates document vectors based on this text.
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<String>)]
|
||||
pub document_template: Setting<String>,
|
||||
|
||||
/// Rendered texts are truncated to this size. Defaults to 400.
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<usize>)]
|
||||
pub document_template_max_bytes: Setting<usize>,
|
||||
|
||||
/// The search parameters to use for the LLM.
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<ChatSearchParams>)]
|
||||
pub search_parameters: Setting<ChatSearchParams>,
|
||||
}
|
||||
|
||||
impl From<ChatConfig> for ChatSettings {
|
||||
fn from(config: ChatConfig) -> Self {
|
||||
let ChatConfig {
|
||||
description,
|
||||
prompt: PromptData { template, max_bytes },
|
||||
search_parameters,
|
||||
} = config;
|
||||
ChatSettings {
|
||||
description: Setting::Set(description),
|
||||
document_template: Setting::Set(template),
|
||||
document_template_max_bytes: Setting::Set(
|
||||
max_bytes.unwrap_or(default_max_bytes()).get(),
|
||||
),
|
||||
search_parameters: Setting::Set({
|
||||
let SearchParameters {
|
||||
hybrid,
|
||||
limit,
|
||||
sort,
|
||||
distinct,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
ranking_score_threshold,
|
||||
} = search_parameters;
|
||||
|
||||
let hybrid = hybrid.map(|index::HybridQuery { semantic_ratio, embedder }| {
|
||||
HybridQuery { semantic_ratio: SemanticRatio(semantic_ratio), embedder }
|
||||
});
|
||||
|
||||
ChatSearchParams {
|
||||
hybrid: Setting::some_or_not_set(hybrid),
|
||||
limit: Setting::some_or_not_set(limit),
|
||||
sort: Setting::some_or_not_set(sort),
|
||||
distinct: Setting::some_or_not_set(distinct),
|
||||
matching_strategy: Setting::some_or_not_set(matching_strategy),
|
||||
attributes_to_search_on: Setting::some_or_not_set(attributes_to_search_on),
|
||||
ranking_score_threshold: Setting::some_or_not_set(ranking_score_threshold),
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Search parameters carried by `ChatSettings::search_parameters`.
///
/// Every field is wrapped in a `Setting`; fields for which `Setting::is_not_set` holds are
/// skipped when serializing.
/// NOTE(review): fields presumably mirror the search request parameters of the same name — confirm.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Deserr, ToSchema)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(error = JsonError, deny_unknown_fields, rename_all = camelCase)]
|
||||
pub struct ChatSearchParams {
|
||||
/// Hybrid search configuration (semantic ratio and embedder name).
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<HybridQuery>)]
|
||||
pub hybrid: Setting<HybridQuery>,
|
||||
|
||||
/// Result limit. The deserr default for a missing field is `Setting::Set(20)`, whereas
/// serde falls back to the type's `Default`.
/// NOTE(review): the two deserializers use different missing-field defaults — confirm intent.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default = Setting::Set(20))]
|
||||
#[schema(value_type = Option<usize>)]
|
||||
pub limit: Setting<usize>,
|
||||
|
||||
/// Sort expressions, as a list of strings.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<Vec<String>>)]
|
||||
pub sort: Setting<Vec<String>>,
|
||||
|
||||
/// Distinct attribute, as a string.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<String>)]
|
||||
pub distinct: Setting<String>,
|
||||
|
||||
/// Matching strategy to apply.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<MatchingStrategy>)]
|
||||
pub matching_strategy: Setting<MatchingStrategy>,
|
||||
|
||||
/// Attributes to search on, as a list of strings.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<Vec<String>>)]
|
||||
pub attributes_to_search_on: Setting<Vec<String>>,
|
||||
|
||||
/// Ranking score threshold to apply.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<RankingScoreThreshold>)]
|
||||
pub ranking_score_threshold: Setting<RankingScoreThreshold>,
|
||||
}
|
||||
|
||||
/// Hybrid search parameters as they appear in the chat settings.
///
/// Field names are camelCased on the wire; deserr additionally rejects unknown fields.
#[derive(Debug, Clone, Default, Deserr, ToSchema, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
#[deserr(error = JsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct HybridQuery {
|
||||
/// Semantic ratio; falls back to `SemanticRatio::default()` when the field is missing.
#[deserr(default)]
|
||||
#[serde(default)]
|
||||
#[schema(default, value_type = f32)]
|
||||
pub semantic_ratio: SemanticRatio,
|
||||
/// Name of the embedder. NOTE(review): presumably must match a configured embedder — confirm.
#[schema(value_type = String)]
|
||||
pub embedder: String,
|
||||
}
|
||||
|
||||
/// Newtype around the `f32` semantic ratio of a hybrid query.
///
/// Deserr deserialization goes through `TryFrom<f32>`, which reports
/// `InvalidSearchSemanticRatio` for values above `1.0` or below `0.0`.
/// NOTE(review): the derived serde `Deserialize` carries no `try_from` attribute, so it does
/// not appear to apply that validation — confirm.
#[derive(Debug, Clone, Copy, Deserr, ToSchema, PartialEq, Serialize, Deserialize)]
|
||||
#[deserr(try_from(f32) = TryFrom::try_from -> InvalidSearchSemanticRatio)]
|
||||
pub struct SemanticRatio(f32);
|
||||
|
||||
impl Default for SemanticRatio {
|
||||
fn default() -> Self {
|
||||
SemanticRatio(0.5)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::convert::TryFrom<f32> for SemanticRatio {
|
||||
type Error = InvalidSearchSemanticRatio;
|
||||
|
||||
fn try_from(f: f32) -> Result<Self, Self::Error> {
|
||||
// the suggested "fix" is: `!(0.0..=1.0).contains(&f)`` which is allegedly less readable
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if f > 1.0 || f < 0.0 {
|
||||
Err(InvalidSearchSemanticRatio)
|
||||
} else {
|
||||
Ok(SemanticRatio(f))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error reported when an `f32` cannot be converted into a `SemanticRatio`.
#[derive(Debug)]
pub struct InvalidSearchSemanticRatio;

impl Error for InvalidSearchSemanticRatio {}

impl fmt::Display for InvalidSearchSemanticRatio {
    /// Writes the fixed, user-facing message describing the accepted range.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(
            "the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
        )
    }
}
|
||||
|
||||
impl std::ops::Deref for SemanticRatio {
|
||||
type Target = f32;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
|
@ -1562,12 +1562,12 @@ mod tests {
|
|||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
// Only the first document should match.
|
||||
let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
|
||||
let count = index.word_docids.get(&rtxn, "huàzhuāng").unwrap().unwrap().len();
|
||||
assert_eq!(count, 1);
|
||||
|
||||
// Only the second document should match.
|
||||
let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len();
|
||||
assert_eq!(count, 1);
|
||||
assert_eq!(count, 2);
|
||||
|
||||
let mut search = crate::Search::new(&rtxn, &index);
|
||||
search.query("化妆包");
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
pub use self::available_ids::AvailableIds;
|
||||
pub use self::chat::ChatSettings;
|
||||
pub use self::clear_documents::ClearDocuments;
|
||||
pub use self::concurrent_available_ids::ConcurrentAvailableIds;
|
||||
pub use self::facet::bulk::FacetsUpdateBulk;
|
||||
|
@ -13,6 +14,7 @@ pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids;
|
|||
pub use self::words_prefixes_fst::WordsPrefixesFst;
|
||||
|
||||
mod available_ids;
|
||||
mod chat;
|
||||
mod clear_documents;
|
||||
mod concurrent_available_ids;
|
||||
pub(crate) mod del_add;
|
||||
|
|
|
@ -4,6 +4,7 @@ use serde_json::Value;
|
|||
|
||||
use crate::attribute_patterns::PatternMatch;
|
||||
use crate::fields_ids_map::metadata::Metadata;
|
||||
use crate::filterable_attributes_rules::match_faceted_field;
|
||||
use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::geo::extract_geo_coordinates;
|
||||
use crate::update::new::extract::perm_json_p;
|
||||
|
@ -11,8 +12,6 @@ use crate::{
|
|||
FieldId, FilterableAttributesRule, GlobalFieldsIdsMap, InternalError, Result, UserError,
|
||||
};
|
||||
|
||||
use crate::filterable_attributes_rules::match_faceted_field;
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn extract_document_facets<'doc>(
|
||||
document: impl Document<'doc>,
|
||||
|
|
|
@ -18,7 +18,8 @@ pub use vectors::EmbeddingExtractor;
|
|||
pub mod perm_json_p {
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use crate::{attribute_patterns::PatternMatch, Result};
|
||||
use crate::attribute_patterns::PatternMatch;
|
||||
use crate::Result;
|
||||
const SPLIT_SYMBOL: char = '.';
|
||||
|
||||
/// Returns `true` if the `selector` match the `key`.
|
||||
|
|
|
@ -131,7 +131,12 @@ fn compute_word_fst(
|
|||
}
|
||||
}
|
||||
|
||||
pub fn recompute_word_fst_from_word_docids_database(index: &Index, wtxn: &mut RwTxn) -> Result<()> {
|
||||
pub fn recompute_word_fst_from_word_docids_database(
|
||||
index: &Index,
|
||||
wtxn: &mut RwTxn,
|
||||
progress: &Progress,
|
||||
) -> Result<()> {
|
||||
progress.update_progress(PostProcessingWords::WordFst);
|
||||
let fst = fst::Set::default().map_data(std::borrow::Cow::Owned)?;
|
||||
let mut word_fst_builder = WordFstBuilder::new(&fst)?;
|
||||
let words = index.word_docids.iter(wtxn)?.remap_data_type::<DecodeIgnore>();
|
||||
|
|
|
@ -11,22 +11,23 @@ use roaring::RoaringBitmap;
|
|||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use super::chat::ChatSearchParams;
|
||||
use super::del_add::{DelAdd, DelAddOperation};
|
||||
use super::index_documents::{IndexDocumentsConfig, Transform};
|
||||
use super::IndexerConfig;
|
||||
use super::{ChatSettings, IndexerConfig};
|
||||
use crate::attribute_patterns::PatternMatch;
|
||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::disabled_typos_terms::DisabledTyposTerms;
|
||||
use crate::error::UserError;
|
||||
use crate::error::UserError::{self, InvalidChatSettingsDocumentTemplateMaxBytes};
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::filterable_attributes_rules::match_faceted_field;
|
||||
use crate::index::{
|
||||
IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO,
|
||||
DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
|
||||
ChatConfig, IndexEmbeddingConfig, PrefixSearch, SearchParameters,
|
||||
DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
|
||||
};
|
||||
use crate::order_by_map::OrderByMap;
|
||||
use crate::prompt::default_max_bytes;
|
||||
use crate::prompt::{default_max_bytes, default_template_text, PromptData};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::index_documents::IndexDocumentsMethod;
|
||||
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
||||
|
@ -185,6 +186,7 @@ pub struct Settings<'a, 't, 'i> {
|
|||
localized_attributes_rules: Setting<Vec<LocalizedAttributesRule>>,
|
||||
prefix_search: Setting<PrefixSearch>,
|
||||
facet_search: Setting<bool>,
|
||||
chat: Setting<ChatSettings>,
|
||||
}
|
||||
|
||||
impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
|
@ -223,6 +225,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
localized_attributes_rules: Setting::NotSet,
|
||||
prefix_search: Setting::NotSet,
|
||||
facet_search: Setting::NotSet,
|
||||
chat: Setting::NotSet,
|
||||
indexer_config,
|
||||
}
|
||||
}
|
||||
|
@ -453,6 +456,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
self.facet_search = Setting::Reset;
|
||||
}
|
||||
|
||||
/// Records `value` as the requested chat settings by storing `Setting::Set(value)` in
/// `self.chat`; nothing is written to the index by this call.
pub fn set_chat(&mut self, value: ChatSettings) {
|
||||
self.chat = Setting::Set(value);
|
||||
}
|
||||
|
||||
/// Requests a reset of the chat settings by storing `Setting::Reset` in `self.chat`;
/// nothing is written to the index by this call.
pub fn reset_chat(&mut self) {
|
||||
self.chat = Setting::Reset;
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace"
|
||||
skip(self, progress_callback, should_abort, settings_diff),
|
||||
|
@ -884,7 +895,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
disabled_typos_terms.disable_on_numbers = disable_on_numbers;
|
||||
}
|
||||
Setting::Reset => {
|
||||
self.index.delete_disabled_typos_terms(self.wtxn)?;
|
||||
disabled_typos_terms.disable_on_numbers =
|
||||
DisabledTyposTerms::default().disable_on_numbers;
|
||||
}
|
||||
|
@ -1239,6 +1249,112 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn update_chat_config(&mut self) -> Result<bool> {
|
||||
match &mut self.chat {
|
||||
Setting::Set(ChatSettings {
|
||||
description: new_description,
|
||||
document_template: new_document_template,
|
||||
document_template_max_bytes: new_document_template_max_bytes,
|
||||
search_parameters: new_search_parameters,
|
||||
}) => {
|
||||
let ChatConfig { description, prompt, search_parameters } =
|
||||
self.index.chat_config(self.wtxn)?;
|
||||
|
||||
let description = match new_description {
|
||||
Setting::Set(new) => new.clone(),
|
||||
Setting::Reset => Default::default(),
|
||||
Setting::NotSet => description,
|
||||
};
|
||||
|
||||
let prompt = PromptData {
|
||||
template: match new_document_template {
|
||||
Setting::Set(new) => new.clone(),
|
||||
Setting::Reset => default_template_text().to_string(),
|
||||
Setting::NotSet => prompt.template.clone(),
|
||||
},
|
||||
max_bytes: match new_document_template_max_bytes {
|
||||
Setting::Set(m) => Some(
|
||||
NonZeroUsize::new(*m)
|
||||
.ok_or(InvalidChatSettingsDocumentTemplateMaxBytes)?,
|
||||
),
|
||||
Setting::Reset => Some(default_max_bytes()),
|
||||
Setting::NotSet => prompt.max_bytes,
|
||||
},
|
||||
};
|
||||
|
||||
let search_parameters = match new_search_parameters {
|
||||
Setting::Set(sp) => {
|
||||
let ChatSearchParams {
|
||||
hybrid,
|
||||
limit,
|
||||
sort,
|
||||
distinct,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
ranking_score_threshold,
|
||||
} = sp;
|
||||
|
||||
SearchParameters {
|
||||
hybrid: match hybrid {
|
||||
Setting::Set(hybrid) => Some(crate::index::HybridQuery {
|
||||
semantic_ratio: *hybrid.semantic_ratio,
|
||||
embedder: hybrid.embedder.clone(),
|
||||
}),
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => search_parameters.hybrid.clone(),
|
||||
},
|
||||
limit: match limit {
|
||||
Setting::Set(limit) => Some(*limit),
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => search_parameters.limit,
|
||||
},
|
||||
sort: match sort {
|
||||
Setting::Set(sort) => Some(sort.clone()),
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => search_parameters.sort.clone(),
|
||||
},
|
||||
distinct: match distinct {
|
||||
Setting::Set(distinct) => Some(distinct.clone()),
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => search_parameters.distinct.clone(),
|
||||
},
|
||||
matching_strategy: match matching_strategy {
|
||||
Setting::Set(matching_strategy) => Some(*matching_strategy),
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => search_parameters.matching_strategy,
|
||||
},
|
||||
attributes_to_search_on: match attributes_to_search_on {
|
||||
Setting::Set(attributes_to_search_on) => {
|
||||
Some(attributes_to_search_on.clone())
|
||||
}
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => {
|
||||
search_parameters.attributes_to_search_on.clone()
|
||||
}
|
||||
},
|
||||
ranking_score_threshold: match ranking_score_threshold {
|
||||
Setting::Set(rst) => Some(*rst),
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => search_parameters.ranking_score_threshold,
|
||||
},
|
||||
}
|
||||
}
|
||||
Setting::Reset => Default::default(),
|
||||
Setting::NotSet => search_parameters,
|
||||
};
|
||||
|
||||
self.index.put_chat_config(
|
||||
self.wtxn,
|
||||
&ChatConfig { description, prompt, search_parameters },
|
||||
)?;
|
||||
|
||||
Ok(true)
|
||||
}
|
||||
Setting::Reset => self.index.delete_chat_config(self.wtxn).map_err(Into::into),
|
||||
Setting::NotSet => Ok(false),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
|
||||
where
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
|
@ -1276,6 +1392,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||
self.update_facet_search()?;
|
||||
self.update_localized_attributes_rules()?;
|
||||
self.update_disabled_typos_terms()?;
|
||||
self.update_chat_config()?;
|
||||
|
||||
let embedding_config_updates = self.update_embedding_configs()?;
|
||||
|
||||
|
|
|
@ -897,6 +897,7 @@ fn test_correct_settings_init() {
|
|||
prefix_search,
|
||||
facet_search,
|
||||
disable_on_numbers,
|
||||
chat,
|
||||
} = settings;
|
||||
assert!(matches!(searchable_fields, Setting::NotSet));
|
||||
assert!(matches!(displayed_fields, Setting::NotSet));
|
||||
|
@ -925,6 +926,7 @@ fn test_correct_settings_init() {
|
|||
assert!(matches!(prefix_search, Setting::NotSet));
|
||||
assert!(matches!(facet_search, Setting::NotSet));
|
||||
assert!(matches!(disable_on_numbers, Setting::NotSet));
|
||||
assert!(matches!(chat, Setting::NotSet));
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ use heed::RwTxn;
|
|||
use super::UpgradeIndex;
|
||||
use crate::progress::Progress;
|
||||
use crate::update::new::indexer::recompute_word_fst_from_word_docids_database;
|
||||
use crate::{make_enum_progress, Index, Result};
|
||||
use crate::{Index, Result};
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
pub(super) struct Latest_V1_14_To_Latest_V1_15();
|
||||
|
@ -17,14 +17,7 @@ impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 {
|
|||
progress: Progress,
|
||||
) -> Result<bool> {
|
||||
// Recompute the word FST from the word docids database.
|
||||
make_enum_progress! {
|
||||
enum TypoTolerance {
|
||||
RecomputeWordFst,
|
||||
}
|
||||
};
|
||||
|
||||
progress.update_progress(TypoTolerance::RecomputeWordFst);
|
||||
recompute_word_fst_from_word_docids_database(index, wtxn)?;
|
||||
recompute_word_fst_from_word_docids_database(index, wtxn, &progress)?;
|
||||
|
||||
Ok(false)
|
||||
}
|
||||
|
|
|
@ -33,6 +33,7 @@ pub struct EmbeddingSettings {
|
|||
///
|
||||
/// - Defaults to `openAi`
|
||||
pub source: Setting<EmbedderSource>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<String>)]
|
||||
|
@ -55,6 +56,7 @@ pub struct EmbeddingSettings {
|
|||
/// - For source `openAi`, defaults to `text-embedding-3-small`
|
||||
/// - For source `huggingFace`, defaults to `BAAI/bge-base-en-v1.5`
|
||||
pub model: Setting<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<String>)]
|
||||
|
@ -75,6 +77,7 @@ pub struct EmbeddingSettings {
|
|||
/// - When `model` is set to default, defaults to `617ca489d9e86b49b8167676d8220688b99db36e`
|
||||
/// - Otherwise, defaults to `null`
|
||||
pub revision: Setting<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<OverridePooling>)]
|
||||
|
@ -96,6 +99,7 @@ pub struct EmbeddingSettings {
|
|||
///
|
||||
/// - Embedders created before this parameter was available default to `forceMean` to preserve the existing behavior.
|
||||
pub pooling: Setting<OverridePooling>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<String>)]
|
||||
|
@ -118,6 +122,7 @@ pub struct EmbeddingSettings {
|
|||
///
|
||||
/// - This setting is partially hidden when returned by the settings
|
||||
pub api_key: Setting<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<String>)]
|
||||
|
@ -141,6 +146,7 @@ pub struct EmbeddingSettings {
|
|||
/// - For source `openAi`, the dimensions is the maximum allowed by the model.
|
||||
/// - For sources `ollama` and `rest`, the dimensions are inferred by embedding a sample text.
|
||||
pub dimensions: Setting<usize>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<bool>)]
|
||||
|
@ -167,6 +173,7 @@ pub struct EmbeddingSettings {
|
|||
/// first enabling it. If you are unsure of whether the performance-relevancy tradeoff is right for you,
|
||||
/// we recommend to use this parameter on a test index first.
|
||||
pub binary_quantized: Setting<bool>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<bool>)]
|
||||
|
@ -183,6 +190,7 @@ pub struct EmbeddingSettings {
|
|||
///
|
||||
/// - 🏗️ When modified, embeddings are regenerated for documents whose rendering through the template produces a different text.
|
||||
pub document_template: Setting<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<usize>)]
|
||||
|
@ -201,6 +209,7 @@ pub struct EmbeddingSettings {
|
|||
///
|
||||
/// - Defaults to 400
|
||||
pub document_template_max_bytes: Setting<usize>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<String>)]
|
||||
|
@ -219,6 +228,7 @@ pub struct EmbeddingSettings {
|
|||
/// - 🌱 When modified for source `openAi`, embeddings are never regenerated
|
||||
/// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated
|
||||
pub url: Setting<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<serde_json::Value>)]
|
||||
|
@ -236,6 +246,7 @@ pub struct EmbeddingSettings {
|
|||
///
|
||||
/// - 🏗️ Changing the value of this parameter always regenerates embeddings
|
||||
pub request: Setting<serde_json::Value>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<serde_json::Value>)]
|
||||
|
@ -253,6 +264,7 @@ pub struct EmbeddingSettings {
|
|||
///
|
||||
/// - 🏗️ Changing the value of this parameter always regenerates embeddings
|
||||
pub response: Setting<serde_json::Value>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<BTreeMap<String, String>>)]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue