From 928e6e4c059718300231fda19ad1b6bd041ab477 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 27 Mar 2024 15:36:49 +0100 Subject: [PATCH 01/13] Breaking change: remove vector for score details --- meilisearch/src/search.rs | 6 ++---- milli/src/score_details.rs | 16 ++++++---------- milli/src/search/new/vector_sort.rs | 28 +++++++--------------------- 3 files changed, 15 insertions(+), 35 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index db58c6102..2e0df18ad 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -633,10 +633,8 @@ pub fn perform_search( let mut semantic_score = None; for details in &score { - if let ScoreDetails::Vector(score_details::Vector { - target_vector: _, - value_similarity: Some((_matching_vector, similarity)), - }) = details + if let ScoreDetails::Vector(score_details::Vector { similarity: Some(similarity) }) = + details { semantic_score = Some(*similarity); break; diff --git a/milli/src/score_details.rs b/milli/src/score_details.rs index 08dfcdbb6..0a9b77e2b 100644 --- a/milli/src/score_details.rs +++ b/milli/src/score_details.rs @@ -98,9 +98,9 @@ impl ScoreDetails { ScoreDetails::ExactWords(e) => RankOrValue::Rank(e.rank()), ScoreDetails::Sort(sort) => RankOrValue::Sort(sort), ScoreDetails::GeoSort(geosort) => RankOrValue::GeoSort(geosort), - ScoreDetails::Vector(vector) => RankOrValue::Score( - vector.value_similarity.as_ref().map(|(_, s)| *s as f64).unwrap_or(0.0f64), - ), + ScoreDetails::Vector(vector) => { + RankOrValue::Score(vector.similarity.as_ref().map(|s| *s as f64).unwrap_or(0.0f64)) + } ScoreDetails::Skipped => RankOrValue::Rank(Rank { rank: 0, max_rank: 1 }), } } @@ -249,16 +249,13 @@ impl ScoreDetails { order += 1; } ScoreDetails::Vector(s) => { - let vector = format!("vectorSort({:?})", s.target_vector); - let value = s.value_similarity.as_ref().map(|(v, _)| v); - let similarity = s.value_similarity.as_ref().map(|(_, s)| s); + let similarity = 
s.similarity.as_ref(); let details = serde_json::json!({ "order": order, - "value": value, "similarity": similarity, }); - details_map.insert(vector, details); + details_map.insert("vectorSort".into(), details); order += 1; } ScoreDetails::Skipped => { @@ -494,8 +491,7 @@ impl PartialOrd for GeoSort { #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct Vector { - pub target_vector: Vec, - pub value_similarity: Option<(Vec, f32)>, + pub similarity: Option, } impl GeoSort { diff --git a/milli/src/search/new/vector_sort.rs b/milli/src/search/new/vector_sort.rs index b29a72827..476477218 100644 --- a/milli/src/search/new/vector_sort.rs +++ b/milli/src/search/new/vector_sort.rs @@ -12,7 +12,7 @@ pub struct VectorSort { query: Option, target: Vec, vector_candidates: RoaringBitmap, - cached_sorted_docids: std::vec::IntoIter<(DocumentId, f32, Vec)>, + cached_sorted_docids: std::vec::IntoIter<(DocumentId, f32)>, limit: usize, distribution_shift: Option, embedder_index: u8, @@ -70,14 +70,9 @@ impl VectorSort { for reader in readers.iter() { let nns_by_vector = reader.nns_by_vector(ctx.txn, target, self.limit, None, Some(vector_candidates))?; - let vectors: std::result::Result, _> = nns_by_vector - .iter() - .map(|(docid, _)| reader.item_vector(ctx.txn, *docid).transpose().unwrap()) - .collect(); - let vectors = vectors?; - results.extend(nns_by_vector.into_iter().zip(vectors).map(|((x, y), z)| (x, y, z))); + results.extend(nns_by_vector.into_iter()); } - results.sort_unstable_by_key(|(_, distance, _)| OrderedFloat(*distance)); + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); self.cached_sorted_docids = results.into_iter(); Ok(()) @@ -118,14 +113,11 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort { return Ok(Some(RankingRuleOutput { query, candidates: universe.clone(), - score: ScoreDetails::Vector(score_details::Vector { - target_vector: self.target.clone(), - value_similarity: None, - }), + score: 
ScoreDetails::Vector(score_details::Vector { similarity: None }), })); } - for (docid, distance, vector) in self.cached_sorted_docids.by_ref() { + for (docid, distance) in self.cached_sorted_docids.by_ref() { if vector_candidates.contains(docid) { let score = 1.0 - distance; let score = self @@ -135,10 +127,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort { return Ok(Some(RankingRuleOutput { query, candidates: RoaringBitmap::from_iter([docid]), - score: ScoreDetails::Vector(score_details::Vector { - target_vector: self.target.clone(), - value_similarity: Some((vector, score)), - }), + score: ScoreDetails::Vector(score_details::Vector { similarity: Some(score) }), })); } } @@ -154,10 +143,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort { return Ok(Some(RankingRuleOutput { query, candidates: universe.clone(), - score: ScoreDetails::Vector(score_details::Vector { - target_vector: self.target.clone(), - value_similarity: None, - }), + score: ScoreDetails::Vector(score_details::Vector { similarity: None }), })); } From 190933f6e1986571056b5147a5189c8d53dcb972 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 27 Mar 2024 15:40:02 +0100 Subject: [PATCH 02/13] Breaking: Remove vector from SearchResult --- meilisearch/src/analytics/segment_analytics.rs | 1 - meilisearch/src/search.rs | 3 --- 2 files changed, 4 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 4a0ef5b35..fcf4d9144 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -758,7 +758,6 @@ impl SearchAggregator { let SearchResult { hits: _, query: _, - vector: _, processing_time_ms, hits_info: _, facet_distribution: _, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 2e0df18ad..a1aa37779 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -314,8 +314,6 @@ pub struct 
SearchHit { pub struct SearchResult { pub hits: Vec, pub query: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub vector: Option>, pub processing_time_ms: u128, #[serde(flatten)] pub hits_info: HitsInfo, @@ -713,7 +711,6 @@ pub fn perform_search( hits: documents, hits_info, query: query.q.unwrap_or_default(), - vector: query.vector, processing_time_ms: before_search.elapsed().as_millis(), facet_distribution, facet_stats, From 00c4ed3bc25455d81695680d164db58e7c310e89 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 28 Mar 2024 11:49:00 +0100 Subject: [PATCH 03/13] milli: refactor getting embedder and embedder name --- milli/src/vector/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 1cb0a18f7..5aa58da5d 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -143,7 +143,7 @@ impl EmbeddingConfigs { /// Get the default embedder configuration, if any. pub fn get_default(&self) -> Option<(Arc, Arc)> { - self.get_default_embedder_name().and_then(|default| self.get(&default)) + self.get(self.get_default_embedder_name()) } /// Get the name of the default embedder configuration. @@ -153,14 +153,14 @@ impl EmbeddingConfigs { /// - If there is only one embedder, it is always the default. /// - If there are multiple embedders and one of them is called `default`, then that one is the default embedder. /// - In all other cases, there is no default embedder. 
- pub fn get_default_embedder_name(&self) -> Option { + pub fn get_default_embedder_name(&self) -> &str { let mut it = self.0.keys(); let first_name = it.next(); let second_name = it.next(); match (first_name, second_name) { - (None, _) => None, - (Some(first), None) => Some(first.to_owned()), - (Some(_), Some(_)) => Some("default".to_owned()), + (None, _) => "default", + (Some(first), None) => first, + (Some(_), Some(_)) => "default", } } } From fabc9cf14af3efe900c4c62d9f936422db830be0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 28 Mar 2024 11:49:23 +0100 Subject: [PATCH 04/13] milli: add Embedder::embed_one --- milli/src/vector/error.rs | 7 ++++++- milli/src/vector/mod.rs | 11 +++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index 1e0bcc7fb..d3369ef3d 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -58,7 +58,7 @@ pub enum EmbedErrorKind { RestResponseDeserialization(std::io::Error), #[error("component `{0}` not found in path `{1}` in response: `{2}`")] RestResponseMissingEmbeddings(String, String, String), - #[error("expected a response parseable as a vector or an array of vectors: {0}")] + #[error("unexpected format of the embedding response: {0}")] RestResponseFormat(serde_json::Error), #[error("expected a response containing {0} embeddings, got only {1}")] RestResponseEmbeddingCount(usize, usize), @@ -78,6 +78,8 @@ pub enum EmbedErrorKind { RestNotAnObject(serde_json::Value, Vec), #[error("while embedding tokenized, was expecting embeddings of dimension `{0}`, got embeddings of dimensions `{1}`")] OpenAiUnexpectedDimension(usize, usize), + #[error("no embedding was produced")] + MissingEmbedding, } impl EmbedError { @@ -190,6 +192,9 @@ impl EmbedError { fault: FaultSource::Runtime, } } + pub(crate) fn missing_embedding() -> EmbedError { + Self { kind: EmbedErrorKind::MissingEmbedding, fault: FaultSource::Undecided } + } } #[derive(Debug, 
thiserror::Error)] diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 5aa58da5d..58f7ba5e1 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -237,6 +237,17 @@ impl Embedder { } } + pub fn embed_one(&self, text: String) -> std::result::Result { + let mut embeddings = self.embed(vec![text])?; + let embeddings = embeddings.pop().ok_or_else(EmbedError::missing_embedding)?; + Ok(if embeddings.iter().nth(1).is_some() { + tracing::warn!("Ignoring embeddings past the first one in long search query"); + embeddings.iter().next().unwrap().to_vec() + } else { + embeddings.into_inner() + }) + } + /// Embed multiple chunks of texts. /// /// Each chunk is composed of one or multiple texts. From 6ebb6b55a64b266bfad68b8a76ee5ce00435b2d0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 28 Mar 2024 11:50:53 +0100 Subject: [PATCH 05/13] Lazily embed, don't fail hybrid search on embedding failure --- .../src/routes/indexes/facet_search.rs | 4 +- meilisearch/src/routes/indexes/search.rs | 119 ++++++--------- meilisearch/src/routes/multi_search.rs | 8 +- meilisearch/src/search.rs | 141 +++++++++++++----- milli/src/index.rs | 8 - milli/src/lib.rs | 2 +- milli/src/search/facet/search.rs | 12 +- milli/src/search/hybrid.rs | 37 +++-- milli/src/search/mod.rs | 93 +++++------- milli/src/search/new/mod.rs | 10 +- milli/src/search/new/vector_sort.rs | 6 +- 11 files changed, 237 insertions(+), 203 deletions(-) diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 272b8156f..56880a472 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -12,6 +12,7 @@ use tracing::debug; use crate::analytics::{Analytics, FacetSearchAggregator}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; +use crate::routes::indexes::search::search_kind; use crate::search::{ add_search_rules, 
perform_facet_search, HybridQuery, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, @@ -73,9 +74,10 @@ pub async fn search( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); + let search_kind = search_kind(&search_query, &index_scheduler, &index)?; let _permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_facet_search(&index, search_query, facet_query, facet_name, features) + perform_facet_search(&index, search_query, facet_query, facet_name, features, search_kind) }) .await?; diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index f16a6c4df..a5fe3c5d6 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -8,19 +8,19 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli; -use meilisearch_types::milli::vector::DistributionShift; use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; -use tracing::{debug, warn}; +use tracing::debug; use crate::analytics::{Analytics, SearchAggregator}; +use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::search::{ - add_search_rules, perform_search, HybridQuery, MatchingStrategy, SearchQuery, SemanticRatio, - DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, + add_search_rules, perform_search, HybridQuery, MatchingStrategy, SearchKind, SearchQuery, + SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, 
DEFAULT_SEMANTIC_RATIO, }; use crate::search_queue::SearchQueue; @@ -204,11 +204,11 @@ pub async fn search_with_url_query( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let distribution = embed(&mut query, index_scheduler.get_ref(), &index)?; + let search_kind = search_kind(&query, index_scheduler.get_ref(), &index)?; let _permit = search_queue.try_get_search_permit().await?; let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, features, distribution)) + tokio::task::spawn_blocking(move || perform_search(&index, query, features, search_kind)) .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); @@ -245,11 +245,11 @@ pub async fn search_with_post( let features = index_scheduler.features(); - let distribution = embed(&mut query, index_scheduler.get_ref(), &index)?; + let search_kind = search_kind(&query, index_scheduler.get_ref(), &index)?; let _permit = search_queue.try_get_search_permit().await?; let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, features, distribution)) + tokio::task::spawn_blocking(move || perform_search(&index, query, features, search_kind)) .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); @@ -265,76 +265,49 @@ pub async fn search_with_post( Ok(HttpResponse::Ok().json(search_result)) } -pub fn embed( - query: &mut SearchQuery, +pub fn search_kind( + query: &SearchQuery, index_scheduler: &IndexScheduler, index: &milli::Index, -) -> Result, ResponseError> { - match (&query.hybrid, &query.vector, &query.q) { - (Some(HybridQuery { semantic_ratio: _, embedder }), None, Some(q)) - if !q.trim().is_empty() => - { - let embedder_configs = index.embedding_configs(&index.read_txn()?)?; - let embedders = index_scheduler.embedders(embedder_configs)?; - - let embedder = if let Some(embedder_name) = embedder { - embedders.get(embedder_name) - } else { - 
embedders.get_default() - }; - - let embedder = embedder - .ok_or(milli::UserError::InvalidEmbedder("default".to_owned())) - .map_err(milli::Error::from)? - .0; - - let distribution = embedder.distribution(); - - let embeddings = embedder - .embed(vec![q.to_owned()]) - .map_err(milli::vector::Error::from) - .map_err(milli::Error::from)? - .pop() - .expect("No vector returned from embedding"); - - if embeddings.iter().nth(1).is_some() { - warn!("Ignoring embeddings past the first one in long search query"); - query.vector = Some(embeddings.iter().next().unwrap().to_vec()); - } else { - query.vector = Some(embeddings.into_inner()); - } - Ok(distribution) +) -> Result { + // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing + if query.vector.is_none() { + match &query.q { + Some(q) if q.trim().is_empty() => return Ok(SearchKind::KeywordOnly), + None => return Ok(SearchKind::KeywordOnly), + _ => {} } - (Some(hybrid), vector, _) => { - let embedder_configs = index.embedding_configs(&index.read_txn()?)?; - let embedders = index_scheduler.embedders(embedder_configs)?; + } - let embedder = if let Some(embedder_name) = &hybrid.embedder { - embedders.get(embedder_name) - } else { - embedders.get_default() - }; - - let embedder = embedder - .ok_or(milli::UserError::InvalidEmbedder("default".to_owned())) - .map_err(milli::Error::from)? - .0; - - if let Some(vector) = vector { - if vector.len() != embedder.dimensions() { - return Err(meilisearch_types::milli::Error::UserError( - meilisearch_types::milli::UserError::InvalidVectorDimensions { - expected: embedder.dimensions(), - found: vector.len(), - }, - ) - .into()); - } - } - - Ok(embedder.distribution()) + match &query.hybrid { + Some(HybridQuery { semantic_ratio, embedder }) if **semantic_ratio == 1.0 => { + Ok(SearchKind::semantic( + index_scheduler, + index, + embedder.as_deref(), + query.vector.as_ref().map(Vec::len), + )?) 
} - _ => Ok(None), + Some(HybridQuery { semantic_ratio, embedder: _ }) if **semantic_ratio == 0.0 => { + Ok(SearchKind::KeywordOnly) + } + Some(HybridQuery { semantic_ratio, embedder }) => Ok(SearchKind::hybrid( + index_scheduler, + index, + embedder.as_deref(), + **semantic_ratio, + query.vector.as_ref().map(Vec::len), + )?), + None => match (query.q.as_deref(), query.vector.as_deref()) { + (_query, None) => Ok(SearchKind::KeywordOnly), + (None, Some(_vector)) => Ok(SearchKind::semantic( + index_scheduler, + index, + None, + query.vector.as_ref().map(Vec::len), + )?), + (Some(_), Some(_)) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + }, } } diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index b2055fb07..04cd3f637 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -13,7 +13,7 @@ use crate::analytics::{Analytics, MultiSearchAggregator}; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::indexes::search::embed; +use crate::routes::indexes::search::search_kind; use crate::search::{ add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex, }; @@ -81,11 +81,11 @@ pub async fn multi_search_with_post( }) .with_index(query_index)?; - let distribution = - embed(&mut query, index_scheduler.get_ref(), &index).with_index(query_index)?; + let search_kind = + search_kind(&query, index_scheduler.get_ref(), &index).with_index(query_index)?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, features, distribution) + perform_search(&index, query, features, search_kind) }) .await .with_index(query_index)?; diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index a1aa37779..2a22cb2ce 100644 --- a/meilisearch/src/search.rs +++ 
b/meilisearch/src/search.rs @@ -1,6 +1,7 @@ use std::cmp::min; use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::str::FromStr; +use std::sync::Arc; use std::time::{Duration, Instant}; use deserr::Deserr; @@ -10,10 +11,11 @@ use indexmap::IndexMap; use meilisearch_auth::IndexSearchRules; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::deserr_codes::*; +use meilisearch_types::error::ResponseError; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::score_details::{self, ScoreDetails, ScoringStrategy}; -use meilisearch_types::milli::vector::DistributionShift; +use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::{milli, Document}; @@ -90,13 +92,75 @@ pub struct SearchQuery { #[derive(Debug, Clone, Default, PartialEq, Deserr)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub struct HybridQuery { - /// TODO validate that sementic ratio is between 0.0 and 1,0 #[deserr(default, error = DeserrJsonError, default)] pub semantic_ratio: SemanticRatio, #[deserr(default, error = DeserrJsonError, default)] pub embedder: Option, } +pub enum SearchKind { + KeywordOnly, + SemanticOnly { embedder_name: String, embedder: Arc }, + Hybrid { embedder_name: String, embedder: Arc, semantic_ratio: f32 }, +} +impl SearchKind { + pub(crate) fn semantic( + index_scheduler: &index_scheduler::IndexScheduler, + index: &Index, + embedder_name: Option<&str>, + vector_len: Option, + ) -> Result { + let (embedder_name, embedder) = + Self::embedder(index_scheduler, index, embedder_name, vector_len)?; + Ok(Self::SemanticOnly { embedder_name, embedder }) + } + + pub(crate) fn hybrid( + index_scheduler: &index_scheduler::IndexScheduler, + index: &Index, + embedder_name: Option<&str>, + 
semantic_ratio: f32, + vector_len: Option, + ) -> Result { + let (embedder_name, embedder) = + Self::embedder(index_scheduler, index, embedder_name, vector_len)?; + Ok(Self::Hybrid { embedder_name, embedder, semantic_ratio }) + } + + fn embedder( + index_scheduler: &index_scheduler::IndexScheduler, + index: &Index, + embedder_name: Option<&str>, + vector_len: Option, + ) -> Result<(String, Arc), ResponseError> { + let embedder_configs = index.embedding_configs(&index.read_txn()?)?; + let embedders = index_scheduler.embedders(embedder_configs)?; + + let embedder_name = embedder_name.unwrap_or_else(|| embedders.get_default_embedder_name()); + + let embedder = embedders.get(embedder_name); + + let embedder = embedder + .ok_or(milli::UserError::InvalidEmbedder(embedder_name.to_owned())) + .map_err(milli::Error::from)? + .0; + + if let Some(vector_len) = vector_len { + if vector_len != embedder.dimensions() { + return Err(meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorDimensions { + expected: embedder.dimensions(), + found: vector_len, + }, + ) + .into()); + } + } + + Ok((embedder_name.to_owned(), embedder)) + } +} + #[derive(Debug, Clone, Copy, PartialEq, Deserr)] #[deserr(try_from(f32) = TryFrom::try_from -> InvalidSearchSemanticRatio)] pub struct SemanticRatio(f32); @@ -385,7 +449,7 @@ fn prepare_search<'t>( rtxn: &'t RoTxn, query: &'t SearchQuery, features: RoFeatures, - distribution: Option, + search_kind: &SearchKind, time_budget: TimeBudget, ) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> { let mut search = index.search(rtxn); @@ -399,32 +463,30 @@ fn prepare_search<'t>( features.check_vector("Passing `hybrid` as a query parameter")?; } - if query.hybrid.is_none() && query.q.is_some() && query.vector.is_some() { - return Err(MeilisearchHttpError::MissingSearchHybrid); - } - - search.distribution_shift(distribution); - - if let Some(ref vector) = query.vector { - match &query.hybrid { - // 
If semantic ratio is 0.0, only the query search will impact the search results, - // skip the vector - Some(hybrid) if *hybrid.semantic_ratio == 0.0 => (), - _otherwise => { - search.vector(vector.clone()); - } - } - } - - if let Some(ref q) = query.q { - match &query.hybrid { - // If semantic ratio is 1.0, only the vector search will impact the search results, - // skip the query - Some(hybrid) if *hybrid.semantic_ratio == 1.0 => (), - _otherwise => { + match search_kind { + SearchKind::KeywordOnly => { + if let Some(q) = &query.q { search.query(q); } } + SearchKind::SemanticOnly { embedder_name, embedder } => { + let vector = match query.vector.clone() { + Some(vector) => vector, + None => embedder + .embed_one(query.q.clone().unwrap()) + .map_err(milli::vector::Error::from) + .map_err(milli::Error::from)?, + }; + + search.semantic(embedder_name.clone(), embedder.clone(), Some(vector)); + } + SearchKind::Hybrid { embedder_name, embedder, semantic_ratio: _ } => { + if let Some(q) = &query.q { + search.query(q); + } + // will be embedded in hybrid search if necessary + search.semantic(embedder_name.clone(), embedder.clone(), query.vector.clone()); + } } if let Some(ref searchable) = query.attributes_to_search_on { @@ -447,10 +509,6 @@ fn prepare_search<'t>( ScoringStrategy::Skip }); - if let Some(HybridQuery { embedder: Some(embedder), .. }) = &query.hybrid { - search.embedder_name(embedder); - } - // compute the offset on the limit depending on the pagination mode. 
let (offset, limit) = if is_finite_pagination { let limit = query.hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); @@ -494,7 +552,7 @@ pub fn perform_search( index: &Index, query: SearchQuery, features: RoFeatures, - distribution: Option, + search_kind: SearchKind, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -504,7 +562,7 @@ pub fn perform_search( }; let (search, is_finite_pagination, max_total_hits, offset) = - prepare_search(index, &rtxn, &query, features, distribution, time_budget)?; + prepare_search(index, &rtxn, &query, features, &search_kind, time_budget)?; let milli::SearchResult { documents_ids, @@ -514,12 +572,9 @@ pub fn perform_search( degraded, used_negative_operator, .. - } = match &query.hybrid { - Some(hybrid) => match *hybrid.semantic_ratio { - ratio if ratio == 0.0 || ratio == 1.0 => search.execute()?, - ratio => search.execute_hybrid(ratio)?, - }, - None => search.execute()?, + } = match &search_kind { + SearchKind::KeywordOnly | SearchKind::SemanticOnly { .. } => search.execute()?, + SearchKind::Hybrid { semantic_ratio, .. } => search.execute_hybrid(*semantic_ratio)?, }; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -726,6 +781,7 @@ pub fn perform_facet_search( facet_query: Option, facet_name: String, features: RoFeatures, + search_kind: SearchKind, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -735,9 +791,12 @@ pub fn perform_facet_search( }; let (search, _, _, _) = - prepare_search(index, &rtxn, &search_query, features, None, time_budget)?; - let mut facet_search = - SearchForFacetValues::new(facet_name, search, search_query.hybrid.is_some()); + prepare_search(index, &rtxn, &search_query, features, &search_kind, time_budget)?; + let mut facet_search = SearchForFacetValues::new( + facet_name, + search, + matches!(search_kind, SearchKind::Hybrid { .. 
}), + ); if let Some(facet_query) = &facet_query { facet_search.query(facet_query); } diff --git a/milli/src/index.rs b/milli/src/index.rs index 80e524fb1..db31c953a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1499,14 +1499,6 @@ impl Index { .unwrap_or_default()) } - pub fn default_embedding_name(&self, rtxn: &RoTxn<'_>) -> Result { - let configs = self.embedding_configs(rtxn)?; - Ok(match configs.as_slice() { - [(ref first_name, _)] => first_name.clone(), - _ => "default".to_owned(), - }) - } - pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> { self.main.remap_types::().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff) } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index df44ca127..22816787b 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -61,7 +61,7 @@ pub use self::index::Index; pub use self::search::facet::{FacetValueHit, SearchForFacetValues}; pub use self::search::{ FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy, - Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, + Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs index 0251d6b8d..a6756a7af 100644 --- a/milli/src/search/facet/search.rs +++ b/milli/src/search/facet/search.rs @@ -92,9 +92,15 @@ impl<'a> SearchForFacetValues<'a> { None => return Ok(Vec::new()), }; - let search_candidates = self - .search_query - .execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?; + let search_candidates = self.search_query.execute_for_candidates( + self.is_hybrid + || self + .search_query + .semantic + .as_ref() + .and_then(|semantic| semantic.vector.as_ref()) + .is_some(), + )?; let mut results = match index.sort_facet_values_by(rtxn)?.get(&self.facet) { OrderBy::Lexicographic => 
ValuesCollection::by_lexicographic(self.max_values), diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index 2a6d9f7a5..e45652206 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -4,6 +4,7 @@ use itertools::Itertools; use roaring::RoaringBitmap; use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy}; +use crate::search::SemanticSearch; use crate::{MatchingWords, Result, Search, SearchResult}; struct ScoreWithRatioResult { @@ -126,7 +127,6 @@ impl<'a> Search<'a> { // create separate keyword and semantic searches let mut search = Search { query: self.query.clone(), - vector: self.vector.clone(), filter: self.filter.clone(), offset: 0, limit: self.limit + self.offset, @@ -139,26 +139,41 @@ impl<'a> Search<'a> { exhaustive_number_hits: self.exhaustive_number_hits, rtxn: self.rtxn, index: self.index, - distribution_shift: self.distribution_shift, - embedder_name: self.embedder_name.clone(), + semantic: self.semantic.clone(), time_budget: self.time_budget.clone(), }; - let vector_query = search.vector.take(); + let semantic = search.semantic.take(); let keyword_results = search.execute()?; - // skip semantic search if we don't have a vector query (placeholder search) - let Some(vector_query) = vector_query else { - return Ok(keyword_results); - }; - // completely skip semantic search if the results of the keyword search are good enough if self.results_good_enough(&keyword_results, semantic_ratio) { return Ok(keyword_results); } - search.vector = Some(vector_query); - search.query = None; + // no vector search against placeholder search + let Some(query) = search.query.take() else { return Ok(keyword_results) }; + // no embedder, no semantic search + let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else { + return Ok(keyword_results); + }; + + let vector_query = match vector { + Some(vector_query) => vector_query, + None => { + // attempt to embed the vector + match 
embedder.embed_one(query) { + Ok(embedding) => embedding, + Err(error) => { + tracing::error!(error=%error, "Embedding failed"); + return Ok(keyword_results); + } + } + } + }; + + search.semantic = + Some(SemanticSearch { vector: Some(vector_query), embedder_name, embedder }); // TODO: would be better to have two distinct functions at this point let vector_results = search.execute()?; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 3c709a647..bab67e6bd 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,4 +1,5 @@ use std::fmt; +use std::sync::Arc; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use once_cell::sync::Lazy; @@ -8,7 +9,7 @@ pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FAC pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult}; use crate::score_details::{ScoreDetails, ScoringStrategy}; -use crate::vector::DistributionShift; +use crate::vector::Embedder; use crate::{ execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext, TimeBudget, @@ -24,9 +25,15 @@ mod fst_utils; pub mod hybrid; pub mod new; +#[derive(Debug, Clone)] +pub struct SemanticSearch { + vector: Option>, + embedder_name: String, + embedder: Arc, +} + pub struct Search<'a> { query: Option, - vector: Option>, // this should be linked to the String in the query filter: Option>, offset: usize, @@ -38,12 +45,9 @@ pub struct Search<'a> { scoring_strategy: ScoringStrategy, words_limit: usize, exhaustive_number_hits: bool, - /// TODO: Add semantic ratio or pass it directly to execute_hybrid() rtxn: &'a heed::RoTxn<'a>, index: &'a Index, - distribution_shift: Option, - embedder_name: Option, - + semantic: Option, time_budget: TimeBudget, } @@ -51,7 +55,6 @@ impl<'a> Search<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { Search { 
query: None, - vector: None, filter: None, offset: 0, limit: 20, @@ -64,8 +67,7 @@ impl<'a> Search<'a> { words_limit: 10, rtxn, index, - distribution_shift: None, - embedder_name: None, + semantic: None, time_budget: TimeBudget::max(), } } @@ -75,8 +77,13 @@ impl<'a> Search<'a> { self } - pub fn vector(&mut self, vector: Vec) -> &mut Search<'a> { - self.vector = Some(vector); + pub fn semantic( + &mut self, + embedder_name: String, + embedder: Arc, + vector: Option>, + ) -> &mut Search<'a> { + self.semantic = Some(SemanticSearch { embedder_name, embedder, vector }); self } @@ -133,19 +140,6 @@ impl<'a> Search<'a> { self } - pub fn distribution_shift( - &mut self, - distribution_shift: Option, - ) -> &mut Search<'a> { - self.distribution_shift = distribution_shift; - self - } - - pub fn embedder_name(&mut self, embedder_name: impl Into) -> &mut Search<'a> { - self.embedder_name = Some(embedder_name.into()); - self - } - pub fn time_budget(&mut self, time_budget: TimeBudget) -> &mut Search<'a> { self.time_budget = time_budget; self @@ -161,15 +155,6 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - let embedder_name; - let embedder_name = match &self.embedder_name { - Some(embedder_name) => embedder_name, - None => { - embedder_name = self.index.default_embedding_name(self.rtxn)?; - &embedder_name - } - }; - let mut ctx = SearchContext::new(self.index, self.rtxn); if let Some(searchable_attributes) = self.searchable_attributes { @@ -184,21 +169,23 @@ impl<'a> Search<'a> { document_scores, degraded, used_negative_operator, - } = match self.vector.as_ref() { - Some(vector) => execute_vector_search( - &mut ctx, - vector, - self.scoring_strategy, - universe, - &self.sort_criteria, - self.geo_strategy, - self.offset, - self.limit, - self.distribution_shift, - embedder_name, - self.time_budget.clone(), - )?, - None => execute_search( + } = match self.semantic.as_ref() { + Some(SemanticSearch { vector: Some(vector), embedder_name, embedder }) => { + 
execute_vector_search( + &mut ctx, + vector, + self.scoring_strategy, + universe, + &self.sort_criteria, + self.geo_strategy, + self.offset, + self.limit, + embedder_name, + embedder, + self.time_budget.clone(), + )? + } + _ => execute_search( &mut ctx, self.query.as_deref(), self.terms_matching_strategy, @@ -237,7 +224,6 @@ impl fmt::Debug for Search<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let Search { query, - vector: _, filter, offset, limit, @@ -250,8 +236,7 @@ impl fmt::Debug for Search<'_> { exhaustive_number_hits, rtxn: _, index: _, - distribution_shift, - embedder_name, + semantic, time_budget, } = self; f.debug_struct("Search") @@ -266,8 +251,10 @@ impl fmt::Debug for Search<'_> { .field("scoring_strategy", scoring_strategy) .field("exhaustive_number_hits", exhaustive_number_hits) .field("words_limit", words_limit) - .field("distribution_shift", distribution_shift) - .field("embedder_name", embedder_name) + .field( + "semantic.embedder_name", + &semantic.as_ref().map(|semantic| &semantic.embedder_name), + ) .field("time_budget", time_budget) .finish() } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 1f0ae7b29..617068ef8 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -52,7 +52,7 @@ use self::vector_sort::VectorSort; use crate::error::FieldIdMapMissingEntry; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; -use crate::vector::DistributionShift; +use crate::vector::Embedder; use crate::{ AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget, UserError, @@ -298,8 +298,8 @@ fn get_ranking_rules_for_vector<'ctx>( geo_strategy: geo_sort::Strategy, limit_plus_offset: usize, target: &[f32], - distribution_shift: Option, embedder_name: &str, + embedder: &Embedder, ) -> Result>> { // query graph search @@ -325,8 +325,8 @@ fn get_ranking_rules_for_vector<'ctx>( target.to_vec(), 
vector_candidates, limit_plus_offset, - distribution_shift, embedder_name, + embedder, )?; ranking_rules.push(Box::new(vector_sort)); vector = true; @@ -548,8 +548,8 @@ pub fn execute_vector_search( geo_strategy: geo_sort::Strategy, from: usize, length: usize, - distribution_shift: Option, embedder_name: &str, + embedder: &Embedder, time_budget: TimeBudget, ) -> Result { check_sort_criteria(ctx, sort_criteria.as_ref())?; @@ -562,8 +562,8 @@ pub fn execute_vector_search( geo_strategy, from + length, vector, - distribution_shift, embedder_name, + embedder, )?; let mut placeholder_search_logger = logger::DefaultSearchLogger; diff --git a/milli/src/search/new/vector_sort.rs b/milli/src/search/new/vector_sort.rs index 476477218..de272ed47 100644 --- a/milli/src/search/new/vector_sort.rs +++ b/milli/src/search/new/vector_sort.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use crate::score_details::{self, ScoreDetails}; -use crate::vector::DistributionShift; +use crate::vector::{DistributionShift, Embedder}; use crate::{DocumentId, Result, SearchContext, SearchLogger}; pub struct VectorSort { @@ -24,8 +24,8 @@ impl VectorSort { target: Vec, vector_candidates: RoaringBitmap, limit: usize, - distribution_shift: Option, embedder_name: &str, + embedder: &Embedder, ) -> Result { let embedder_index = ctx .index @@ -39,7 +39,7 @@ impl VectorSort { vector_candidates, cached_sorted_docids: Default::default(), limit, - distribution_shift, + distribution_shift: embedder.distribution(), embedder_index, }) } From 466d718a05c8c3b69738816594f0ed5a45e63bbe Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 28 Mar 2024 11:51:41 +0100 Subject: [PATCH 06/13] Fix test --- milli/src/update/index_documents/mod.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index dbacb4002..d534661da 100644 --- 
a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2672,7 +2672,16 @@ mod tests { .unwrap(); let rtxn = index.read_txn().unwrap(); - let res = index.search(&rtxn).vector([0.0, 1.0, 2.0].to_vec()).execute().unwrap(); + let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); + let (embedder_name, embedder) = embedding_configs.pop().unwrap(); + let embedder = + std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); + assert_eq!("manual", embedder_name); + let res = index + .search(&rtxn) + .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) + .execute() + .unwrap(); assert_eq!(res.documents_ids.len(), 3); } From 4564a38ae7d94ce36cc3a20a34b59bea6162d534 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 28 Mar 2024 12:06:59 +0100 Subject: [PATCH 07/13] Bail earlier when the experimental feature is not enabled --- .../src/routes/indexes/facet_search.rs | 4 ++-- meilisearch/src/routes/indexes/search.rs | 21 ++++++++++++------- meilisearch/src/routes/multi_search.rs | 13 ++++++------ meilisearch/src/search.rs | 17 ++------------- 4 files changed, 24 insertions(+), 31 deletions(-) diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 56880a472..4d6950988 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -74,10 +74,10 @@ pub async fn search( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let search_kind = search_kind(&search_query, &index_scheduler, &index)?; + let search_kind = search_kind(&search_query, &index_scheduler, &index, features)?; let _permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_facet_search(&index, search_query, facet_query, facet_name, features, search_kind) + perform_facet_search(&index, search_query, 
facet_query, facet_name, search_kind) }) .await?; diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index a5fe3c5d6..0f7d3b1ee 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -1,7 +1,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::{AwebJson, AwebQueryParameter}; -use index_scheduler::IndexScheduler; +use index_scheduler::{IndexScheduler, RoFeatures}; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; @@ -204,12 +204,11 @@ pub async fn search_with_url_query( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), &index)?; + let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; let _permit = search_queue.try_get_search_permit().await?; let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, features, search_kind)) - .await?; + tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } @@ -245,12 +244,11 @@ pub async fn search_with_post( let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), &index)?; + let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; let _permit = search_queue.try_get_search_permit().await?; let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, features, search_kind)) - .await?; + tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); if search_result.degraded { @@ 
-269,7 +267,16 @@ pub fn search_kind( query: &SearchQuery, index_scheduler: &IndexScheduler, index: &milli::Index, + features: RoFeatures, ) -> Result { + if query.vector.is_some() { + features.check_vector("Passing `vector` as a query parameter")?; + } + + if query.hybrid.is_some() { + features.check_vector("Passing `hybrid` as a query parameter")?; + } + // regardless of anything, always do a semantic search when we don't have a vector and the query is whitespace or missing if query.vector.is_none() { match &query.q { diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index 04cd3f637..7b7cbd265 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -81,14 +81,13 @@ pub async fn multi_search_with_post( }) .with_index(query_index)?; - let search_kind = - search_kind(&query, index_scheduler.get_ref(), &index).with_index(query_index)?; + let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features) + .with_index(query_index)?; - let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, features, search_kind) - }) - .await - .with_index(query_index)?; + let search_result = + tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)) + .await + .with_index(query_index)?; search_results.push(SearchResultWithIndex { index_uid: index_uid.into_inner(), diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 2a22cb2ce..7cb860f2e 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -6,7 +6,6 @@ use std::time::{Duration, Instant}; use deserr::Deserr; use either::Either; -use index_scheduler::RoFeatures; use indexmap::IndexMap; use meilisearch_auth::IndexSearchRules; use meilisearch_types::deserr::DeserrJsonError; @@ -448,21 +447,12 @@ fn prepare_search<'t>( index: &'t Index, rtxn: &'t RoTxn, query: &'t SearchQuery, - features: RoFeatures, search_kind: &SearchKind, time_budget: 
TimeBudget, ) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> { let mut search = index.search(rtxn); search.time_budget(time_budget); - if query.vector.is_some() { - features.check_vector("Passing `vector` as a query parameter")?; - } - - if query.hybrid.is_some() { - features.check_vector("Passing `hybrid` as a query parameter")?; - } - match search_kind { SearchKind::KeywordOnly => { if let Some(q) = &query.q { @@ -551,7 +541,6 @@ fn prepare_search<'t>( pub fn perform_search( index: &Index, query: SearchQuery, - features: RoFeatures, search_kind: SearchKind, ) -> Result { let before_search = Instant::now(); @@ -562,7 +551,7 @@ pub fn perform_search( }; let (search, is_finite_pagination, max_total_hits, offset) = - prepare_search(index, &rtxn, &query, features, &search_kind, time_budget)?; + prepare_search(index, &rtxn, &query, &search_kind, time_budget)?; let milli::SearchResult { documents_ids, @@ -780,7 +769,6 @@ pub fn perform_facet_search( search_query: SearchQuery, facet_query: Option, facet_name: String, - features: RoFeatures, search_kind: SearchKind, ) -> Result { let before_search = Instant::now(); @@ -790,8 +778,7 @@ pub fn perform_facet_search( None => TimeBudget::default(), }; - let (search, _, _, _) = - prepare_search(index, &rtxn, &search_query, features, &search_kind, time_budget)?; + let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, &search_kind, time_budget)?; let mut facet_search = SearchForFacetValues::new( facet_name, search, From 3c6e9851a4bcf85a446f1584b569a056c4139a90 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 28 Mar 2024 15:26:21 +0100 Subject: [PATCH 08/13] Correct error formatting --- milli/src/error.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index aba80b475..1d61bef63 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -196,7 +196,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and 
undersco InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError), #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")] TooManyEmbedders(usize), - #[error("Cannot find embedder with name {0}.")] + #[error("Cannot find embedder with name `{0}`.")] InvalidEmbedder(String), #[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")] TooManyVectors(String, usize), From 1ff2a2d6fb20c25a51f87869b833b17f307f659f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 Apr 2024 09:35:07 +0200 Subject: [PATCH 09/13] Add semanticHitCount --- .../src/analytics/segment_analytics.rs | 1 + meilisearch/src/search.rs | 31 ++++--- meilisearch/tests/search/hybrid.rs | 30 +++++++ milli/src/search/hybrid.rs | 82 +++++++++++++------ 4 files changed, 108 insertions(+), 36 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index fcf4d9144..c49a04576 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -760,6 +760,7 @@ impl SearchAggregator { query: _, processing_time_ms, hits_info: _, + semantic_hit_count: _, facet_distribution: _, facet_stats: _, degraded, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 7cb860f2e..85438e816 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -385,6 +385,9 @@ pub struct SearchResult { #[serde(skip_serializing_if = "Option::is_none")] pub facet_stats: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub semantic_hit_count: Option, + // These fields are only used for analytics purposes #[serde(skip)] pub degraded: bool, @@ -553,16 +556,23 @@ pub fn perform_search( let (search, is_finite_pagination, max_total_hits, offset) = prepare_search(index, &rtxn, &query, &search_kind, time_budget)?; - let milli::SearchResult { - documents_ids, - matching_words, - candidates, - document_scores, - 
degraded, - used_negative_operator, - .. - } = match &search_kind { - SearchKind::KeywordOnly | SearchKind::SemanticOnly { .. } => search.execute()?, + let ( + milli::SearchResult { + documents_ids, + matching_words, + candidates, + document_scores, + degraded, + used_negative_operator, + }, + semantic_hit_count, + ) = match &search_kind { + SearchKind::KeywordOnly => (search.execute()?, None), + SearchKind::SemanticOnly { .. } => { + let results = search.execute()?; + let semantic_hit_count = results.document_scores.len() as u32; + (results, Some(semantic_hit_count)) + } SearchKind::Hybrid { semantic_ratio, .. } => search.execute_hybrid(*semantic_ratio)?, }; @@ -760,6 +770,7 @@ pub fn perform_search( facet_stats, degraded, used_negative_operator, + semantic_hit_count, }; Ok(result) } diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 8decb7ded..77c4f30a3 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -77,6 +77,16 @@ async fn simple_search() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###); + snapshot!(response["semanticHitCount"], @"0"); + + let (response, code) = index + .search_post( + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###); + snapshot!(response["semanticHitCount"], @"1"); let (response, code) = index .search_post( @@ -85,6 +95,7 @@ async fn simple_search() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###); + snapshot!(response["semanticHitCount"], @"3"); } #[actix_rt::test] @@ -136,6 +147,7 @@ async fn highlighter() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); + snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], @@ -149,6 +161,7 @@ async fn highlighter() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain 
Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_semanticScore":0.9472136}]"###); + snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic let (response, code) = index @@ -163,6 +176,7 @@ async fn highlighter() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}}]"###); + snapshot!(response["semanticHitCount"], @"3"); } #[actix_rt::test] @@ -250,4 +264,20 @@ async fn single_document() { snapshot!(code, @"200 OK"); snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###); + 
snapshot!(response["semanticHitCount"], @"1"); +} + +#[actix_rt::test] +async fn query_combination() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + // search without query and vector, but with hybrid => still placeholder + let (response, code) = index + .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .await; + + snapshot!(code, @"200 OK"); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###); + snapshot!(response["semanticHitCount"], @"1"); } diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index e45652206..fc13a5e1e 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -84,45 +84,73 @@ impl ScoreWithRatioResult { } } - fn merge(left: Self, right: Self, from: usize, length: usize) -> SearchResult { - let mut documents_ids = - Vec::with_capacity(left.document_scores.len() + right.document_scores.len()); - let mut document_scores = - Vec::with_capacity(left.document_scores.len() + right.document_scores.len()); + fn merge( + vector_results: Self, + keyword_results: Self, + from: usize, + length: usize, + ) -> (SearchResult, u32) { + #[derive(Clone, Copy)] + enum ResultSource { + Semantic, + Keyword, + } + let mut semantic_hit_count = 0; + + let mut documents_ids = Vec::with_capacity( + vector_results.document_scores.len() + keyword_results.document_scores.len(), + ); + let mut document_scores = Vec::with_capacity( + vector_results.document_scores.len() + keyword_results.document_scores.len(), + ); let mut documents_seen = RoaringBitmap::new(); - for (docid, (main_score, _sub_score)) in left + for ((docid, (main_score, _sub_score)), source) in vector_results .document_scores .into_iter() - .merge_by(right.document_scores.into_iter(), |(_, left), (_, right)| { - // the first value is the one with 
the greatest score - compare_scores(left, right).is_ge() - }) + .zip(std::iter::repeat(ResultSource::Semantic)) + .merge_by( + keyword_results + .document_scores + .into_iter() + .zip(std::iter::repeat(ResultSource::Keyword)), + |((_, left), _), ((_, right), _)| { + // the first value is the one with the greatest score + compare_scores(left, right).is_ge() + }, + ) // remove documents we already saw - .filter(|(docid, _)| documents_seen.insert(*docid)) + .filter(|((docid, _), _)| documents_seen.insert(*docid)) // start skipping **after** the filter .skip(from) // take **after** skipping .take(length) { + if let ResultSource::Semantic = source { + semantic_hit_count += 1; + } documents_ids.push(docid); // TODO: pass both scores to documents_score in some way? document_scores.push(main_score); } - SearchResult { - matching_words: right.matching_words, - candidates: left.candidates | right.candidates, - documents_ids, - document_scores, - degraded: left.degraded | right.degraded, - used_negative_operator: left.used_negative_operator | right.used_negative_operator, - } + ( + SearchResult { + matching_words: keyword_results.matching_words, + candidates: vector_results.candidates | keyword_results.candidates, + documents_ids, + document_scores, + degraded: vector_results.degraded | keyword_results.degraded, + used_negative_operator: vector_results.used_negative_operator + | keyword_results.used_negative_operator, + }, + semantic_hit_count, + ) } } impl<'a> Search<'a> { - pub fn execute_hybrid(&self, semantic_ratio: f32) -> Result { + pub fn execute_hybrid(&self, semantic_ratio: f32) -> Result<(SearchResult, Option)> { // TODO: find classier way to achieve that than to reset vector and query params // create separate keyword and semantic searches let mut search = Search { @@ -148,14 +176,16 @@ impl<'a> Search<'a> { // completely skip semantic search if the results of the keyword search are good enough if self.results_good_enough(&keyword_results, semantic_ratio) { - 
return Ok(keyword_results); + return Ok((keyword_results, Some(0))); } // no vector search against placeholder search - let Some(query) = search.query.take() else { return Ok(keyword_results) }; + let Some(query) = search.query.take() else { + return Ok((keyword_results, Some(0))); + }; // no embedder, no semantic search let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else { - return Ok(keyword_results); + return Ok((keyword_results, Some(0))); }; let vector_query = match vector { @@ -166,7 +196,7 @@ impl<'a> Search<'a> { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); - return Ok(keyword_results); + return Ok((keyword_results, Some(0))); } } } @@ -181,10 +211,10 @@ impl<'a> Search<'a> { let keyword_results = ScoreWithRatioResult::new(keyword_results, 1.0 - semantic_ratio); let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio); - let merge_results = + let (merge_results, semantic_hit_count) = ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit); assert!(merge_results.documents_ids.len() <= self.limit); - Ok(merge_results) + Ok((merge_results, Some(semantic_hit_count))) } fn results_good_enough(&self, keyword_results: &SearchResult, semantic_ratio: f32) -> bool { From 7c27417a5d5d119d79d7dd130c4e9d085222ff70 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 Apr 2024 10:23:01 +0200 Subject: [PATCH 10/13] Add tests --- meilisearch/tests/search/hybrid.rs | 98 +++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 2 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 77c4f30a3..77cfac3d9 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -278,6 +278,100 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###); - snapshot!(response["semanticHitCount"], @"1"); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["semanticHitCount"], @"null"); + + // same with a different semantic ratio + let (response, code) = index + .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true})) + .await; + + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["semanticHitCount"], @"null"); + + // wrong vector dimensions + let (response, code) = index + .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .await; + + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Invalid vector dimensions: expected: `2`, found: `3`.", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + } + "###); + + // full vector + let (response, code) = index + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .await; + + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], 
@r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462,"_semanticScore":0.77735007},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779,"_semanticScore":0.7236068},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712,"_semanticScore":0.6581139}]"###); + snapshot!(response["semanticHitCount"], @"3"); + + // full keyword, without a query + let (response, code) = index + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .await; + + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["semanticHitCount"], @"null"); + + // query + vector, full keyword => keyword + let (response, code) = index + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .await; + + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.8848484848484849}]"###); + snapshot!(response["semanticHitCount"], 
@"null"); + + // query + vector, no hybrid keyword => + let (response, code) = index + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true})) + .await; + + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Invalid request: missing `hybrid` parameter when both `q` and `vector` are present.", + "code": "missing_search_hybrid", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#missing_search_hybrid" + } + "###); + + // full vector, without a vector => error + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + ) + .await; + + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: attempt to embed the following text in a configuration where embeddings must be user provided: \"Captain\"", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // hybrid without a vector => full keyword + let (response, code) = index + .search_post( + json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}), + ) + .await; + + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848}]"###); + snapshot!(response["semanticHitCount"], @"0"); } From 355e5282b24bbf5e66fe1f18b25620ae70546e61 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 Apr 2024 14:29:17 +0200 Subject: [PATCH 11/13] Remove `_semanticScore` --- meilisearch/src/search.rs | 15 +-------------- meilisearch/tests/search/hybrid.rs | 18 ++++++++++-------- meilisearch/tests/search/mod.rs | 13 ++++++++----- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/meilisearch/src/search.rs 
b/meilisearch/src/search.rs index 85438e816..47c2b5f4b 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -13,7 +13,7 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::ResponseError; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::milli::score_details::{self, ScoreDetails, ScoringStrategy}; +use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; @@ -368,8 +368,6 @@ pub struct SearchHit { pub ranking_score: Option, #[serde(rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")] pub ranking_score_details: Option>, - #[serde(rename = "_semanticScore", skip_serializing_if = "Option::is_none")] - pub semantic_score: Option, } #[derive(Serialize, Debug, Clone, PartialEq)] @@ -683,16 +681,6 @@ pub fn perform_search( insert_geo_distance(sort, &mut document); } - let mut semantic_score = None; - for details in &score { - if let ScoreDetails::Vector(score_details::Vector { similarity: Some(similarity) }) = - details - { - semantic_score = Some(*similarity); - break; - } - } - let ranking_score = query.show_ranking_score.then(|| ScoreDetails::global_score(score.iter())); let ranking_score_details = @@ -704,7 +692,6 @@ pub fn perform_search( matches_position, ranking_score_details, ranking_score, - semantic_score, }; documents.push(hit); } diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 77cfac3d9..637242bc7 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -81,20 +81,20 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}}), + json!({"q": 
"Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"1"); let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -152,6 +152,7 @@ async fn highlighter() { let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, + "showRankingScore": true, "attributesToHighlight": [ "desc" ], @@ -160,13 +161,14 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_semanticScore":0.9472136}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic 
Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 1.0}, + "showRankingScore": true, "attributesToHighlight": [ "desc" ], @@ -175,7 +177,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -263,7 +265,7 @@ async fn single_document() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -311,7 +313,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462,"_semanticScore":0.77735007},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779,"_semanticScore":0.7236068},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712,"_semanticScore":0.6581139}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], 
@"3"); // full keyword, without a query diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index ce45b0d4f..b4350f686 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1040,6 +1040,7 @@ async fn experimental_feature_vector_store() { let (response, code) = index .search_post(json!({ "vector": [1.0, 2.0, 3.0], + "showRankingScore": true })) .await; meili_snap::snapshot!(code, @"400 Bad Request"); @@ -1082,6 +1083,7 @@ async fn experimental_feature_vector_store() { let (response, code) = index .search_post(json!({ "vector": [1.0, 2.0, 3.0], + "showRankingScore": true, })) .await; @@ -1099,7 +1101,7 @@ async fn experimental_feature_vector_store() { 3 ] }, - "_semanticScore": 1.0 + "_rankingScore": 1.0 }, { "title": "Captain Marvel", @@ -1111,7 +1113,7 @@ async fn experimental_feature_vector_store() { 54 ] }, - "_semanticScore": 0.9129112 + "_rankingScore": 0.9129111766815186 }, { "title": "Gläss", @@ -1123,7 +1125,7 @@ async fn experimental_feature_vector_store() { 90 ] }, - "_semanticScore": 0.8106413 + "_rankingScore": 0.8106412887573242 }, { "title": "How to Train Your Dragon: The Hidden World", @@ -1135,7 +1137,7 @@ async fn experimental_feature_vector_store() { 32 ] }, - "_semanticScore": 0.74120104 + "_rankingScore": 0.7412010431289673 }, { "title": "Escape Room", @@ -1146,7 +1148,8 @@ async fn experimental_feature_vector_store() { -23, 32 ] - } + }, + "_rankingScore": 0.6972063183784485 } ] "###); From ca499a03025a2104de90a1a554faa578c6e472b9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 Apr 2024 15:04:20 +0200 Subject: [PATCH 12/13] Fix test after rebase --- meilisearch/tests/search/hybrid.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 637242bc7..68ae4c0aa 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -106,7 +106,7 @@ async fn 
distribution_shift() { let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444,"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669,"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112,"_semanticScore":0.9472136}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index .update_settings(json!({ @@ -127,7 +127,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375,"_semanticScore":0.19161224},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7,"_semanticScore":1.1920929e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7,"_semanticScore":1.1920929e-7}]"###); + 
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] From a9013ed68316b68b2a4a9e069c16c070aaa161e1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 4 Apr 2024 17:21:47 +0200 Subject: [PATCH 13/13] Fix comment mistake Co-authored-by: Tamo --- meilisearch/src/routes/indexes/search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 0f7d3b1ee..5581e6a68 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -277,7 +277,7 @@ pub fn search_kind( features.check_vector("Passing `hybrid` as a query parameter")?; } - // regardless of anything, always do a semantic search when we don't have a vector and the query is whitespace or missing + // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing if query.vector.is_none() { match &query.q { Some(q) if q.trim().is_empty() => return Ok(SearchKind::KeywordOnly),