Small commit to add hybrid search and autoembedding

Louis Dureuil 2023-11-15 15:46:37 +01:00
parent 21bcf32109
commit 13c2c6c16b
42 changed files with 4045 additions and 246 deletions


@@ -686,7 +686,7 @@ impl SearchAggregator {
ret.max_terms_number = q.split_whitespace().count();
}
if let Some(ref vector) = vector {
if let Some(meilisearch_types::milli::VectorQuery::Vector(ref vector)) = vector {
ret.max_vector_size = vector.len();
}
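Only a raw embedding has a measurable size, hence the narrowed pattern above. Judging from the variants used throughout this commit, milli's `VectorQuery` is presumably an enum along these lines (a sketch for orientation; the exact definition and derives are not part of this diff):

pub enum VectorQuery {
    // A textual prompt that Meilisearch must embed itself (autoembedding).
    String(String),
    // A pre-computed embedding supplied directly by the caller.
    Vector(Vec<f32>),
}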


@@ -19,7 +19,11 @@ static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
/// does all the setup before meilisearch is launched
fn setup(opt: &Opt) -> anyhow::Result<()> {
let mut log_builder = env_logger::Builder::new();
log_builder.parse_filters(&opt.log_level.to_string());
let log_filters = format!(
"{},h2=warn,hyper=warn,tokio_util=warn,tracing=warn,rustls=warn,mio=warn,reqwest=warn",
opt.log_level
);
log_builder.parse_filters(&log_filters);
log_builder.init();
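The filter string handed to env_logger is a comma-separated list of `target=level` directives, so this change pins the noisy HTTP/TLS/runtime crates to `warn` while Meilisearch's own code keeps the configured level. As an illustration (assuming the log level renders as `debug`), the resulting string would be:

// "debug,h2=warn,hyper=warn,tokio_util=warn,tracing=warn,rustls=warn,mio=warn,reqwest=warn"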


@@ -7,6 +7,7 @@ use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::VectorQuery;
use serde_json::Value;
use crate::analytics::{Analytics, FacetSearchAggregator};
@@ -117,7 +118,7 @@ impl From<FacetSearchQuery> for SearchQuery {
highlight_post_tag: DEFAULT_HIGHLIGHT_POST_TAG(),
crop_marker: DEFAULT_CROP_MARKER(),
matching_strategy,
vector,
vector: vector.map(VectorQuery::Vector),
attributes_to_search_on,
}
}
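Facet search still only accepts a pre-computed embedding, so its optional float array is simply wrapped into the new enum. A minimal illustration of that mapping (values made up):

let vector: Option<Vec<f32>> = Some(vec![0.1, 0.2, 0.3]);
let wrapped: Option<VectorQuery> = vector.map(VectorQuery::Vector);
assert!(matches!(wrapped, Some(VectorQuery::Vector(_))));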


@@ -2,12 +2,13 @@ use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse};
use deserr::actix_web::{AwebJson, AwebQueryParameter};
use index_scheduler::IndexScheduler;
use log::debug;
use log::{debug, warn};
use meilisearch_types::deserr::query_params::Param;
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::VectorQuery;
use meilisearch_types::serde_cs::vec::CS;
use serde_json::Value;
@@ -88,7 +89,7 @@ impl From<SearchQueryGet> for SearchQuery {
Self {
q: other.q,
vector: other.vector.map(CS::into_inner),
vector: other.vector.map(CS::into_inner).map(VectorQuery::Vector),
offset: other.offset.0,
limit: other.limit.0,
page: other.page.as_deref().copied(),
@@ -193,6 +194,9 @@ pub async fn search_with_post(
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features();
embed(&mut query, index_scheduler.get_ref(), &index).await?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
if let Ok(ref search_result) = search_result {
@@ -206,6 +210,38 @@ pub async fn search_with_post(
Ok(HttpResponse::Ok().json(search_result))
}
pub async fn embed(
query: &mut SearchQuery,
index_scheduler: &IndexScheduler,
index: &meilisearch_types::milli::Index,
) -> Result<(), ResponseError> {
if let Some(VectorQuery::String(prompt)) = query.vector.take() {
let embedder_configs = index.embedding_configs(&index.read_txn()?)?;
let embedder = index_scheduler.embedders(embedder_configs)?;
/// FIXME: add error if no embedder, remove unwrap, support multiple embedders
let embeddings = embedder
.get("default")
.unwrap()
.0
.embed(vec![prompt])
.await
.map_err(meilisearch_types::milli::vector::Error::from)
.map_err(meilisearch_types::milli::UserError::from)
.map_err(meilisearch_types::milli::Error::from)?
.pop()
.expect("No vector returned from embedding");
if embeddings.iter().nth(1).is_some() {
warn!("Ignoring embeddings past the first one in long search query");
query.vector = Some(VectorQuery::Vector(embeddings.iter().next().unwrap().to_vec()));
} else {
query.vector = Some(VectorQuery::Vector(embeddings.into_inner()));
}
};
Ok(())
}
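For orientation, `embed` rewrites a textual `vector` in place before the search executes: a `VectorQuery::String` prompt is run through the index's "default" embedder and replaced by the resulting `VectorQuery::Vector`; queries that already carry a vector, or none at all, pass through untouched. A rough before/after sketch (prompt and values made up, enum shape as sketched earlier):

let mut vector = Some(VectorQuery::String("comfortable running shoes".to_string()));
// ... after `embed(&mut query, ...)` the same field holds the embedder's output:
vector = Some(VectorQuery::Vector(vec![0.12, -0.07, 0.33 /* length depends on the model */]));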
#[cfg(test)]
mod test {
use super::*;


@@ -13,6 +13,7 @@ use crate::analytics::{Analytics, MultiSearchAggregator};
use crate::extractors::authentication::policies::ActionPolicy;
use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::indexes::search::embed;
use crate::search::{
add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex,
};
@@ -74,6 +75,8 @@ pub async fn multi_search_with_post(
})
.with_index(query_index)?;
embed(&mut query, index_scheduler.get_ref(), &index).await.with_index(query_index)?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features))
.await


@@ -16,6 +16,7 @@ use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
use meilisearch_types::milli::{
dot_product_similarity, FacetValueHit, InternalError, OrderBy, SearchForFacetValues,
VectorQuery,
};
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
use meilisearch_types::{milli, Document};
@@ -46,7 +47,7 @@ pub struct SearchQuery {
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchVector>)]
pub vector: Option<Vec<f32>>,
pub vector: Option<milli::VectorQuery>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
@@ -105,7 +106,7 @@ pub struct SearchQueryWithIndex {
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub vector: Option<Vec<f32>>,
pub vector: Option<VectorQuery>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
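With the field now typed as `VectorQuery`, the `vector` parameter presumably accepts either an array of floats or a plain string to embed server-side; the exact deserialization rules live in milli's Deserr implementation for `VectorQuery`, which is not part of this diff. Assumed request shapes, for illustration only:

// { "vector": [0.12, -0.07, 0.33] }         -> VectorQuery::Vector(..), used as-is
// { "vector": "comfortable running shoes" } -> VectorQuery::String(..), embedded by `embed` first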
@@ -339,11 +340,18 @@ fn prepare_search<'t>(
let mut search = index.search(rtxn);
if query.vector.is_some() && query.q.is_some() {
warn!("Ignoring the query string `q` when used with the `vector` parameter.");
warn!("Attempting hybrid search");
}
if let Some(ref vector) = query.vector {
search.vector(vector.clone());
match vector {
VectorQuery::Vector(vector) => {
search.vector(vector.clone());
}
VectorQuery::String(_) => {
panic!("Failed while preparing search; caller did not generate embedding for query")
}
}
}
if let Some(ref query) = query.q {
@@ -375,7 +383,7 @@ fn prepare_search<'t>(
}
if query.vector.is_some() {
features.check_vector()?;
features.check_vector("Passing `vector` as a query parameter")?;
}
// compute the offset on the limit depending on the pagination mode.
@@ -429,7 +437,11 @@ pub fn perform_search(
prepare_search(index, &rtxn, &query, features)?;
let milli::SearchResult { documents_ids, matching_words, candidates, document_scores, .. } =
search.execute()?;
if query.q.is_some() && query.vector.is_some() {
search.execute_hybrid()?
} else {
search.execute()?
};
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
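When both `q` and `vector` are present the search now goes through `execute_hybrid`; its merging logic lives in milli and is not shown in this diff. Purely as a generic illustration of what merging a keyword-ranked list with a semantically-ranked list can look like (this is not milli's algorithm):

use std::collections::HashMap;

// Both inputs are (doc_id, score) pairs where a higher score is better; keep each
// document's best score across the two lists and sort by it.
fn naive_hybrid_merge(keyword: Vec<(u32, f64)>, semantic: Vec<(u32, f64)>) -> Vec<u32> {
    let mut best: HashMap<u32, f64> = HashMap::new();
    for (doc, score) in keyword.into_iter().chain(semantic) {
        let entry = best.entry(doc).or_insert(f64::MIN);
        if score > *entry {
            *entry = score;
        }
    }
    let mut merged: Vec<(u32, f64)> = best.into_iter().collect();
    merged.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    merged.into_iter().map(|(doc, _)| doc).collect()
}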
@@ -538,13 +550,13 @@ pub fn perform_search(
insert_geo_distance(sort, &mut document);
}
let semantic_score = match query.vector.as_ref() {
let semantic_score = /*match query.vector.as_ref() {
Some(vector) => match extract_field("_vectors", &fields_ids_map, obkv)? {
Some(vectors) => compute_semantic_score(vector, vectors)?,
None => None,
},
None => None,
};
};*/ None;
let ranking_score =
query.show_ranking_score.then(|| ScoreDetails::global_score(score.iter()));
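The per-document semantic score is temporarily disabled here: the whole lookup is commented out and `None` is returned instead. Conceptually it compared the query embedding with the document's stored `_vectors`, which is what the `dot_product_similarity` import above supports; as a generic illustration of that kind of similarity (not milli's exact implementation):

// Dot product of two equal-length embeddings; for normalized vectors, higher means more similar.
fn dot_product(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}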
@@ -629,7 +641,8 @@ pub fn perform_search(
hits: documents,
hits_info,
query: query.q.unwrap_or_default(),
vector: query.vector,
// FIXME: display input vector
vector: None,
processing_time_ms: before_search.elapsed().as_millis(),
facet_distribution,
facet_stats,