4621: Bring back changes from v1.8.0 into main r=curquiza a=curquiza



Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-05-06 13:46:39 +00:00 committed by GitHub
commit 4d5971f343
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
56 changed files with 1450 additions and 802 deletions

View file

@ -7,7 +7,6 @@ use serde_json::Value;
use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind};
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery;
use crate::Opt;
pub struct MockAnalytics {
@ -86,6 +85,4 @@ impl Analytics for MockAnalytics {
}
fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn get_tasks(&self, _query: &TasksFilterQuery, _request: &HttpRequest) {}
fn health_seen(&self, _request: &HttpRequest) {}
}

View file

@ -14,7 +14,6 @@ use platform_dirs::AppDirs;
use serde_json::Value;
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery;
// if the analytics feature is disabled
// the `SegmentAnalytics` point to the mock instead of the real analytics
@ -117,10 +116,4 @@ pub trait Analytics: Sync + Send {
index_creation: bool,
request: &HttpRequest,
);
// this method should be called to aggregate the get tasks requests.
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest);
// this method should be called to aggregate a add documents request
fn health_seen(&self, request: &HttpRequest);
}

View file

@ -33,7 +33,6 @@ use crate::option::{
};
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::indexes::facet_search::FacetSearchQuery;
use crate::routes::tasks::TasksFilterQuery;
use crate::routes::{create_all_stats, Stats};
use crate::search::{
FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult,
@ -81,8 +80,6 @@ pub enum AnalyticsMsg {
AggregateUpdateDocuments(DocumentsAggregator),
AggregateGetFetchDocuments(DocumentsFetchAggregator),
AggregatePostFetchDocuments(DocumentsFetchAggregator),
AggregateTasks(TasksAggregator),
AggregateHealth(HealthAggregator),
}
pub struct SegmentAnalytics {
@ -152,8 +149,6 @@ impl SegmentAnalytics {
update_documents_aggregator: DocumentsAggregator::default(),
get_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
post_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
get_tasks_aggregator: TasksAggregator::default(),
health_aggregator: HealthAggregator::default(),
});
tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone()));
@ -231,16 +226,6 @@ impl super::Analytics for SegmentAnalytics {
let aggregate = DocumentsFetchAggregator::from_query(documents_query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate));
}
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest) {
let aggregate = TasksAggregator::from_query(query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregateTasks(aggregate));
}
fn health_seen(&self, request: &HttpRequest) {
let aggregate = HealthAggregator::from_query(request);
let _ = self.sender.try_send(AnalyticsMsg::AggregateHealth(aggregate));
}
}
/// This structure represent the `infos` field we send in the analytics.
@ -394,8 +379,6 @@ pub struct Segment {
update_documents_aggregator: DocumentsAggregator,
get_fetch_documents_aggregator: DocumentsFetchAggregator,
post_fetch_documents_aggregator: DocumentsFetchAggregator,
get_tasks_aggregator: TasksAggregator,
health_aggregator: HealthAggregator,
}
impl Segment {
@ -458,8 +441,6 @@ impl Segment {
Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateTasks(agreg)) => self.get_tasks_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg),
None => (),
}
}
@ -513,8 +494,6 @@ impl Segment {
update_documents_aggregator,
get_fetch_documents_aggregator,
post_fetch_documents_aggregator,
get_tasks_aggregator,
health_aggregator,
} = self;
if let Some(get_search) =
@ -562,12 +541,6 @@ impl Segment {
{
let _ = self.batcher.push(post_fetch_documents).await;
}
if let Some(get_tasks) = take(get_tasks_aggregator).into_event(user, "Tasks Seen") {
let _ = self.batcher.push(get_tasks).await;
}
if let Some(health) = take(health_aggregator).into_event(user, "Health Seen") {
let _ = self.batcher.push(health).await;
}
let _ = self.batcher.flush().await;
}
}
@ -1503,176 +1476,6 @@ impl DocumentsDeletionAggregator {
}
}
#[derive(Default, Serialize)]
pub struct TasksAggregator {
#[serde(skip)]
timestamp: Option<OffsetDateTime>,
// context
#[serde(rename = "user-agent")]
user_agents: HashSet<String>,
filtered_by_uid: bool,
filtered_by_index_uid: bool,
filtered_by_type: bool,
filtered_by_status: bool,
filtered_by_canceled_by: bool,
filtered_by_before_enqueued_at: bool,
filtered_by_after_enqueued_at: bool,
filtered_by_before_started_at: bool,
filtered_by_after_started_at: bool,
filtered_by_before_finished_at: bool,
filtered_by_after_finished_at: bool,
total_received: usize,
}
impl TasksAggregator {
pub fn from_query(query: &TasksFilterQuery, request: &HttpRequest) -> Self {
let TasksFilterQuery {
limit: _,
from: _,
uids,
index_uids,
types,
statuses,
canceled_by,
before_enqueued_at,
after_enqueued_at,
before_started_at,
after_started_at,
before_finished_at,
after_finished_at,
} = query;
Self {
timestamp: Some(OffsetDateTime::now_utc()),
user_agents: extract_user_agents(request).into_iter().collect(),
filtered_by_uid: uids.is_some(),
filtered_by_index_uid: index_uids.is_some(),
filtered_by_type: types.is_some(),
filtered_by_status: statuses.is_some(),
filtered_by_canceled_by: canceled_by.is_some(),
filtered_by_before_enqueued_at: before_enqueued_at.is_some(),
filtered_by_after_enqueued_at: after_enqueued_at.is_some(),
filtered_by_before_started_at: before_started_at.is_some(),
filtered_by_after_started_at: after_started_at.is_some(),
filtered_by_before_finished_at: before_finished_at.is_some(),
filtered_by_after_finished_at: after_finished_at.is_some(),
total_received: 1,
}
}
/// Aggregate one [TasksAggregator] into another.
pub fn aggregate(&mut self, other: Self) {
let Self {
timestamp,
user_agents,
total_received,
filtered_by_uid,
filtered_by_index_uid,
filtered_by_type,
filtered_by_status,
filtered_by_canceled_by,
filtered_by_before_enqueued_at,
filtered_by_after_enqueued_at,
filtered_by_before_started_at,
filtered_by_after_started_at,
filtered_by_before_finished_at,
filtered_by_after_finished_at,
} = other;
if self.timestamp.is_none() {
self.timestamp = timestamp;
}
// we can't create a union because there is no `into_union` method
for user_agent in user_agents {
self.user_agents.insert(user_agent);
}
self.filtered_by_uid |= filtered_by_uid;
self.filtered_by_index_uid |= filtered_by_index_uid;
self.filtered_by_type |= filtered_by_type;
self.filtered_by_status |= filtered_by_status;
self.filtered_by_canceled_by |= filtered_by_canceled_by;
self.filtered_by_before_enqueued_at |= filtered_by_before_enqueued_at;
self.filtered_by_after_enqueued_at |= filtered_by_after_enqueued_at;
self.filtered_by_before_started_at |= filtered_by_before_started_at;
self.filtered_by_after_started_at |= filtered_by_after_started_at;
self.filtered_by_before_finished_at |= filtered_by_before_finished_at;
self.filtered_by_after_finished_at |= filtered_by_after_finished_at;
self.filtered_by_after_finished_at |= filtered_by_after_finished_at;
self.total_received = self.total_received.saturating_add(total_received);
}
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
// if we had no timestamp it means we never encountered any events and
// thus we don't need to send this event.
let timestamp = self.timestamp?;
Some(Track {
timestamp: Some(timestamp),
user: user.clone(),
event: event_name.to_string(),
properties: serde_json::to_value(self).ok()?,
..Default::default()
})
}
}
#[derive(Default, Serialize)]
pub struct HealthAggregator {
#[serde(skip)]
timestamp: Option<OffsetDateTime>,
// context
#[serde(rename = "user-agent")]
user_agents: HashSet<String>,
#[serde(rename = "requests.total_received")]
total_received: usize,
}
impl HealthAggregator {
pub fn from_query(request: &HttpRequest) -> Self {
Self {
timestamp: Some(OffsetDateTime::now_utc()),
user_agents: extract_user_agents(request).into_iter().collect(),
total_received: 1,
}
}
/// Aggregate one [HealthAggregator] into another.
pub fn aggregate(&mut self, other: Self) {
let Self { timestamp, user_agents, total_received } = other;
if self.timestamp.is_none() {
self.timestamp = timestamp;
}
// we can't create a union because there is no `into_union` method
for user_agent in user_agents {
self.user_agents.insert(user_agent);
}
self.total_received = self.total_received.saturating_add(total_received);
}
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
// if we had no timestamp it means we never encountered any events and
// thus we don't need to send this event.
let timestamp = self.timestamp?;
Some(Track {
timestamp: Some(timestamp),
user: user.clone(),
event: event_name.to_string(),
properties: serde_json::to_value(self).ok()?,
..Default::default()
})
}
}
#[derive(Default, Serialize)]
pub struct DocumentsFetchAggregator {
#[serde(skip)]

View file

@ -13,6 +13,7 @@ use byte_unit::{Byte, ByteError};
use clap::Parser;
use meilisearch_types::features::InstanceTogglableFeatures;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::ThreadPoolNoAbortBuilder;
use rustls::server::{
AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache,
};
@ -666,7 +667,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
type Error = anyhow::Error;
fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> {
let thread_pool = rayon::ThreadPoolBuilder::new()
let thread_pool = ThreadPoolNoAbortBuilder::new()
.thread_name(|index| format!("indexing-thread:{index}"))
.num_threads(*other.max_indexing_threads)
.build()?;

View file

@ -269,12 +269,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
pub async fn get_index_stats(
index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
index_uid: web::Path<String>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": true }), Some(&req));
let stats = IndexStats::from(index_scheduler.index_stats(&index_uid)?);
debug!(returns = ?stats, "Get index stats");

View file

@ -137,10 +137,8 @@ macro_rules! make_setting_route {
let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?;
debug!(returns = ?settings, "Update settings");
let mut json = serde_json::json!(&settings);
let val = json[$camelcase_attr].take();
Ok(HttpResponse::Ok().json(val))
Ok(HttpResponse::Ok().json(settings.$attr))
}
pub fn resources() -> Resource {

View file

@ -8,11 +8,9 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::settings::{Settings, Unchecked};
use meilisearch_types::tasks::{Kind, Status, Task, TaskId};
use serde::{Deserialize, Serialize};
use serde_json::json;
use time::OffsetDateTime;
use tracing::debug;
use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::search_queue::SearchQueue;
@ -296,10 +294,7 @@ pub struct Stats {
async fn get_stats(
index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
auth_controller: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<AuthController>>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": false }), Some(&req));
let filters = index_scheduler.filters();
let stats = create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), filters)?;
@ -355,11 +350,7 @@ struct VersionResponse {
async fn get_version(
_index_scheduler: GuardedData<ActionPolicy<{ actions::VERSION }>, Data<IndexScheduler>>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> HttpResponse {
analytics.publish("Version Seen".to_string(), json!(null), Some(&req));
let build_info = build_info::BuildInfo::from_build();
HttpResponse::Ok().json(VersionResponse {
@ -377,14 +368,10 @@ async fn get_version(
}
pub async fn get_health(
req: HttpRequest,
index_scheduler: Data<IndexScheduler>,
auth_controller: Data<AuthController>,
search_queue: Data<SearchQueue>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
analytics.health_seen(&req);
search_queue.health().unwrap();
index_scheduler.health().unwrap();
auth_controller.health().unwrap();

View file

@ -270,12 +270,8 @@ pub struct AllTasks {
async fn get_tasks(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
params: AwebQueryParameter<TasksFilterQuery, DeserrQueryParamError>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let mut params = params.into_inner();
analytics.get_tasks(&params, &req);
// We +1 just to know if there is more after this "page" or not.
params.limit.0 = params.limit.0.saturating_add(1);
let limit = params.limit.0;
@ -298,8 +294,6 @@ async fn get_tasks(
async fn get_task(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
task_uid: web::Path<String>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let task_uid_string = task_uid.into_inner();
@ -310,8 +304,6 @@ async fn get_task(
}
};
analytics.publish("Tasks Seen".to_string(), json!({ "per_task_uid": true }), Some(&req));
let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() };
let filters = index_scheduler.filters();
let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?;

View file

@ -1,3 +1,4 @@
use core::fmt;
use std::cmp::min;
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::str::FromStr;
@ -39,7 +40,7 @@ pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5);
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
#[derive(Clone, Default, PartialEq, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct SearchQuery {
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
@ -88,6 +89,110 @@ pub struct SearchQuery {
pub attributes_to_search_on: Option<Vec<String>>,
}
// Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum.
// - Only what IS used, we know everything else is set to None so there is no need to print it
// - Re-order the most important field to debug first
impl fmt::Debug for SearchQuery {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let Self {
q,
vector,
hybrid,
offset,
limit,
page,
hits_per_page,
attributes_to_retrieve,
attributes_to_crop,
crop_length,
attributes_to_highlight,
show_matches_position,
show_ranking_score,
show_ranking_score_details,
filter,
sort,
facets,
highlight_pre_tag,
highlight_post_tag,
crop_marker,
matching_strategy,
attributes_to_search_on,
} = self;
let mut debug = f.debug_struct("SearchQuery");
// First, everything related to the number of documents to retrieve
debug.field("limit", &limit).field("offset", &offset);
if let Some(page) = page {
debug.field("page", &page);
}
if let Some(hits_per_page) = hits_per_page {
debug.field("hits_per_page", &hits_per_page);
}
// Then, everything related to the queries
if let Some(q) = q {
debug.field("q", &q);
}
if let Some(v) = vector {
if v.len() < 10 {
debug.field("vector", &v);
} else {
debug.field(
"vector",
&format!("[{}, {}, {}, ... {} dimensions]", v[0], v[1], v[2], v.len()),
);
}
}
if let Some(hybrid) = hybrid {
debug.field("hybrid", &hybrid);
}
if let Some(attributes_to_search_on) = attributes_to_search_on {
debug.field("attributes_to_search_on", &attributes_to_search_on);
}
if let Some(filter) = filter {
debug.field("filter", &filter);
}
if let Some(sort) = sort {
debug.field("sort", &sort);
}
if let Some(facets) = facets {
debug.field("facets", &facets);
}
debug.field("matching_strategy", &matching_strategy);
// Then everything related to the formatting
debug.field("crop_length", &crop_length);
if *show_matches_position {
debug.field("show_matches_position", show_matches_position);
}
if *show_ranking_score {
debug.field("show_ranking_score", show_ranking_score);
}
if *show_ranking_score_details {
debug.field("self.show_ranking_score_details", show_ranking_score_details);
}
debug.field("crop_length", &crop_length);
if let Some(facets) = facets {
debug.field("facets", &facets);
}
if let Some(attributes_to_retrieve) = attributes_to_retrieve {
debug.field("attributes_to_retrieve", &attributes_to_retrieve);
}
if let Some(attributes_to_crop) = attributes_to_crop {
debug.field("attributes_to_crop", &attributes_to_crop);
}
if let Some(attributes_to_highlight) = attributes_to_highlight {
debug.field("attributes_to_highlight", &attributes_to_highlight);
}
debug.field("highlight_pre_tag", &highlight_pre_tag);
debug.field("highlight_post_tag", &highlight_post_tag);
debug.field("crop_marker", &crop_marker);
debug.finish()
}
}
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
#[deserr(error = DeserrJsonError<InvalidHybridQuery>, rename_all = camelCase, deny_unknown_fields)]
pub struct HybridQuery {
@ -370,7 +475,7 @@ pub struct SearchHit {
pub ranking_score_details: Option<serde_json::Map<String, serde_json::Value>>,
}
#[derive(Serialize, Debug, Clone, PartialEq)]
#[derive(Serialize, Clone, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub hits: Vec<SearchHit>,
@ -393,6 +498,46 @@ pub struct SearchResult {
pub used_negative_operator: bool,
}
impl fmt::Debug for SearchResult {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let SearchResult {
hits,
query,
processing_time_ms,
hits_info,
facet_distribution,
facet_stats,
semantic_hit_count,
degraded,
used_negative_operator,
} = self;
let mut debug = f.debug_struct("SearchResult");
// The most important thing when looking at a search result is the time it took to process
debug.field("processing_time_ms", &processing_time_ms);
debug.field("hits", &format!("[{} hits returned]", hits.len()));
debug.field("query", &query);
debug.field("hits_info", &hits_info);
if *used_negative_operator {
debug.field("used_negative_operator", used_negative_operator);
}
if *degraded {
debug.field("degraded", degraded);
}
if let Some(facet_distribution) = facet_distribution {
debug.field("facet_distribution", &facet_distribution);
}
if let Some(facet_stats) = facet_stats {
debug.field("facet_stats", &facet_stats);
}
if let Some(semantic_hit_count) = semantic_hit_count {
debug.field("semantic_hit_count", &semantic_hit_count);
}
debug.finish()
}
}
#[derive(Serialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct SearchResultWithIndex {