From 5675585fe8b4f51eed7b08bb30e1fed0f711e340 Mon Sep 17 00:00:00 2001 From: Tamo Date: Sun, 20 Oct 2024 17:54:43 +0200 Subject: [PATCH] move all the searches structures to new modules --- meilisearch/src/analytics/mod.rs | 4 - .../src/analytics/segment_analytics.rs | 868 +----------------- meilisearch/src/routes/indexes/mod.rs | 2 + meilisearch/src/routes/indexes/search.rs | 4 +- .../src/routes/indexes/search_analytics.rs | 485 ++++++++++ meilisearch/src/routes/indexes/similar.rs | 4 +- .../src/routes/indexes/similar_analytics.rs | 235 +++++ meilisearch/src/routes/mod.rs | 1 + meilisearch/src/routes/multi_search.rs | 4 +- .../src/routes/multi_search_analytics.rs | 170 ++++ 10 files changed, 903 insertions(+), 874 deletions(-) create mode 100644 meilisearch/src/routes/indexes/search_analytics.rs create mode 100644 meilisearch/src/routes/indexes/similar_analytics.rs create mode 100644 meilisearch/src/routes/multi_search_analytics.rs diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index d72ab9d01..bd14b0bfa 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -15,13 +15,9 @@ use platform_dirs::AppDirs; // if the feature analytics is enabled we use the real analytics pub type SegmentAnalytics = segment_analytics::SegmentAnalytics; -pub use segment_analytics::SearchAggregator; -pub use segment_analytics::SimilarAggregator; use crate::Opt; -pub use self::segment_analytics::MultiSearchAggregator; - /// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name. #[macro_export] macro_rules! empty_analytics { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 96a0a676c..7dc746b14 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -1,5 +1,5 @@ use std::any::TypeId; -use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -11,10 +11,8 @@ use byte_unit::Byte; use index_scheduler::IndexScheduler; use meilisearch_auth::{AuthController, AuthFilter}; use meilisearch_types::features::RuntimeTogglableFeatures; -use meilisearch_types::locales::Locale; use meilisearch_types::InstanceUid; use once_cell::sync::Lazy; -use regex::Regex; use segment::message::{Identify, Track, User}; use segment::{AutoBatcher, Batcher, HttpClient}; use serde::Serialize; @@ -25,17 +23,12 @@ use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; use uuid::Uuid; -use super::{config_user_id_path, Aggregate, AggregateMethod, MEILISEARCH_CONFIG_PATH}; +use super::{config_user_id_path, Aggregate, MEILISEARCH_CONFIG_PATH}; use crate::option::{ default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot, }; use crate::routes::{create_all_stats, Stats}; -use crate::search::{ - FederatedSearch, SearchQuery, SearchQueryWithIndex, SearchResult, SimilarQuery, SimilarResult, - DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, - DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEMANTIC_RATIO, -}; -use crate::{aggregate_methods, Opt}; +use crate::Opt; const ANALYTICS_HEADER: &str = "X-Meilisearch-Client"; @@ -489,858 +482,3 @@ impl Segment { let _ = self.batcher.flush().await; } } - -#[derive(Default)] -pub struct SearchAggregator { - // requests - total_received: usize, - total_succeeded: usize, - total_degraded: usize, - total_used_negative_operator: usize, - time_spent: BinaryHeap, - - // sort - sort_with_geo_point: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - sort_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - sort_total_number_of_criteria: usize, - - // distinct - distinct: bool, - - // filter - filter_with_geo_radius: bool, - filter_with_geo_bounding_box: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - filter_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - filter_total_number_of_criteria: usize, - used_syntax: HashMap, - - // attributes_to_search_on - // every time a search is done using attributes_to_search_on - attributes_to_search_on_total_number_of_uses: usize, - - // q - // The maximum number of terms in a q request - max_terms_number: usize, - - // vector - // The maximum number of floats in a vector request - max_vector_size: usize, - // Whether the semantic ratio passed to a hybrid search equals the default ratio. - semantic_ratio: bool, - hybrid: bool, - retrieve_vectors: bool, - - // every time a search is done, we increment the counter linked to the used settings - matching_strategy: HashMap, - - // List of the unique Locales passed as parameter - locales: BTreeSet, - - // pagination - max_limit: usize, - max_offset: usize, - finite_pagination: usize, - - // formatting - max_attributes_to_retrieve: usize, - max_attributes_to_highlight: usize, - highlight_pre_tag: bool, - highlight_post_tag: bool, - max_attributes_to_crop: usize, - crop_marker: bool, - show_matches_position: bool, - crop_length: bool, - - // facets - facets_sum_of_terms: usize, - facets_total_number_of_facets: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - ranking_score_threshold: bool, - - marker: std::marker::PhantomData, -} - -impl SearchAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SearchQuery) -> Self { - let SearchQuery { - q, - vector, - offset, - limit, - page, - hits_per_page, - attributes_to_retrieve: _, - retrieve_vectors, - attributes_to_crop: _, - crop_length, - attributes_to_highlight: _, - show_matches_position, - show_ranking_score, - show_ranking_score_details, - filter, - sort, - distinct, - facets: _, - highlight_pre_tag, - highlight_post_tag, - crop_marker, - matching_strategy, - attributes_to_search_on, - hybrid, - ranking_score_threshold, - locales, - } = query; - - let mut ret = Self::default(); - - ret.total_received = 1; - - if let Some(ref sort) = sort { - ret.sort_total_number_of_criteria = 1; - ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint(")); - ret.sort_sum_of_criteria_terms = sort.len(); - } - - ret.distinct = distinct.is_some(); - - if let Some(ref filter) = filter { - static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); - ret.filter_total_number_of_criteria = 1; - - let syntax = match filter { - Value::String(_) => "string".to_string(), - Value::Array(values) => { - if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { - "mixed".to_string() - } else { - "array".to_string() - } - } - _ => "none".to_string(), - }; - // convert the string to a HashMap - ret.used_syntax.insert(syntax, 1); - - let stringified_filters = filter.to_string(); - ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); - ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); - ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); - } - - // attributes_to_search_on - if attributes_to_search_on.is_some() { - ret.attributes_to_search_on_total_number_of_uses = 1; - } - - if let Some(ref q) = q { - ret.max_terms_number = q.split_whitespace().count(); - } - - if let Some(ref vector) = vector { - ret.max_vector_size = vector.len(); - } - ret.retrieve_vectors |= retrieve_vectors; - - if query.is_finite_pagination() { - let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); - ret.max_limit = limit; - ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit; - ret.finite_pagination = 1; - } else { - ret.max_limit = *limit; - ret.max_offset = *offset; - ret.finite_pagination = 0; - } - - ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); - - if let Some(locales) = locales { - ret.locales = locales.iter().copied().collect(); - } - - ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); - ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); - ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); - ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH(); - ret.show_matches_position = *show_matches_position; - - ret.show_ranking_score = *show_ranking_score; - ret.show_ranking_score_details = *show_ranking_score_details; - ret.ranking_score_threshold = ranking_score_threshold.is_some(); - - if let Some(hybrid) = hybrid { - ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); - ret.hybrid = true; - } - - ret - } - - pub fn succeed(&mut self, result: &SearchResult) { - let SearchResult { - hits: _, - query: _, - processing_time_ms, - hits_info: _, - semantic_hit_count: _, - facet_distribution: _, - facet_stats: _, - degraded, - used_negative_operator, - } = result; - - self.total_succeeded = self.total_succeeded.saturating_add(1); - if *degraded { - self.total_degraded = self.total_degraded.saturating_add(1); - } - if *used_negative_operator { - self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); - } - self.time_spent.push(*processing_time_ms as usize); - } -} - -aggregate_methods!( - SearchGET => "Documents Searched GET", - SearchPOST => "Documents Searched POST", -); - -impl Aggregate for SearchAggregator { - fn event_name(&self) -> &'static str { - Method::event_name() - } - - fn aggregate(mut self: Box, new: Box) -> Box { - let Self { - total_received, - total_succeeded, - mut time_spent, - sort_with_geo_point, - sort_sum_of_criteria_terms, - sort_total_number_of_criteria, - distinct, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - attributes_to_search_on_total_number_of_uses, - max_terms_number, - max_vector_size, - retrieve_vectors, - matching_strategy, - max_limit, - max_offset, - finite_pagination, - max_attributes_to_retrieve, - max_attributes_to_highlight, - highlight_pre_tag, - highlight_post_tag, - max_attributes_to_crop, - crop_marker, - show_matches_position, - crop_length, - facets_sum_of_terms, - facets_total_number_of_facets, - show_ranking_score, - show_ranking_score_details, - semantic_ratio, - hybrid, - total_degraded, - total_used_negative_operator, - ranking_score_threshold, - mut locales, - marker: _, - } = *new; - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.total_degraded = self.total_degraded.saturating_add(total_degraded); - self.total_used_negative_operator = - self.total_used_negative_operator.saturating_add(total_used_negative_operator); - self.time_spent.append(&mut time_spent); - - // sort - self.sort_with_geo_point |= sort_with_geo_point; - self.sort_sum_of_criteria_terms = - self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms); - self.sort_total_number_of_criteria = - self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); - - // distinct - self.distinct |= distinct; - - // filter - self.filter_with_geo_radius |= filter_with_geo_radius; - self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; - self.filter_sum_of_criteria_terms = - self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); - self.filter_total_number_of_criteria = - self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); - for (key, value) in used_syntax.into_iter() { - let used_syntax = self.used_syntax.entry(key).or_insert(0); - *used_syntax = used_syntax.saturating_add(value); - } - - // attributes_to_search_on - self.attributes_to_search_on_total_number_of_uses = self - .attributes_to_search_on_total_number_of_uses - .saturating_add(attributes_to_search_on_total_number_of_uses); - - // q - self.max_terms_number = self.max_terms_number.max(max_terms_number); - - // vector - self.max_vector_size = self.max_vector_size.max(max_vector_size); - self.retrieve_vectors |= retrieve_vectors; - self.semantic_ratio |= semantic_ratio; - self.hybrid |= hybrid; - - // pagination - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - self.finite_pagination += finite_pagination; - - // formatting - self.max_attributes_to_retrieve = - self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); - self.max_attributes_to_highlight = - self.max_attributes_to_highlight.max(max_attributes_to_highlight); - self.highlight_pre_tag |= highlight_pre_tag; - self.highlight_post_tag |= highlight_post_tag; - self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop); - self.crop_marker |= crop_marker; - self.show_matches_position |= show_matches_position; - self.crop_length |= crop_length; - - // facets - self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms); - self.facets_total_number_of_facets = - self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets); - - // matching strategy - for (key, value) in matching_strategy.into_iter() { - let matching_strategy = self.matching_strategy.entry(key).or_insert(0); - *matching_strategy = matching_strategy.saturating_add(value); - } - - // scoring - self.show_ranking_score |= show_ranking_score; - self.show_ranking_score_details |= show_ranking_score_details; - self.ranking_score_threshold |= ranking_score_threshold; - - // locales - self.locales.append(&mut locales); - - self - } - - fn into_event(self: Box) -> serde_json::Value { - let Self { - total_received, - total_succeeded, - time_spent, - sort_with_geo_point, - sort_sum_of_criteria_terms, - sort_total_number_of_criteria, - distinct, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - attributes_to_search_on_total_number_of_uses, - max_terms_number, - max_vector_size, - retrieve_vectors, - matching_strategy, - max_limit, - max_offset, - finite_pagination, - max_attributes_to_retrieve, - max_attributes_to_highlight, - highlight_pre_tag, - highlight_post_tag, - max_attributes_to_crop, - crop_marker, - show_matches_position, - crop_length, - facets_sum_of_terms, - facets_total_number_of_facets, - show_ranking_score, - show_ranking_score_details, - semantic_ratio, - hybrid, - total_degraded, - total_used_negative_operator, - ranking_score_threshold, - locales, - marker: _, - } = *self; - - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); - - json!({ - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - "total_degraded": total_degraded, - "total_used_negative_operator": total_used_negative_operator, - }, - "sort": { - "with_geoPoint": sort_with_geo_point, - "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), - }, - "distinct": distinct, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "attributes_to_search_on": { - "total_number_of_uses": attributes_to_search_on_total_number_of_uses, - }, - "q": { - "max_terms_number": max_terms_number, - }, - "vector": { - "max_vector_size": max_vector_size, - "retrieve_vectors": retrieve_vectors, - }, - "hybrid": { - "enabled": hybrid, - "semantic_ratio": semantic_ratio, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - "most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - "max_attributes_to_highlight": max_attributes_to_highlight, - "highlight_pre_tag": highlight_pre_tag, - "highlight_post_tag": highlight_post_tag, - "max_attributes_to_crop": max_attributes_to_crop, - "crop_marker": crop_marker, - "show_matches_position": show_matches_position, - "crop_length": crop_length, - }, - "facets": { - "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), - }, - "matching_strategy": { - "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "locales": locales, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - }, - }) - } -} - -#[derive(Default)] -pub struct MultiSearchAggregator { - // requests - total_received: usize, - total_succeeded: usize, - - // sum of the number of distinct indexes in each single request, use with total_received to compute an avg - total_distinct_index_count: usize, - // number of queries with a single index, use with total_received to compute a proportion - total_single_index: usize, - - // sum of the number of search queries in the requests, use with total_received to compute an average - total_search_count: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - - // federation - use_federation: bool, -} - -impl MultiSearchAggregator { - pub fn from_federated_search(federated_search: &FederatedSearch) -> Self { - let use_federation = federated_search.federation.is_some(); - - let distinct_indexes: HashSet<_> = federated_search - .queries - .iter() - .map(|query| { - let query = &query; - // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex - let SearchQueryWithIndex { - index_uid, - federation_options: _, - q: _, - vector: _, - offset: _, - limit: _, - page: _, - hits_per_page: _, - attributes_to_retrieve: _, - retrieve_vectors: _, - attributes_to_crop: _, - crop_length: _, - attributes_to_highlight: _, - show_ranking_score: _, - show_ranking_score_details: _, - show_matches_position: _, - filter: _, - sort: _, - distinct: _, - facets: _, - highlight_pre_tag: _, - highlight_post_tag: _, - crop_marker: _, - matching_strategy: _, - attributes_to_search_on: _, - hybrid: _, - ranking_score_threshold: _, - locales: _, - } = query; - - index_uid.as_str() - }) - .collect(); - - let show_ranking_score = - federated_search.queries.iter().any(|query| query.show_ranking_score); - let show_ranking_score_details = - federated_search.queries.iter().any(|query| query.show_ranking_score_details); - - Self { - total_received: 1, - total_succeeded: 0, - total_distinct_index_count: distinct_indexes.len(), - total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, - total_search_count: federated_search.queries.len(), - show_ranking_score, - show_ranking_score_details, - use_federation, - } - } - - pub fn succeed(&mut self) { - self.total_succeeded = self.total_succeeded.saturating_add(1); - } -} - -impl Aggregate for MultiSearchAggregator { - fn event_name(&self) -> &'static str { - "Documents Searched by Multi-Search POST" - } - - /// Aggregate one [MultiSearchAggregator] into another. - fn aggregate(self: Box, new: Box) -> Box { - // write the aggregate in a way that will cause a compilation error if a field is added. - - // get ownership of self, replacing it by a default value. - let this = *self; - - let total_received = this.total_received.saturating_add(new.total_received); - let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded); - let total_distinct_index_count = - this.total_distinct_index_count.saturating_add(new.total_distinct_index_count); - let total_single_index = this.total_single_index.saturating_add(new.total_single_index); - let total_search_count = this.total_search_count.saturating_add(new.total_search_count); - let show_ranking_score = this.show_ranking_score || new.show_ranking_score; - let show_ranking_score_details = - this.show_ranking_score_details || new.show_ranking_score_details; - let use_federation = this.use_federation || new.use_federation; - - Box::new(Self { - total_received, - total_succeeded, - total_distinct_index_count, - total_single_index, - total_search_count, - show_ranking_score, - show_ranking_score_details, - use_federation, - }) - } - - fn into_event(self: Box) -> serde_json::Value { - let Self { - total_received, - total_succeeded, - total_distinct_index_count, - total_single_index, - total_search_count, - show_ranking_score, - show_ranking_score_details, - use_federation, - } = *self; - - json!({ - "requests": { - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "indexes": { - "total_single_index": total_single_index, - "total_distinct_index_count": total_distinct_index_count, - "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early - }, - "searches": { - "total_search_count": total_search_count, - "avg_search_count": (total_search_count as f64) / (total_received as f64), - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - }, - "federation": { - "use_federation": use_federation, - } - }) - } -} - -aggregate_methods!( - SimilarPOST => "Similar POST", - SimilarGET => "Similar GET", -); - -#[derive(Default)] -pub struct SimilarAggregator { - // requests - total_received: usize, - total_succeeded: usize, - time_spent: BinaryHeap, - - // filter - filter_with_geo_radius: bool, - filter_with_geo_bounding_box: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - filter_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - filter_total_number_of_criteria: usize, - used_syntax: HashMap, - - // Whether a non-default embedder was specified - retrieve_vectors: bool, - - // pagination - max_limit: usize, - max_offset: usize, - - // formatting - max_attributes_to_retrieve: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - ranking_score_threshold: bool, - - marker: std::marker::PhantomData, -} - -impl SimilarAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SimilarQuery) -> Self { - let SimilarQuery { - id: _, - embedder: _, - offset, - limit, - attributes_to_retrieve: _, - retrieve_vectors, - show_ranking_score, - show_ranking_score_details, - filter, - ranking_score_threshold, - } = query; - - let mut ret = Self::default(); - - ret.total_received = 1; - - if let Some(ref filter) = filter { - static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); - ret.filter_total_number_of_criteria = 1; - - let syntax = match filter { - Value::String(_) => "string".to_string(), - Value::Array(values) => { - if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { - "mixed".to_string() - } else { - "array".to_string() - } - } - _ => "none".to_string(), - }; - // convert the string to a HashMap - ret.used_syntax.insert(syntax, 1); - - let stringified_filters = filter.to_string(); - ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); - ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); - ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); - } - - ret.max_limit = *limit; - ret.max_offset = *offset; - - ret.show_ranking_score = *show_ranking_score; - ret.show_ranking_score_details = *show_ranking_score_details; - ret.ranking_score_threshold = ranking_score_threshold.is_some(); - - ret.retrieve_vectors = *retrieve_vectors; - - ret - } - - pub fn succeed(&mut self, result: &SimilarResult) { - let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result; - - self.total_succeeded = self.total_succeeded.saturating_add(1); - - self.time_spent.push(*processing_time_ms as usize); - } -} - -impl Aggregate for SimilarAggregator { - fn event_name(&self) -> &'static str { - Method::event_name() - } - - /// Aggregate one [SimilarAggregator] into another. - fn aggregate(mut self: Box, new: Box) -> Box { - let Self { - total_received, - total_succeeded, - mut time_spent, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - max_limit, - max_offset, - max_attributes_to_retrieve, - show_ranking_score, - show_ranking_score_details, - ranking_score_threshold, - retrieve_vectors, - marker: _, - } = *new; - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.time_spent.append(&mut time_spent); - - // filter - self.filter_with_geo_radius |= filter_with_geo_radius; - self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; - self.filter_sum_of_criteria_terms = - self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); - self.filter_total_number_of_criteria = - self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); - for (key, value) in used_syntax.into_iter() { - let used_syntax = self.used_syntax.entry(key).or_insert(0); - *used_syntax = used_syntax.saturating_add(value); - } - - self.retrieve_vectors |= retrieve_vectors; - - // pagination - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - - // formatting - self.max_attributes_to_retrieve = - self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); - - // scoring - self.show_ranking_score |= show_ranking_score; - self.show_ranking_score_details |= show_ranking_score_details; - self.ranking_score_threshold |= ranking_score_threshold; - - self - } - - fn into_event(self: Box) -> serde_json::Value { - let Self { - total_received, - total_succeeded, - time_spent, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - max_limit, - max_offset, - max_attributes_to_retrieve, - show_ranking_score, - show_ranking_score_details, - ranking_score_threshold, - retrieve_vectors, - marker: _, - } = *self; - - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); - - json!({ - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "vector": { - "retrieve_vectors": retrieve_vectors, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - } - }) - } -} diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index c8183186d..7d073ec5f 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -28,9 +28,11 @@ use crate::Opt; pub mod documents; pub mod facet_search; pub mod search; +mod search_analytics; pub mod settings; mod settings_analytics; pub mod similar; +mod similar_analytics; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index ac6e23c8f..2f5cb4a36 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -13,13 +13,13 @@ use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; use tracing::debug; -use crate::analytics::segment_analytics::{SearchGET, SearchPOST}; -use crate::analytics::{Analytics, SearchAggregator}; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; +use crate::routes::indexes::search_analytics::{SearchAggregator, SearchGET, SearchPOST}; use crate::search::{ add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, diff --git a/meilisearch/src/routes/indexes/search_analytics.rs b/meilisearch/src/routes/indexes/search_analytics.rs new file mode 100644 index 000000000..8bbb1781f --- /dev/null +++ b/meilisearch/src/routes/indexes/search_analytics.rs @@ -0,0 +1,485 @@ +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; +use std::collections::{BTreeSet, BinaryHeap, HashMap}; + +use meilisearch_types::locales::Locale; + +use crate::{ + aggregate_methods, + analytics::{Aggregate, AggregateMethod}, + search::{ + SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEMANTIC_RATIO, + }, +}; + +aggregate_methods!( + SearchGET => "Documents Searched GET", + SearchPOST => "Documents Searched POST", +); + +#[derive(Default)] +pub struct SearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + total_degraded: usize, + total_used_negative_operator: usize, + time_spent: BinaryHeap, + + // sort + sort_with_geo_point: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + sort_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + sort_total_number_of_criteria: usize, + + // distinct + distinct: bool, + + // filter + filter_with_geo_radius: bool, + filter_with_geo_bounding_box: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + filter_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + filter_total_number_of_criteria: usize, + used_syntax: HashMap, + + // attributes_to_search_on + // every time a search is done using attributes_to_search_on + attributes_to_search_on_total_number_of_uses: usize, + + // q + // The maximum number of terms in a q request + max_terms_number: usize, + + // vector + // The maximum number of floats in a vector request + max_vector_size: usize, + // Whether the semantic ratio passed to a hybrid search equals the default ratio. + semantic_ratio: bool, + hybrid: bool, + retrieve_vectors: bool, + + // every time a search is done, we increment the counter linked to the used settings + matching_strategy: HashMap, + + // List of the unique Locales passed as parameter + locales: BTreeSet, + + // pagination + max_limit: usize, + max_offset: usize, + finite_pagination: usize, + + // formatting + max_attributes_to_retrieve: usize, + max_attributes_to_highlight: usize, + highlight_pre_tag: bool, + highlight_post_tag: bool, + max_attributes_to_crop: usize, + crop_marker: bool, + show_matches_position: bool, + crop_length: bool, + + // facets + facets_sum_of_terms: usize, + facets_total_number_of_facets: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + ranking_score_threshold: bool, + + marker: std::marker::PhantomData, +} + +impl SearchAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &SearchQuery) -> Self { + let SearchQuery { + q, + vector, + offset, + limit, + page, + hits_per_page, + attributes_to_retrieve: _, + retrieve_vectors, + attributes_to_crop: _, + crop_length, + attributes_to_highlight: _, + show_matches_position, + show_ranking_score, + show_ranking_score_details, + filter, + sort, + distinct, + facets: _, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + hybrid, + ranking_score_threshold, + locales, + } = query; + + let mut ret = Self::default(); + + ret.total_received = 1; + + if let Some(ref sort) = sort { + ret.sort_total_number_of_criteria = 1; + ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint(")); + ret.sort_sum_of_criteria_terms = sort.len(); + } + + ret.distinct = distinct.is_some(); + + if let Some(ref filter) = filter { + static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); + ret.filter_total_number_of_criteria = 1; + + let syntax = match filter { + Value::String(_) => "string".to_string(), + Value::Array(values) => { + if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { + "mixed".to_string() + } else { + "array".to_string() + } + } + _ => "none".to_string(), + }; + // convert the string to a HashMap + ret.used_syntax.insert(syntax, 1); + + let stringified_filters = filter.to_string(); + ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); + ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); + } + + // attributes_to_search_on + if attributes_to_search_on.is_some() { + ret.attributes_to_search_on_total_number_of_uses = 1; + } + + if let Some(ref q) = q { + ret.max_terms_number = q.split_whitespace().count(); + } + + if let Some(ref vector) = vector { + ret.max_vector_size = vector.len(); + } + ret.retrieve_vectors |= retrieve_vectors; + + if query.is_finite_pagination() { + let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); + ret.max_limit = limit; + ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit; + ret.finite_pagination = 1; + } else { + ret.max_limit = *limit; + ret.max_offset = *offset; + ret.finite_pagination = 0; + } + + ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); + + if let Some(locales) = locales { + ret.locales = locales.iter().copied().collect(); + } + + ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); + ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); + ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); + ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH(); + ret.show_matches_position = *show_matches_position; + + ret.show_ranking_score = *show_ranking_score; + ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); + + if let Some(hybrid) = hybrid { + ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); + ret.hybrid = true; + } + + ret + } + + pub fn succeed(&mut self, result: &SearchResult) { + let SearchResult { + hits: _, + query: _, + processing_time_ms, + hits_info: _, + semantic_hit_count: _, + facet_distribution: _, + facet_stats: _, + degraded, + used_negative_operator, + } = result; + + self.total_succeeded = self.total_succeeded.saturating_add(1); + if *degraded { + self.total_degraded = self.total_degraded.saturating_add(1); + } + if *used_negative_operator { + self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); + } + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for SearchAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + sort_with_geo_point, + sort_sum_of_criteria_terms, + sort_total_number_of_criteria, + distinct, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + attributes_to_search_on_total_number_of_uses, + max_terms_number, + max_vector_size, + retrieve_vectors, + matching_strategy, + max_limit, + max_offset, + finite_pagination, + max_attributes_to_retrieve, + max_attributes_to_highlight, + highlight_pre_tag, + highlight_post_tag, + max_attributes_to_crop, + crop_marker, + show_matches_position, + crop_length, + facets_sum_of_terms, + facets_total_number_of_facets, + show_ranking_score, + show_ranking_score_details, + semantic_ratio, + hybrid, + total_degraded, + total_used_negative_operator, + ranking_score_threshold, + mut locales, + marker: _, + } = *new; + + // request + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.total_degraded = self.total_degraded.saturating_add(total_degraded); + self.total_used_negative_operator = + self.total_used_negative_operator.saturating_add(total_used_negative_operator); + self.time_spent.append(&mut time_spent); + + // sort + self.sort_with_geo_point |= sort_with_geo_point; + self.sort_sum_of_criteria_terms = + self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms); + self.sort_total_number_of_criteria = + self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); + + // distinct + self.distinct |= distinct; + + // filter + self.filter_with_geo_radius |= filter_with_geo_radius; + self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_sum_of_criteria_terms = + self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); + self.filter_total_number_of_criteria = + self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); + for (key, value) in used_syntax.into_iter() { + let used_syntax = self.used_syntax.entry(key).or_insert(0); + *used_syntax = used_syntax.saturating_add(value); + } + + // attributes_to_search_on + self.attributes_to_search_on_total_number_of_uses = self + .attributes_to_search_on_total_number_of_uses + .saturating_add(attributes_to_search_on_total_number_of_uses); + + // q + self.max_terms_number = self.max_terms_number.max(max_terms_number); + + // vector + self.max_vector_size = self.max_vector_size.max(max_vector_size); + self.retrieve_vectors |= retrieve_vectors; + self.semantic_ratio |= semantic_ratio; + self.hybrid |= hybrid; + + // pagination + self.max_limit = self.max_limit.max(max_limit); + self.max_offset = self.max_offset.max(max_offset); + self.finite_pagination += finite_pagination; + + // formatting + self.max_attributes_to_retrieve = + self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); + self.max_attributes_to_highlight = + self.max_attributes_to_highlight.max(max_attributes_to_highlight); + self.highlight_pre_tag |= highlight_pre_tag; + self.highlight_post_tag |= highlight_post_tag; + self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop); + self.crop_marker |= crop_marker; + self.show_matches_position |= show_matches_position; + self.crop_length |= crop_length; + + // facets + self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms); + self.facets_total_number_of_facets = + self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets); + + // matching strategy + for (key, value) in matching_strategy.into_iter() { + let matching_strategy = self.matching_strategy.entry(key).or_insert(0); + *matching_strategy = matching_strategy.saturating_add(value); + } + + // scoring + self.show_ranking_score |= show_ranking_score; + self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; + + // locales + self.locales.append(&mut locales); + + self + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + sort_with_geo_point, + sort_sum_of_criteria_terms, + sort_total_number_of_criteria, + distinct, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + attributes_to_search_on_total_number_of_uses, + max_terms_number, + max_vector_size, + retrieve_vectors, + matching_strategy, + max_limit, + max_offset, + finite_pagination, + max_attributes_to_retrieve, + max_attributes_to_highlight, + highlight_pre_tag, + highlight_post_tag, + max_attributes_to_crop, + crop_marker, + show_matches_position, + crop_length, + facets_sum_of_terms, + facets_total_number_of_facets, + show_ranking_score, + show_ranking_score_details, + semantic_ratio, + hybrid, + total_degraded, + total_used_negative_operator, + ranking_score_threshold, + locales, + marker: _, + } = *self; + + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentage of value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th); + + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + "total_degraded": total_degraded, + "total_used_negative_operator": total_used_negative_operator, + }, + "sort": { + "with_geoPoint": sort_with_geo_point, + "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), + }, + "distinct": distinct, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "attributes_to_search_on": { + "total_number_of_uses": attributes_to_search_on_total_number_of_uses, + }, + "q": { + "max_terms_number": max_terms_number, + }, + "vector": { + "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, + }, + "hybrid": { + "enabled": hybrid, + "semantic_ratio": semantic_ratio, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + "most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, + }, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + "max_attributes_to_highlight": max_attributes_to_highlight, + "highlight_pre_tag": highlight_pre_tag, + "highlight_post_tag": highlight_post_tag, + "max_attributes_to_crop": max_attributes_to_crop, + "crop_marker": crop_marker, + "show_matches_position": show_matches_position, + "crop_length": crop_length, + }, + "facets": { + "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), + }, + "matching_strategy": { + "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "locales": locales, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + }, + }) + } +} diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index 33df6bdad..79f42f0aa 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -13,10 +13,10 @@ use serde_json::Value; use tracing::debug; use super::ActionPolicy; -use crate::analytics::segment_analytics::{SimilarGET, SimilarPOST}; -use crate::analytics::{Analytics, SimilarAggregator}; +use crate::analytics::Analytics; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; +use crate::routes::indexes::similar_analytics::{SimilarAggregator, SimilarGET, SimilarPOST}; use crate::search::{ add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, diff --git a/meilisearch/src/routes/indexes/similar_analytics.rs b/meilisearch/src/routes/indexes/similar_analytics.rs new file mode 100644 index 000000000..69685a56c --- /dev/null +++ b/meilisearch/src/routes/indexes/similar_analytics.rs @@ -0,0 +1,235 @@ +use std::collections::{BinaryHeap, HashMap}; + +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; + +use crate::{ + aggregate_methods, + analytics::{Aggregate, AggregateMethod}, + search::{SimilarQuery, SimilarResult}, +}; + +aggregate_methods!( + SimilarPOST => "Similar POST", + SimilarGET => "Similar GET", +); + +#[derive(Default)] +pub struct SimilarAggregator { + // requests + total_received: usize, + total_succeeded: usize, + time_spent: BinaryHeap, + + // filter + filter_with_geo_radius: bool, + filter_with_geo_bounding_box: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + filter_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + filter_total_number_of_criteria: usize, + used_syntax: HashMap, + + // Whether a non-default embedder was specified + retrieve_vectors: bool, + + // pagination + max_limit: usize, + max_offset: usize, + + // formatting + max_attributes_to_retrieve: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + ranking_score_threshold: bool, + + marker: std::marker::PhantomData, +} + +impl SimilarAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &SimilarQuery) -> Self { + let SimilarQuery { + id: _, + embedder: _, + offset, + limit, + attributes_to_retrieve: _, + retrieve_vectors, + show_ranking_score, + show_ranking_score_details, + filter, + ranking_score_threshold, + } = query; + + let mut ret = Self::default(); + + ret.total_received = 1; + + if let Some(ref filter) = filter { + static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); + ret.filter_total_number_of_criteria = 1; + + let syntax = match filter { + Value::String(_) => "string".to_string(), + Value::Array(values) => { + if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { + "mixed".to_string() + } else { + "array".to_string() + } + } + _ => "none".to_string(), + }; + // convert the string to a HashMap + ret.used_syntax.insert(syntax, 1); + + let stringified_filters = filter.to_string(); + ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); + ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); + } + + ret.max_limit = *limit; + ret.max_offset = *offset; + + ret.show_ranking_score = *show_ranking_score; + ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); + + ret.retrieve_vectors = *retrieve_vectors; + + ret + } + + pub fn succeed(&mut self, result: &SimilarResult) { + let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result; + + self.total_succeeded = self.total_succeeded.saturating_add(1); + + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for SimilarAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + /// Aggregate one [SimilarAggregator] into another. + fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + max_limit, + max_offset, + max_attributes_to_retrieve, + show_ranking_score, + show_ranking_score_details, + ranking_score_threshold, + retrieve_vectors, + marker: _, + } = *new; + + // request + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.time_spent.append(&mut time_spent); + + // filter + self.filter_with_geo_radius |= filter_with_geo_radius; + self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_sum_of_criteria_terms = + self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); + self.filter_total_number_of_criteria = + self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); + for (key, value) in used_syntax.into_iter() { + let used_syntax = self.used_syntax.entry(key).or_insert(0); + *used_syntax = used_syntax.saturating_add(value); + } + + self.retrieve_vectors |= retrieve_vectors; + + // pagination + self.max_limit = self.max_limit.max(max_limit); + self.max_offset = self.max_offset.max(max_offset); + + // formatting + self.max_attributes_to_retrieve = + self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); + + // scoring + self.show_ranking_score |= show_ranking_score; + self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; + + self + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + max_limit, + max_offset, + max_attributes_to_retrieve, + show_ranking_score, + show_ranking_score_details, + ranking_score_threshold, + retrieve_vectors, + marker: _, + } = *self; + + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentage of value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th); + + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "vector": { + "retrieve_vectors": retrieve_vectors, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + }, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + } + }) + } +} diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index c25aeee70..b7260ea08 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -25,6 +25,7 @@ pub mod indexes; mod logs; mod metrics; mod multi_search; +mod multi_search_analytics; mod snapshot; mod swap_indexes; pub mod tasks; diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index 13a39cb44..b7bd31716 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -9,7 +9,7 @@ use meilisearch_types::keys::actions; use serde::Serialize; use tracing::debug; -use crate::analytics::{Analytics, MultiSearchAggregator}; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -21,6 +21,8 @@ use crate::search::{ }; use crate::search_queue::SearchQueue; +use super::multi_search_analytics::MultiSearchAggregator; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(multi_search_with_post)))); } diff --git a/meilisearch/src/routes/multi_search_analytics.rs b/meilisearch/src/routes/multi_search_analytics.rs new file mode 100644 index 000000000..be1218399 --- /dev/null +++ b/meilisearch/src/routes/multi_search_analytics.rs @@ -0,0 +1,170 @@ +use std::collections::HashSet; + +use serde_json::json; + +use crate::{ + analytics::Aggregate, + search::{FederatedSearch, SearchQueryWithIndex}, +}; + +#[derive(Default)] +pub struct MultiSearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + + // sum of the number of distinct indexes in each single request, use with total_received to compute an avg + total_distinct_index_count: usize, + // number of queries with a single index, use with total_received to compute a proportion + total_single_index: usize, + + // sum of the number of search queries in the requests, use with total_received to compute an average + total_search_count: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + + // federation + use_federation: bool, +} + +impl MultiSearchAggregator { + pub fn from_federated_search(federated_search: &FederatedSearch) -> Self { + let use_federation = federated_search.federation.is_some(); + + let distinct_indexes: HashSet<_> = federated_search + .queries + .iter() + .map(|query| { + let query = &query; + // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex + let SearchQueryWithIndex { + index_uid, + federation_options: _, + q: _, + vector: _, + offset: _, + limit: _, + page: _, + hits_per_page: _, + attributes_to_retrieve: _, + retrieve_vectors: _, + attributes_to_crop: _, + crop_length: _, + attributes_to_highlight: _, + show_ranking_score: _, + show_ranking_score_details: _, + show_matches_position: _, + filter: _, + sort: _, + distinct: _, + facets: _, + highlight_pre_tag: _, + highlight_post_tag: _, + crop_marker: _, + matching_strategy: _, + attributes_to_search_on: _, + hybrid: _, + ranking_score_threshold: _, + locales: _, + } = query; + + index_uid.as_str() + }) + .collect(); + + let show_ranking_score = + federated_search.queries.iter().any(|query| query.show_ranking_score); + let show_ranking_score_details = + federated_search.queries.iter().any(|query| query.show_ranking_score_details); + + Self { + total_received: 1, + total_succeeded: 0, + total_distinct_index_count: distinct_indexes.len(), + total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, + total_search_count: federated_search.queries.len(), + show_ranking_score, + show_ranking_score_details, + use_federation, + } + } + + pub fn succeed(&mut self) { + self.total_succeeded = self.total_succeeded.saturating_add(1); + } +} + +impl Aggregate for MultiSearchAggregator { + fn event_name(&self) -> &'static str { + "Documents Searched by Multi-Search POST" + } + + /// Aggregate one [MultiSearchAggregator] into another. + fn aggregate(self: Box, new: Box) -> Box { + // write the aggregate in a way that will cause a compilation error if a field is added. + + // get ownership of self, replacing it by a default value. + let this = *self; + + let total_received = this.total_received.saturating_add(new.total_received); + let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded); + let total_distinct_index_count = + this.total_distinct_index_count.saturating_add(new.total_distinct_index_count); + let total_single_index = this.total_single_index.saturating_add(new.total_single_index); + let total_search_count = this.total_search_count.saturating_add(new.total_search_count); + let show_ranking_score = this.show_ranking_score || new.show_ranking_score; + let show_ranking_score_details = + this.show_ranking_score_details || new.show_ranking_score_details; + let use_federation = this.use_federation || new.use_federation; + + Box::new(Self { + total_received, + total_succeeded, + total_distinct_index_count, + total_single_index, + total_search_count, + show_ranking_score, + show_ranking_score_details, + use_federation, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + total_distinct_index_count, + total_single_index, + total_search_count, + show_ranking_score, + show_ranking_score_details, + use_federation, + } = *self; + + json!({ + "requests": { + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "indexes": { + "total_single_index": total_single_index, + "total_distinct_index_count": total_distinct_index_count, + "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early + }, + "searches": { + "total_search_count": total_search_count, + "avg_search_count": (total_search_count as f64) / (total_received as f64), + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + }, + "federation": { + "use_federation": use_federation, + } + }) + } +}